From 3f1086894ebf8c71f503c38b74d646dbe24c368d Mon Sep 17 00:00:00 2001
From: Avijit Ghosh
Date: Sun, 15 Feb 2026 19:42:36 -0500
Subject: [PATCH] Delete data directory

Since we have now moved to GH
---
 .../c8ab4e94-d8e8-417f-be18-fececf3c815c.json | 515 ---
 .../402c8833-1827-46fc-a497-46b40a6794ff.json | 515 ---
 .../acd2082a-ce0c-418f-9383-f3c9f11735a2.json | 515 ---
 .../c65ed336-b283-46c2-8284-c4695cad588d.json | 515 ---
 .../5ebb009d-b548-4f2b-b075-feb76ca295d2.json | 515 ---
 .../c7df2916-bde4-4987-9139-fcfd18a14ac1.json | 515 ---
 .../56ec8ab0-d76d-4c03-953b-a2a4a43af5f4.json | 515 ---
 .../ad3211a9-4390-4247-b64d-600191a88a75.json | 512 ---
 .../1a34326a-f75e-434c-a027-9f8cf7fe8fb9.json | 515 ---
 .../129c8b21-f97e-4284-9574-33d5932332f7.json | 515 ---
 .../3644fd67-0f46-4de3-b542-edf219d0e0cd.json | 515 ---
 .../c0692e14-6484-4d02-8dac-55ce4373fb15.json | 515 ---
 .../ab4940d1-118c-479a-bd37-1ea2da6f02a3.json | 515 ---
 .../85552093-435f-4d85-897d-4e74c3655533.json | 515 ---
 .../4ddc0062-6577-4ab9-85f1-791fd2822776.json | 515 ---
 .../50fc4840-933b-43ec-847e-1834b30f9f14.json | 515 ---
 .../6cdc5384-2be5-47e0-a9b2-9cd6719c1760.json | 515 ---
 .../a668c931-34e4-4702-a84c-97d8c6f59ef4.json | 515 ---
 .../3a7e2aa6-4e57-446f-a127-4a7e022fe3e1.json | 515 ---
 .../938a35f1-195d-49c8-9a16-90fab96692bd.json | 515 ---
 .../ce756801-f75e-4250-9721-1d627a37f055.json | 515 ---
 .../b83b41d4-6c95-4c7d-a290-65d89bf776c2.json | 515 ---
 .../31c3fe1b-be4b-42ef-8ec0-9da323b2ebb6.json | 515 ---
 .../a8e0fc0e-b3a4-4a0b-938f-aa11f1c64358.json | 515 ---
 .../8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json | 352 --
 .../7d2d1dba-1b31-47b2-8308-f2668cf36c99.json | 352 --
 .../3a056f7b-1bdf-4543-9e67-1101ace67179.json | 352 --
 .../275cf2e5-5ccd-40be-be55-938c82ef6688.json | 352 --
 .../43e7be99-4872-4eb1-b30b-75c44b298ab4.json | 345 --
 .../cfc99298-4570-48cf-9187-aa0d167cc0ba.json | 345 --
 .../a2162367-d16d-4274-aa89-43435cea5c0b.json | 345 --
 .../51ef4580-da13-415a-a37f-45e2036ed4c2.json | 345 --
 .../3fa605db-fcff-4f05-9398-6af77c9dcada.json | 345 --
 .../9d58ac39-fef7-47c8-920a-8be2069f5662.json | 345 --
 .../dd9b10af-ad39-45ef-8f91-097340d376c7.json | 345 --
 .../30a6de14-c57c-483e-92e9-26fc4c7f4772.json | 345 --
 .../bed1a799-77a6-40a1-9f37-d54fe9d4d055.json | 345 --
 .../6c226cad-23f1-4c09-8038-eb7b776cdee4.json | 345 --
 .../98887061-09d6-44ba-9cff-0267045a26ef.json | 345 --
 .../6693f0e2-3514-413d-be61-d10f7372b3dc.json | 345 --
 .../ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json | 345 --
 .../0d9a856d-01bf-4a82-9872-33d561cf4a57.json | 345 --
 .../3ff2ab7d-2c0f-4313-8223-8f514fde595a.json | 345 --
 .../2a46e8da-1996-428c-b567-cd0287b29d9f.json | 345 --
 .../30a92593-398e-4c2f-8be7-455be166aeaf.json | 345 --
 .../e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json | 345 --
 .../dfc2717d-ead8-4287-885e-5e0fc09c35e3.json | 345 --
 .../e97292eb-7031-4a3a-a415-44c137898e3f.json | 345 --
 .../4263a6be-9640-40a1-8881-768624949d47.json | 345 --
 .../a808cecf-8925-428f-99ea-b6c2f8bce96e.json | 345 --
 .../55e44a3b-1fac-4ad5-b25e-85702f33883d.json | 345 --
 .../5b5b339b-7631-4b77-ac51-df49d3e946eb.json | 345 --
 .../eaec6d66-6da7-4592-baca-2539240acc5d.json | 345 --
 .../2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json | 345 --
 .../eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json | 352 --
 .../75b5943a-67be-4b2f-85da-a52533edc76f.json | 345 --
 .../8bec35b7-271a-457d-b665-9f69baa248aa.json | 345 --
 .../c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json | 345 --
 .../c308b0a5-4c44-4369-9b23-8664959aa927.json | 345 --
 .../1a1edfb2-f0f1-4930-82c0-99293ec76645.json | 345 --
 .../9aa5af51-8c55-4896-b634-162a9d82b58e.json | 345 --
 .../21461a52-2f25-48c9-be19-f9233317d817.json | 345 --
 .../bdea0967-fcc7-493c-a18d-70727842deb9.json | 345 --
 .../f7404ea3-62c7-47fc-9106-44c208470381.json | 345 --
 .../2817820c-4b28-4235-a8fd-ad02d0f504bc.json | 345 --
 .../f3da71fc-fc88-4dda-b423-168d11eab317.json | 345 --
 .../2f7c0db9-b5de-4674-a130-5315520dea68.json | 345 --
 .../4dcb8022-fe54-42f7-b43f-9866de173731.json | 345 --
 .../c436f3d1-84ee-49df-9287-0305925f7cf4.json | 345 --
 .../90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json | 345 --
 .../07c823ba-9e17-47e4-858b-a1f2a514a276.json | 345 --
 .../eb1bb443-71ad-4b79-8308-2b66c5e8c631.json | 345 --
 .../e14d42a9-9639-4c35-8a0c-e395e754c46c.json | 345 --
 .../3754df44-ddce-4a66-9074-f65f5677ae27.json | 345 --
 .../a540b282-e9d6-403e-96df-a1d27ad14d3a.json | 345 --
 .../758851b3-9ac9-43d8-8b6a-3d9688752d80.json | 345 --
 .../1d9ac688-ca0d-405b-a262-e95673e79250.json | 345 --
 .../c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json | 345 --
 .../35a31e19-2ef5-4caa-a848-422af42adab8.json | 345 --
 .../7de0bda2-ce56-444a-b293-a310a5b2d7ab.json | 345 --
 .../dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json | 345 --
 .../9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json | 345 --
 .../07763926-3a19-43f9-a23f-095f6cb78799.json | 345 --
 .../56e024b3-c963-4172-9f52-7605276b3854.json | 345 --
 .../6f660e47-1d86-473d-9864-208111dcea31.json | 345 --
 .../91ef1f96-a708-4c53-ac9d-208ef3420668.json | 345 --
 .../c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json | 345 --
 .../505c6245-88d1-4557-9e34-63a4e8086210.json | 345 --
 .../9a473236-f187-4926-ae8a-e8b84fe2a060.json | 345 --
 .../1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json | 345 --
 .../aeabfb59-74db-445c-9693-7a088ac5073c.json | 345 --
 .../eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json | 345 --
 .../12fdea65-94eb-4c85-876c-65f0528bde12.json | 1613 ---------
 .../d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json | 1613 ---------
 .../1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json | 1613 ---------
 .../deddbc80-70ac-43e7-b052-753d127f8390.json | 1613 ---------
 .../e4780862-bf3c-4856-b1e7-02616afe931a.json | 1613 ---------
 .../cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json | 1613 ---------
 .../13a22d40-f274-4384-adcc-1539da821c6a.json | 1613 ---------
 .../a01f642e-730b-461d-8afe-9c077ab3f149.json | 1613 ---------
 .../813802a3-483e-443d-9e49-7cd581b5ea6d.json | 1613 ---------
 .../90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json | 1613 ---------
 .../d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json | 1613 ---------
 .../3dc29785-a884-4496-a6f4-a8bf19892e50.json | 1613 ---------
 .../ff8dc291-bbaf-4149-854e-e1780b0c86d5.json | 1613 ---------
 .../b8932181-b669-4b0e-8879-1dfbf9afea12.json | 1613 ---------
 .../c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json | 1613 ---------
 .../579fb908-3c36-4ff8-a262-fd5388806b83.json | 1613 ---------
 .../68ff9f10-0357-4ea8-b758-de6c7f51d669.json | 1613 ---------
 .../b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json | 1613 ---------
 .../8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json | 1613 ---------
 .../8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json | 1613 ---------
 .../6bbe052f-46f7-4541-80a3-dbb86433db7a.json | 1613 ---------
 .../9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json | 1613 ---------
 .../742a59e8-c813-42ef-938a-4897e25dcdad.json | 1613 ---------
 .../5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json | 1613 ---------
 .../509360bc-86f5-49dc-899c-2899d8b6bc6c.json | 1613 ---------
 .../8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json | 1613 ---------
 .../8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json | 1613 ---------
 .../7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json | 1613 ---------
 .../d65d8f48-8b8e-4ec6-af68-f61af5408adf.json | 1613 ---------
 .../dff69882-cb8b-4323-b587-60f295085459.json | 1613 ---------
 .../90220411-5e4d-4b74-a74c-ca2ad030d50e.json | 1613 ---------
 .../8c2465b2-deca-476c-bb41-836685ceab35.json | 1613 ---------
 .../4b0f6a03-1054-4047-82d1-53992f0378ee.json | 1613 ---------
 .../78bc128a-6e53-4086-9498-2b3428e1d884.json | 1613 ---------
 .../2be7887e-6c91-437c-bbfc-8b68de3330da.json | 1613 ---------
 .../f135ce21-655f-4ebf-9cc6-d83ada0f177b.json | 1613 ---------
 .../48912a61-af54-4208-b36d-2f3a283e5c5d.json | 1613 ---------
 .../cc85315f-4472-4b22-9f0a-e4609676ce13.json | 1613 ---------
 .../ab773619-db5e-449b-8d6b-da743cb038bb.json | 1613 ---------
 .../5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json | 1613 ---------
 .../32cc2aa3-be26-41bd-8124-a8b1073c84c4.json | 1613 ---------
 .../42a86a4a-7e76-4c7d-af48-e765a38df589.json | 1613 ---------
 .../f9746ed1-887f-4850-ac2d-700de18acbaf.json | 1613 ---------
 .../899521d0-e5eb-4e1b-af5a-78b3bd32e232.json | 1613 ---------
 .../1fb2c6db-2495-4609-a96b-57815c579953.json | 1613 ---------
 .../a5b6cc8b-676d-4c19-8093-0b893937e3d4.json | 1613 ---------
 .../0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json | 1613 ---------
 .../bc207557-fb49-4a87-8401-22c3ce853e7c.json | 1613 ---------
 .../895266ee-71a5-4ca5-b3f9-62df6383ff95.json | 1613 ---------
 .../8828e9e8-5716-41b4-a2d1-233bb056dc32.json | 1613 ---------
 .../f267ba72-b239-4126-99c5-675f79b1ae95.json | 1613 ---------
 .../f386e763-8078-454b-bd14-32b106663d53.json | 1613 ---------
 .../a4739cda-028b-48e0-b3b5-ca9b583d03f5.json | 1613 ---------
 .../837e20ff-fed1-4431-b643-63b904055c66.json | 1613 ---------
 .../e411f017-22c6-4d49-9bf9-5d99c1091791.json | 1613 ---------
 .../7bd2b266-5a65-4c63-bf18-5e4114564bfc.json | 1613 ---------
 .../49a1423e-d5f4-4665-b81e-d491f492a316.json | 1613 ---------
 .../8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json | 1613 ---------
 .../ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json | 1613 ---------
 .../a2b4ed40-b04f-481f-986b-25a2c26bbb79.json | 1613 ---------
 .../e88f9163-5334-43ed-9b56-154bf543f898.json | 1613 ---------
 .../6d436bd5-9d49-4895-8c07-7814b2eef12c.json | 1613 ---------
 .../681d0d6d-de06-4b8e-a7e2-964d98e2806e.json | 1613 ---------
 .../e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json | 1613 ---------
 .../cb80bd5f-204a-4dd8-96ec-40c7df93975f.json | 1613 ---------
 .../f84f84a8-7191-42ac-8951-5d7141a0f700.json | 1613 ---------
 .../9ba74767-b675-460a-bb68-e82adb6acd2f.json | 1613 ---------
 .../e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json | 267 --
 .../60724488-914d-4efe-98d6-f3ff26fe8fbc.json | 267 --
 .../2aaae404-b510-41e0-9a4a-b2d053731454.json | 267 --
 .../053badb4-b50a-434a-909c-c4d939c00b4e.json | 267 --
 .../7b4a4c6d-e302-4010-a099-5b01c874ffe8.json | 641 ----
 .../db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json | 641 ----
 .../f6808908-79d9-4de5-8434-94e4bdb854f2.json | 643 ----
 .../1a039ef6-5957-4246-82b2-bc607b6554e7.json | 641 ----
 .../fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json | 641 ----
 .../0e2790d3-40f1-4124-ba41-b65bd9de1852.json | 641 ----
 .../d55129d3-4eae-4009-a897-fa1624cea6a2.json | 641 ----
 .../6332f0b3-7fab-41ed-a8da-46b142051377.json | 641 ----
 .../0cb33741-ca10-40f5-90d3-28e300901ad3.json | 643 ----
 .../80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json | 643 ----
 .../de41775f-f60e-481e-a8ef-3df9a9b65a5a.json | 642 ----
 .../bc29d5c6-b5c8-473b-b69c-054026829089.json | 641 ----
 .../ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json | 644 ----
 .../4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json | 644 ----
 .../9ef56d5a-de00-4d89-930c-a4c74211dd78.json | 644 ----
 .../5598d3ed-5b37-4aec-b186-0b16c394633b.json | 641 ----
 .../a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json | 641 ----
 .../54bac699-aa82-4133-8c10-c6510c2a7f95.json | 644 ----
 .../79b23601-3148-4256-88ce-67e439a87c5b.json | 641 ----
 .../e92648e4-75c6-4944-9ec1-880823fefc87.json | 641 ----
 .../449feffd-d2e3-4a08-ad69-b8ad522532ae.json | 641 ----
 .../d297b253-0f4f-4caf-864b-9f457ab589da.json | 641 ----
 .../d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json | 641 ----
 .../cb409208-034d-42fd-acce-ab5cc4227383.json | 641 ----
 .../b2572ef8-446a-45b4-b557-45736418753b.json | 641 ----
 .../70d85516-b710-4b27-b664-03a6a822773b.json | 641 ----
 .../a8208df4-eb37-47d2-8845-f821e80e9858.json | 643 ----
 .../22cde248-40ab-43b0-a408-6d8b84692f22.json | 643 ----
 .../b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json | 641 ----
 .../ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json | 641 ----
 .../8721a15b-9102-4b1a-bde8-e5371f00f1b5.json | 641 ----
 .../23b3a30c-8aa3-4684-be54-adae003720fc.json | 643 ----
 .../7022c444-d6b8-4374-be0c-14835e5fd281.json | 643 ----
 .../bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json | 643 ----
 .../bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json | 641 ----
 .../527418d0-2591-43c9-b639-17328292b110.json | 643 ----
 .../8ddc465f-4f2d-4213-81c4-70b584d48047.json | 641 ----
 .../eca63d17-7fc2-4722-8bb3-0be99a257100.json | 644 ----
 .../e40a10b3-e682-4715-b2ee-4efcae050a58.json | 641 ----
 .../56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json | 643 ----
 .../f47ca10d-cd45-485e-b9cf-0c6592d63656.json | 641 ----
 .../7f0e318e-31bf-4044-bffb-357c1238d4fd.json | 641 ----
 .../818d6d72-0b5c-4fcf-b808-1d186223301e.json | 641 ----
 .../f09b853b-dbbc-4252-a0f0-a2c45c29f670.json | 641 ----
 .../f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json | 641 ----
 .../83c6a723-87a0-43d4-968e-86d186578e9e.json | 641 ----
 .../daaf221b-1759-4619-91fb-938e81975787.json | 641 ----
 .../6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json | 641 ----
 .../1043b815-b247-4444-bf8c-0b92b793c57f.json | 643 ----
 .../28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json | 643 ----
 .../73dedd31-7d40-4ee6-994d-00eb7d656597.json | 643 ----
 .../18da1dfa-5366-477b-a9cf-af29c5a99b68.json | 643 ----
 .../80057cc1-45ab-4976-878e-be963eaa83b1.json | 643 ----
 .../d896249f-bbd9-4657-a5db-5968544cb5fa.json | 643 ----
 .../9f73f3e5-b573-45d4-8c98-82f5c496f786.json | 641 ----
 .../a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json | 641 ----
 .../4ff688da-61a0-43ce-9c2d-e1c197887683.json | 641 ----
 .../181003ea-7587-4c93-8b89-c5c76958313d.json | 641 ----
 .../66688228-e59a-4caa-b3fb-c5df1efc9db4.json | 643 ----
 .../2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json | 641 ----
 .../077fe37f-b3a4-483a-93a5-034c6445fe98.json | 641 ----
 .../4fbb173c-b900-4e11-87bd-1ac6a489d014.json | 643 ----
 .../e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json | 641 ----
 .../0925f9b7-08f8-485f-84bc-a153a54aa417.json | 641 ----
 .../08082277-8305-4007-97cd-88202fc0115c.json | 641 ----
 .../fe554cbd-2480-40bd-b2f5-464cad700c14.json | 641 ----
 .../9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json | 643 ----
 .../d9654997-1d3e-41c3-9f16-05a36dde9b02.json | 641 ----
 .../73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json | 641 ----
 .../4d01d929-b5e2-42dc-89ee-20560f560db5.json | 641 ----
 .../76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json | 643 ----
 .../69ea0ef0-c136-4cff-9607-6ae12e0692c3.json | 643 ----
 .../bbe708f3-fb78-49e9-876d-cae57f1231cc.json | 643 ----
 .../ab7b7951-0792-4538-8a7a-6baee8602cbb.json | 643 ----
 .../fc94c95d-9678-4f23-b82f-190a08ece307.json | 641 ----
 .../3f92e2fc-9831-4c2c-b94e-af33d457fa82.json | 641 ----
 .../3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json | 643 ----
 .../6b2891bd-2444-4286-8ccf-c91181856d29.json | 641 ----
 .../bd924bd3-e13c-48e0-b339-8c15c5072038.json | 641 ----
 .../b8a6f32a-9904-43bb-9add-89404093a9db.json | 641 ----
 .../c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json | 641 ----
 .../9c1fc50a-437d-458b-926c-33cabdcc4aeb.json | 643 ----
 .../5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json | 644 ----
 .../10e1abfa-83de-4960-8d4c-c5099894cb80.json | 644 ----
 .../40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json | 643 ----
 .../2abf3bb8-a78f-4a59-807e-52da4e6426fd.json | 641 ----
 .../ae28615a-b7fa-4782-89e1-4b8e4804dc62.json | 641 ----
 .../52bb6ab9-e80b-4bf0-a375-7706f16d311d.json | 643 ----
 .../fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json | 649 ----
 .../1158720a-9a0e-492e-a677-9b0936f4cde5.json | 641 ----
 .../254ded81-4051-420d-b402-2e7b80a23848.json | 641 ----
 .../ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json | 3021 -----------------
 .../7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json | 3021 -----------------
 .../5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json | 3021 -----------------
 .../0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json | 3021 -----------------
 .../92e0b1b9-c167-4e07-b770-2b78527eb4eb.json | 3021 -----------------
 .../3da06ad4-0770-45f5-a6a2-9ef9500cef05.json | 3021 -----------------
 .../c1c79360-60bd-4f5d-a746-e0411b94f69b.json | 3021 -----------------
 .../bb904716-048c-4b41-9f64-4d17c485afe3.json | 3021 -----------------
 .../063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json | 3021 -----------------
 .../c8949c55-8987-4ed3-b74b-8b13b4381806.json | 3021 -----------------
 .../ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json | 3021 -----------------
 .../bc9cedd7-5cb2-44b2-abda-470322570e14.json | 3021 -----------------
 .../305a7f25-6e22-4146-9678-6a687a701567.json | 3021 -----------------
 .../c6059976-85a1-40ce-b02f-67e182aa2f7d.json | 3021 -----------------
 .../6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json | 3021 -----------------
 .../f397ca7a-41c4-4926-b075-2523639f0a50.json | 3021 -----------------
 .../acdf4701-e1c2-4867-bd85-d34ae8fb0991.json | 3021 -----------------
 .../3cd855af-9679-4fd0-bc3f-34db697c7855.json | 3021 -----------------
 .../78fb6814-e32f-4b15-b958-9e001637ba07.json | 3021 -----------------
 .../f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json | 3021 -----------------
 .../cefc3b25-0779-4fb3-93a5-3c7a285304af.json | 3021 -----------------
 .../7e00e082-0e79-45e0-b0ff-5458cc2aff85.json | 3021 -----------------
 .../ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json | 3021 -----------------
 .../c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json | 3021 -----------------
 .../7ea5b404-d98f-4282-81d8-6ca5f6629429.json | 3021 -----------------
 .../7056c7e7-f68a-4764-aa48-a8368ae2e317.json | 3021 -----------------
 .../5e67014d-6ca1-4e65-a85a-84d91e147d4d.json | 3021 -----------------
 .../3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json | 3021 -----------------
 .../46d5e547-507e-4c98-98a9-bad1bfad7f7b.json | 3021 -----------------
 .../ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json | 3021 -----------------
 .../2b31b441-caa9-465c-a2d2-051c951c7be3.json | 3021 -----------------
 .../b7ea6c93-af70-4c0f-ba50-03a539416a8b.json | 3021 -----------------
 .../fe4cec30-e483-49a8-80ea-00b2c6231740.json | 3021 -----------------
 .../53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json | 3021 -----------------
 .../af88b02d-cb29-4d2c-bb33-5fddcf316a95.json | 3021 -----------------
 .../a0abcd19-58a1-478a-9786-d044a4181241.json | 3021 -----------------
 .../95eda13a-cd34-4170-b2db-f2ead47250f9.json | 3021 -----------------
 .../7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json | 3021 -----------------
 .../9da7439c-e96b-444f-b4fa-7ef638080740.json | 3021 -----------------
 .../294b22a0-1676-4d8c-8ad2-5cdc40267255.json | 3021 -----------------
 .../1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json | 3021 -----------------
 .../78f2484e-bc73-4026-929b-db345e92cf5a.json | 3021 -----------------
 .../8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json | 3021 -----------------
 .../41af381a-3637-4578-a582-59d9b1327d95.json | 3021 -----------------
 .../96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json | 3021 -----------------
 .../bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json | 3021 -----------------
 .../e036de72-b425-4aa5-9448-dc52560e60db.json | 3021 -----------------
 .../65423181-18f1-4296-98c2-171356106404.json | 3021 -----------------
 .../41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json | 3021 -----------------
 .../f78d6e0a-a397-4a41-a37e-696bda5a1987.json | 3021 -----------------
 .../d2bf70ce-341f-49d7-bd03-87b523826953.json | 3021 -----------------
 .../b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json | 3021 -----------------
 .../08590b6e-7050-413d-844b-1f3f1c5aa444.json | 3021 -----------------
 .../2d18fd88-73b5-4d4c-a1cc-e66a20316605.json | 3021 -----------------
 .../567918be-be6f-4e41-b613-727828fe8a44.json | 3021 -----------------
 .../c2be131b-808c-4947-b24f-69ef6af499d7.json | 3021 -----------------
 .../24955250-a2e9-475f-a866-30a835579e03.json | 3021 -----------------
 .../de6f7e19-b54a-4bd3-b624-29f66afbee15.json | 3021 -----------------
 .../e4c3032d-04e0-414b-a7e9-e30756d82000.json | 3021 -----------------
 .../e9a41d4b-56c7-47f0-b439-72ad1e463000.json | 3021 -----------------
 .../a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json | 3021 -----------------
 .../fd6aea24-dc18-41ce-bc19-23f461a39032.json | 3021 -----------------
 .../625d33ce-a320-4bfd-a962-451b8c22d392.json | 3021 -----------------
 .../e51be257-610e-4d38-b58a-a3b29fc06a83.json | 3021 -----------------
 .../9e0b9f48-f913-4bbe-a135-59e596c9e479.json | 3021 -----------------
 .../189e6cc5-1c8f-4712-8dda-c108f18f836d.json | 3021 -----------------
 .../4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json | 3021 -----------------
 .../ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json | 3021 -----------------
 .../fa6a6772-671b-402e-9480-d61e0fb4a61e.json | 3021 -----------------
 .../b5279e94-ae7f-4671-9315-874e162a24fd.json | 3021 -----------------
 .../de00e8da-9c83-40df-b642-b94719ce1ac2.json | 3021 -----------------
 .../119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json | 3021 -----------------
 .../80aabdf4-60b7-493b-98d8-1854f1c41c10.json | 3021 -----------------
 .../29958cee-32c9-4d51-8f14-72db4273459f.json | 3021 -----------------
 .../72537b16-feda-4e5e-a477-f415650db847.json | 3021 -----------------
 .../7df68af5-667a-4125-9c12-e71fb5af0a74.json | 3021 -----------------
 .../1845eb8b-4c94-4d22-8771-012f7230dc62.json | 3021 -----------------
 .../b2c8cfd1-f09a-4616-8038-c7e1930bce74.json | 3021 -----------------
 .../12976629-cefe-4329-b974-bb17f88d385d.json | 3021 -----------------
 .../0d7928c3-c769-474e-8249-7a5c70c4c559.json | 132 -
 .../f63536ed-752b-4538-9b92-2514a617a4bf.json | 132 -
 .../8ff13de2-ea43-4392-992f-ba70b6023e96.json | 132 -
 .../02bac8a7-bd09-4e73-979a-7dbaa7a8ed75.json | 132 -
 .../74e4406d-b2b6-4c3f-b059-f52cccf1fff4.json | 132 -
 .../ec8a6d6c-b8ea-48a3-9af6-d357e0057ec1.json | 132 -
 .../05307b41-d832-4533-99bd-c8608bf8e64c.json | 132 -
 .../c09bd9b0-6f85-4120-94a9-b628c68bccb7.json | 132 -
 .../9f971385-1146-4436-91a6-0e52d4db1f07.json | 132 -
 .../80ed14ca-b4cd-4ceb-8fdb-24705e47bd0e.json | 132 -
 .../db88e3f5-58a9-4783-9093-a6df96483342.json | 132 -
 .../8cd90f8a-d8dc-469b-95b9-260fcef804d2.json | 132 -
 .../b2c82703-2b5c-407d-b84f-a8f8261ac894.json | 132 -
 .../55462e67-5eca-4e9d-9095-51fcf12de5fa.json | 132 -
 .../25a119f0-5eaa-4fa9-8cd4-e0f437ada456.json | 132 -
 .../efc036b6-d8de-4393-87a1-d4f86fb44d91.json | 132 -
 .../a5144406-eb85-43b2-a49d-be6b06d6b04a.json | 132 -
 .../900184ad-656d-416b-956f-5f6e3a991d1b.json | 132 -
 .../7a58954a-5d7d-4640-99fd-773249640237.json | 132 -
 .../4ea3146c-b912-424a-b0a9-7c37348348c8.json | 132 -
 .../b0276278-6d86-49c0-a246-cd9110ac1deb.json | 132 -
 .../04216f67-1385-43bf-b7de-5bae7a60f379.json | 132 -
 .../fbf7b76b-7ced-4217-8e14-1d02184e271c.json | 132 -
 .../74ac8aba-6dfb-464c-81b5-d02a9192b9cc.json | 132 -
 .../295938e1-ade2-4d36-beca-3cbe506b5b90.json | 132 -
 .../f331782f-ea09-41bd-8c6a-e964c88d7e09.json | 132 -
 .../e4e3d79a-1de9-43be-a029-0be4f60e472b.json | 132 -
 .../6914ac28-b543-4f36-81f1-f7491c018e3b.json | 132 -
 .../b7378f41-46ab-41af-94cc-e7fb10738658.json | 132 -
 .../acedae59-6192-4ac4-a354-d520ecd6ba36.json | 132 -
 .../ff105961-761d-4261-8a44-20acf2e7f440.json | 132 -
 .../fa0901f6-514e-44ae-84dc-0b793f26169e.json | 132 -
 .../d2dff5df-343b-40f3-85de-14eb72dab050.json | 132 -
 .../8fa3010f-b7a1-4fc1-9156-ba70453add86.json | 132 -
 .../58034f99-3b01-46d6-aea9-90c75d073bb0.json | 132 -
 .../e6c08c9c-6d01-45c7-8a24-219b756b8632.json | 132 -
 .../cd97ad01-1d20-4cbd-a9bb-2acf3d9fdcc7.json | 132 -
 .../95f44ef8-e5ba-4bdc-97a7-2c5a678b07be.json | 132 -
 .../082f25f0-994c-438a-8086-b1e439aca466.json | 132 -
 .../31423cbd-08cd-4079-b1c5-ba412acf1b51.json | 132 -
 .../2669bd86-da65-4d87-8464-bfa8c741ce0b.json | 132 -
 .../ab2c19ff-5671-446f-b09e-731e2ae515ca.json | 132 -
 .../36250dc3-cb51-43be-8ab0-6788eb5bda7c.json | 132 -
 .../cd616d6a-151f-4aaa-93b5-9c4a758f95b5.json | 132 -
 .../9cb09cae-9b1b-43b1-afbf-f44b0a44053c.json | 132 -
 .../038c32da-add5-4299-ac17-df6ef3fdea58.json | 132 -
 .../25eb4bdf-beb4-4ad2-a5e9-3a2f31c46cb5.json | 132 -
 .../77655d60-872f-468a-acc6-d584ef5bf46a.json | 132 -
 .../4de378c8-ccf6-4f0b-8287-3d138a8645b9.json | 132 -
 .../8039cadf-6644-44e7-8452-90e9c8069e28.json | 132 -
 .../8914d89d-c873-4704-998e-dc807e96030b.json | 132 -
 .../c2e9fc29-db07-4b49-a98a-084158831ac4.json | 132 -
 .../58724539-6fc5-40d9-ba43-87410959894d.json | 132 -
 .../b13324cf-f6f5-4bf1-9cf3-c196120c4bcf.json | 132 -
 .../782b2df0-d1b3-414c-a4bd-59052a4441a9.json | 132 -
 .../b508e41e-0f1c-49ce-8b80-5e7ec82b8f15.json | 132 -
 .../2824e8d4-2749-4b18-a3a1-b987ed215ac6.json | 132 -
 .../53176984-ba93-4a64-b81e-21f6e0f65bcd.json | 132 -
 .../53252698-7d17-4f2a-9106-3b744ae7a985.json | 132 -
 .../6dd0f3a2-27ee-48f1-9d97-ef6954d298c8.json | 132 -
 .../35f11d5e-88c4-4a95-8d06-a40bee648b00.json | 132 -
 .../ba1193c0-42b8-487d-b9fd-ddbc1fd15359.json | 132 -
 .../95733620-e1e7-4442-b9c3-a699165df5e7.json | 132 -
 .../cacfce0d-f5f1-4101-8065-f5f02eaab1fb.json | 132 -
 .../72be5537-198a-43e9-9840-a803083158d3.json | 132 -
 .../2e9a3443-970d-4f37-a356-277a11c81754.json | 132 -
 .../1188402f-aa1c-4306-b031-c92ff0a5dd64.json | 132 -
 .../ee2f567a-6403-46d5-9a6b-bd029f81d660.json | 132 -
 .../d809fdff-f5ff-44f5-afc7-7e8af9ce2f93.json | 132 -
 .../87d66efc-173f-4c14-b76c-d8b7e00d575d.json | 132 -
 .../47f62378-c3cc-408f-a0d1-71eb3f522f57.json | 132 -
 .../dba8c12c-388d-4f8b-8ce8-83acfc4920c7.json | 132 -
 .../e4087285-1d1a-465e-ac88-91310e939710.json | 132 -
 .../09f189d9-74fd-47bb-b5fb-7994cba56ae2.json | 132 -
 .../5754c262-6ddf-4f54-9722-22ff20a8d76f.json | 132 -
 .../cc1bd811-ec88-4514-8b47-4140ded4f03d.json | 132 -
 .../3f08155d-8551-4472-86fe-7988cd6df78b.json | 132 -
 .../339e12fb-b4a4-4a4b-bb40-899b4ad833f9.json | 132 -
 .../4fd60e9c-5c90-492a-b24d-7ca6d1e91eae.json | 132 -
 .../7f8d935e-3782-4769-8bd0-ee8a0ce91cd6.json | 132 -
 .../6fa07e60-9f82-4abc-aa45-4dfc0bcf9b8d.json | 132 -
 .../99a0022b-3fe7-4612-9cbb-cf082c1f6b70.json | 132 -
 .../b1153714-d6fe-4ff9-ab8c-85b677d57f8f.json | 132 -
 .../c3d39b6c-02af-410d-8a5c-224495b04572.json | 132 -
 .../0426fcba-3db4-492d-b622-e34ab8d3fc8f.json | 132 -
 .../aa099cfe-ac9a-42dd-8357-f4d8115133ca.json | 132 -
 .../ccbc8a5e-9a97-452a-b023-cc996ffe31f1.json | 132 -
 .../b359a7a3-cf2c-4952-b308-333672dadcec.json | 132 -
 .../0864d5cf-d6fe-42bc-9059-9f2e5ff06b60.json | 132 -
 .../e6ef2559-8a63-43e3-a60b-0d2b7256ad3d.json | 132 -
 .../45d019ab-b23c-4fc3-baf5-d57576e9945c.json | 132 -
 .../e3cd7c32-e5a1-4cd6-a9dc-95364a8abe75.json | 132 -
 .../9be442e8-4b77-43e0-a981-887338e59b78.json | 132 -
 .../a07b6326-f393-490e-b696-d8b45f593d4b.json | 132 -
 .../b66ed91a-98d5-407c-9896-9c2e2a31e9da.json | 132 -
 .../9c70921d-956b-4727-9201-1addbd01bb8b.json | 132 -
 .../4ba6d51e-314a-4db4-9552-568a4093e01a.json | 132 -
 .../835f5056-56bf-4a6c-886f-fbe6f263ac07.json | 132 -
 .../c2a63afa-9d25-41dc-b25f-848f5a640501.json | 132 -
 .../f64f9d24-e448-4bb6-89c3-edb66499bac9.json | 132 -
 .../2de14bfb-844a-4711-815e-8f63487a78fd.json | 132 -
 .../f953e0e2-ddca-42a2-a0f6-752a137bc6b5.json | 132 -
 .../98187b98-0cc8-4756-9cb7-c53deb998f90.json | 132 -
 .../8c79c60d-ebf4-4409-be4f-928a54cedd1d.json | 132 -
 .../5d5cebeb-faf0-4fdf-8749-6307080e82f2.json | 132 -
 .../e926ce8f-45bb-4f3d-b579-ecadb3df6468.json | 132 -
 .../070609d6-5f41-4712-9ad7-e215b1a6bb81.json | 132 -
 .../8d2909c7-37f2-4198-a1e2-4bf2ebc1444d.json | 132 -
 .../53587959-25f9-43aa-a34b-f274d8bc93af.json | 132 -
 .../2a7f80ed-d404-4c81-b000-b65c83069121.json | 132 -
 .../f0983645-4adb-4ddb-bf2f-33480cb7f421.json | 132 -
 .../161dadfe-4983-4f56-8a7d-9b97f1c5a3c7.json | 132 -
 .../694a02f9-4729-4d0b-97ce-80adaef29be2.json | 132 -
 .../0521f51d-22c1-4821-8f04-23c533411668.json | 132 -
 .../8fdea71b-5e68-4a78-aefc-8a00650464c4.json | 132 -
 .../e2ba5674-9251-4a4e-9eb8-046c834da400.json | 132 -
 .../4caafdb2-3065-40d4-b5a7-9deb41e1d8a7.json | 132 -
 .../886e0b8b-b2dc-434f-a299-50f668006241.json | 132 -
 .../7a6a9443-f331-4dfa-acf9-6aa30049bade.json | 132 -
 .../6d523da4-ec4a-405b-a25d-afc7b1b5aefd.json | 132 -
 .../cfecfce3-090d-4c2e-826c-03c0c5337e98.json | 132 -
 .../5aa124dc-4abd-4c5f-b40a-a8d81af922eb.json | 132 -
 .../ec91b122-c8f5-4dfb-94fd-336ef78c3e14.json | 132 -
 .../114f246a-6049-40bf-ad86-9a822d13cf74.json | 132 -
 .../82d28a3a-44f2-463f-a1b8-7e9079ec47b7.json | 132 -
 .../ed3c1349-a154-4866-890f-2b115ffaf127.json | 132 -
 .../47942c55-5ddb-4fda-9c5b-34676ae2046a.json | 132 -
 .../d860210b-4c8a-4d15-ad3a-4e39905f91ed.json | 132 -
 .../d137f429-2b65-4ee9-9d66-3f619b270fad.json | 132 -
 .../1da10dfe-b0a3-4cb8-aaa3-e16d48f3aab4.json | 132 -
 .../6156a0d2-4c32-40b2-9624-ef0c7a6a95bb.json | 132 -
 .../676342d2-f37a-4b6a-967d-3ac750243470.json | 132 -
 .../950b7108-0192-4875-b4e9-c3e43ab71e08.json | 132 -
 .../85672df5-2f35-43be-8648-9937c66872dc.json | 132 -
 .../051c5642-3b23-4879-9d10-639d1b3127d7.json | 132 -
 .../2acf0d12-7e0c-46dc-a079-ebc48a8818d3.json | 132 -
 .../8ce42090-006e-4e08-8d3f-5b1eb0b8da0b.json | 132 -
 .../703df6c3-dae4-437f-9379-f8c264797adc.json | 132 -
 .../1e349ad3-d29b-4a4b-97e7-b82055e41b07.json | 132 -
 .../8f677a76-932c-4c35-9708-4b723226aa19.json | 132 -
 .../ebfe625f-ff1f-45f9-826c-9351ea4134e1.json | 132 -
 .../66e6a757-ac22-47f3-82ce-81af45e1d3cf.json | 132 -
 .../1cd840c7-d432-495c-a3df-af1fa6264259.json | 132 -
 .../066f520f-9a64-4564-abfc-6435732c3585.json | 132 -
 .../aced5181-040a-48c0-bc5f-78d0de3afae8.json | 132 -
 .../a4889a38-84d2-4ae1-b8a9-297b4400602d.json | 132 -
 .../d540505a-c67b-4b72-a53a-c03aa6f8d3e7.json | 132 -
 .../9859afee-02ca-4c48-acc8-acfd20c37e4e.json | 132 -
 .../e222d12b-c796-4890-a584-cd689bae7ea6.json | 132 -
 .../c16850f8-0b80-4455-8f38-8ec453cd1d41.json | 132 -
 .../0d400b0f-cc82-4c86-b600-93a31b133f9d.json | 132 -
 .../90f6f8f1-02fc-425a-8499-e9b43ae8ac59.json | 132 -
 .../6704d6bc-6d38-4c59-87a4-81d3eacde3b1.json | 132 -
 .../e8ad6ce4-7efc-499e-a2c9-9e0df898fbb9.json | 132 -
 .../5e9c1273-536d-4280-8fff-9931f46dc968.json | 132 -
 .../460ca160-ac34-4091-ba2d-986b53532b55.json | 132 -
 .../ef9d2fab-07a2-44e2-aae2-ede5a2ff31d9.json | 132 -
 .../a29a69d3-d64e-4463-aa52-0a9d6d012c98.json | 132 -
 .../4539c16e-1ac6-47f4-88eb-a09842497330.json | 132 -
 .../2ff33c55-1236-4c57-8809-2d3076e43cc7.json | 132 -
 .../281ba822-49a2-4746-bc04-8de046439508.json | 132 -
 .../0606d916-95ea-4318-af0c-3942329071c6.json | 132 -
 .../005159f0-da68-480d-972c-c160d145a682.json | 132 -
 .../2f6abb5d-52b3-44b0-b960-115793485fb1.json | 132 -
 .../6ffacad9-1a4d-472e-bbbf-0d64d068dd0d.json | 132 -
 .../26eadaf8-bfb8-4aad-a8a4-90699b6f0fcd.json | 132 -
 .../d4536913-5708-45e4-a024-45ae37fdae13.json | 132 -
 .../848860aa-7de3-4fae-afca-ac11224b96c5.json | 132 -
 .../0241a8e3-d6e5-4ba5-afb9-862bde2ba851.json | 132 -
 .../20b69120-d476-4e34-b3c6-8cef11d6ee78.json | 132 -
 .../696bbbfc-49dd-444e-a90b-76821845a726.json | 132 -
 .../e6d974d3-467e-4fe7-bd84-79fc7c72cde2.json | 132 -
 .../b26ba2b7-1365-4b1c-a1be-35d588e02d36.json | 132 -
 .../64bd755d-ba4b-4559-ad8e-f56c697b1ae6.json | 132 -
 .../c4e572cb-1d12-4baf-a4d8-a55422692207.json | 132 -
 .../c6123e10-b1f9-49dc-888b-083881e6ef09.json | 132 -
 .../e1647f10-fec5-463d-b8e5-6b2b880bd687.json | 132 -
 .../6d5fa235-8d69-456e-9f23-0f702760baf4.json | 132 -
 .../e8709a6a-a2b8-4b09-9342-d1aeae89de1f.json | 132 -
 .../603e95c9-7e7f-4892-93f7-92f92b256865.json | 132 -
 .../3e2fd38a-186e-49aa-915c-7eb3cde50562.json | 132 -
 .../16d55e66-9015-4d72-81e4-3f14c42b0368.json | 132 -
 .../696644b9-bd40-4047-bb85-0cb19510a96c.json | 132 -
 .../cbae8c39-0aec-4859-98bc-3b2d065833ad.json | 132 -
 .../15fb3cc7-1ba5-4ba5-ba02-8e8a9d2029d0.json | 132 -
 .../357f6051-b880-48bb-8e68-e4b0a7a0cbcc.json | 132 -
 .../a50a542b-668e-47b1-a37e-805a58eea3d1.json | 132 -
 .../00f7bd51-0b31-446d-be8c-1e0dc0d82e54.json | 132 -
 .../26782941-b918-44c5-a7f6-5f770e47c3d6.json | 132 -
 .../5547ddaf-8fbb-4259-8b88-e946fc3d2404.json | 132 -
 .../bee5ea59-b97a-4783-b763-b6bd432d4558.json | 132 -
 .../8150333f-8e79-4230-af8b-7ddb1d5eeb21.json | 132 -
 .../be8510a9-ecd4-4ac7-9930-3200cacb7b50.json | 132 -
 .../887e4574-f876-4e75-afb8-e543bcb30020.json | 132 -
 .../fd21d8bd-28cf-4b91-8075-c38a61f5f32a.json | 132 -
 .../c0f05e38-6592-478a-9c46-26567f24ff85.json | 132 -
 .../06cc2913-8e05-44bf-a128-9a7c4aeff536.json | 132 -
 .../86368d5b-0509-4b52-b988-58bcf7e1043e.json | 132 -
 .../77b89fe6-464b-4017-a77f-8750e2668a82.json | 132 -
 .../d2e47d86-23dd-4c95-a7fb-99518615d09f.json | 132 -
 .../0a09891e-ac97-4c3a-8364-7106a851f1a8.json | 132 -
 .../eb41fe62-ac46-4630-bb2d-6b907f271737.json | 132 -
 .../d540a6c8-e9ec-4413-b9d2-dee68533c377.json | 132 -
 .../5b1f413a-05c4-43be-bdbc-9de5728e8d0a.json | 132 -
 .../6701738c-27e4-4bbd-b614-fbc297c3164f.json | 132 -
 .../7f4563b4-0b25-49e7-ac1c-afaa28b0eda2.json | 132 -
 .../32b6e4af-69ba-49b7-9367-dfafe3e390e8.json | 132 -
 .../e16deaf7-da55-40ba-ac18-860fa3f14d34.json | 132 -
 .../8a7a5886-0618-4615-9cdf-46f5d19a29fe.json | 132 -
 .../66d18e5b-9ebc-4ab6-94fb-6d5c23c58672.json | 132 -
 .../a36aaaf6-2478-4b98-ad0c-2b06ddb8c308.json | 132 -
 .../4a6237a7-019c-4310-971e-84b08d1b5067.json | 132 -
 .../996e781e-5939-41ac-b347-95c99037c34a.json | 132 -
 .../e880fa0e-ae49-4398-91bd-eadf8695425f.json | 132 -
 .../da04ff51-fbeb-41a8-ae5e-8ddf5925b792.json | 132 -
 .../6d709396-1ae1-4e5c-a03c-13c1e9425202.json | 132 -
 .../5b616df9-e15a-4f84-98b4-c2cb532c1b95.json | 132 -
 .../0f6552d9-3cbe-447e-909b-068e5ceed4c9.json | 132 -
 .../2861aae0-d2ec-48f5-bd20-9e7bcaf8dabd.json | 132 -
 .../51a64f37-256c-4fe7-b28c-6117520f04ec.json | 132 -
 .../03ce9c1d-38e8-4a6c-b293-57428a9d7c0e.json | 132 -
 .../3b0f5dea-db9b-4657-9807-6b3e56d38823.json | 132 -
 .../2d19e9ff-e331-4171-ae90-47e44f3f8885.json | 132 -
 .../6bfb8b24-1abd-405b-b01d-7d7111705dbb.json | 132 -
 .../c83e6b6c-c8be-4d97-9c65-2d883f88f37f.json | 132 -
 .../72569796-1b11-48cc-ada7-e8c09522dd54.json | 132 -
 .../58403e30-bd2b-4f4c-ad41-daa890c77d40.json | 132 -
 .../eb8e1f1d-c6b3-407c-b172-d240553d2f89.json | 132 -
 .../356d75a0-6520-46c1-afa9-7dbb2596a5c1.json | 132 -
 .../78681e0c-5fe2-4920-af7b-99345cea3efe.json | 132 -
 .../ba0ee5b4-070a-461d-a3d2-cd4036387cc9.json | 132 -
 .../17d0d377-bca4-411c-be11-6c5cfce07798.json | 132 -
 .../d01a56a1-1eb9-4ccf-8c09-348b6ba5480b.json | 132 -
 .../389821ff-d8e2-4d1d-8fb2-57a689867ac5.json | 132 -
 .../7913f782-29b0-48bd-bc62-37da9a5ac7d9.json | 132 -
 .../b0930974-999e-4372-9d21-b9790e0bad4c.json | 132 -
 .../8265f577-f504-4a56-9cf0-42c34766559a.json | 132 -
 .../82044cd2-1a46-406e-bc68-397ce41b29ea.json | 132 -
 .../de09e323-8cf1-4aa9-9537-e8ad30a8c297.json | 132 -
 .../bfe543b4-ec38-488e-ae04-125cd358b61f.json | 132 -
 .../be36d8ae-b81c-4b4e-aa2f-5999c7582237.json | 132 -
 .../342b435f-89e9-48ad-ab0f-2c1f52f4571a.json | 132 -
 .../b0c8737d-d838-4da1-909b-b218e22119dc.json | 132 -
 .../4cd40f28-842f-44d5-9eb2-86238077fc55.json | 132 -
 .../0758051c-2d75-402e-af0e-769096cbb17c.json | 132 -
 .../c93f610b-fb97-4ad1-b8af-fc41c6d8da33.json | 132 -
 .../b8467118-d895-41fa-81c7-89892e1844d5.json | 132 -
 .../30d867bb-63c6-48d1-8d43-6c24f4cf44ba.json | 132 -
 .../89b92cda-c5b6-45ed-a534-361c9d34794a.json | 132 -
 .../48cdf76a-886d-41ec-8580-00ed4232b601.json | 132 -
 .../116272d4-d25d-49cb-80cb-ff26a0fb3cf4.json | 132 -
 .../bb103828-70fe-4767-9302-6750d839129e.json | 132 -
 .../7b58ab54-239b-4e49-93f1-c3940df61474.json | 132 -
 .../559067a2-816c-4091-893e-b1c7860171ec.json | 132 -
 .../ec502619-880b-4b7c-acfe-c43cf6514e3f.json | 132 -
 .../6941a5dd-2a70-4846-a5f6-b16ef2d56a03.json | 132 -
 .../636e2f93-3242-491c-9df5-003aa1dacecf.json | 132 -
 .../1f4efa23-816d-49be-8659-feb003f4b3ef.json | 132 -
 .../d05be1e4-bcac-4b4a-bbde-8b17a5a71243.json | 132 -
 .../9ab53055-86f5-4a88-976f-015dd9c9e832.json | 132 -
 .../ba34083a-9b13-46d9-8f36-aa3ddd586711.json | 132 -
 .../6a39d734-ad73-4c4a-9583-3563e336d4b3.json | 132 -
 .../2af71e88-4931-4359-b92a-c64fa33df802.json | 132 -
 .../bf9336a7-a7c4-420a-9dd0-68d8e0c815c4.json | 132 -
 .../2de872b2-10c7-44dd-91c3-f20205207da6.json | 132 -
 .../5cabed09-d8ea-46c2-bb78-012dac954d6b.json | 132 -
 .../8236db6a-ff8a-4237-af5a-03bb258f8e59.json | 132 -
 .../1a7b078e-bc1f-400f-a0cd-f7b535548f23.json | 132 -
 .../fdaf561c-567c-416d-a74a-ac3c07c5be5b.json | 132 -
 .../58900b3b-303b-49c8-b807-7b8d06601568.json | 132 -
 .../7ac5a45a-7b41-4f63-8556-8737638a00ea.json | 132 -
 .../3cb55475-30c8-43c8-8d7d-394450fdc117.json | 132 -
 .../f5e140ff-0c0e-4769-8116-63cf50255773.json | 132 -
 .../df85ec6e-1325-40ce-8087-d960a1d767dd.json | 132 -
 .../a7bd3fff-f01e-46ca-af85-5b4ac6ae7320.json | 132 -
 .../11842dd9-0572-41ef-aaa0-8d19f3420efc.json | 132 -
 .../01abccec-1cea-4060-89be-289987d0a2ce.json | 132 -
 .../dce8226c-57bd-4255-b813-8a70494f0a1a.json | 132 -
 .../7f80e69c-eec6-49ac-a088-6248ee25f736.json | 132 -
 .../e0267a2c-dfc5-456e-864d-b5b0ad1fa508.json | 132 -
 .../e6ad37be-28f4-43b4-9df1-b7b47d31232e.json | 132 -
 .../5514368a-1f7d-4cd0-b7f7-d116b753f975.json | 132 -
 .../c0e29cf8-897f-4e07-abb4-71c801d34301.json | 132 -
 .../68310379-65b2-482d-892b-f76547bce2b0.json | 132 -
 .../a034c4ec-d4cd-439b-8dbd-e67685ea7616.json | 132 -
 .../e4b761d3-bb84-4433-b9fb-4c92ecae6279.json | 132 -
 .../38d78d30-be6d-476c-a3aa-d9a40f570a56.json | 132 -
 .../36e60f6c-60f7-4b17-88fe-82810e195fc7.json | 132 -
 .../a6c647e8-ed24-4150-8563-dd9b20e21498.json | 132 -
 .../b5a366ac-d736-4447-a2f1-98d0b84ba3bd.json | 132 -
 .../5d098dc6-8124-4d26-86ec-d54e6e09c3a6.json | 132 -
 .../1137cbc4-d80b-4e21-bfeb-feab41dc80b2.json | 132 -
 .../097bbfbc-0ccd-4fd4-9e0c-9c192cba9e8b.json | 132 -
 .../db8c6169-bfc1-48bb-be53-fa93c673f051.json | 132 -
 .../41437fc9-6d48-4317-a8de-ab4e63b2cf46.json | 132 -
 .../e075f4fe-95e0-48f4-94c4-f6ebd3f4edaa.json | 132 -
 .../3349d66c-e12b-49c1-a406-e0e77b697458.json | 132 -
 .../7aa0ff6b-11a9-4554-a27f-e477a0ff77c7.json | 132 -
 .../ac749485-df6d-485e-8fa7-63bdfd744167.json | 132 -
 .../54363a4b-312b-4035-a1c3-b5321311cec4.json | 132 -
 .../aa9e2b9e-cd25-4492-9801-eba7d40b4365.json | 132 -
 .../c6b484b8-f6f3-4516-aff5-c2f6438c9047.json | 132 -
 .../c6c760c9-a345-4e25-b333-b403bf6db389.json | 132 -
 .../65b2aa58-2c04-48f2-9ea3-c8fd97cb9dde.json | 132 -
 .../92903344-0dde-4f5a-a7d2-749a1ffe9cd3.json | 132 -
 .../59ddd478-c1cd-4bd8-80c3-fdebe762414a.json | 132 -
 .../02f63fc6-9376-4fb5-b067-63493238cc27.json | 132 -
 .../dd7597fd-27f5-4e77-a44f-b01d0db82719.json | 132 -
 .../20cd0d60-eb0d-41bd-b37f-910a03dd7f82.json | 132 -
 .../c4e9d045-3769-4828-a2ca-7fa508873089.json | 132 -
 .../0a0501ec-4ecd-47c1-914b-d473f795cef2.json | 132 -
 .../beca755f-203f-4bc8-b5cf-f9a9e3f8bd8f.json | 132 -
 .../79e1e1c6-cbe0-43a9-a593-8e2119baaf77.json | 132 -
 .../def80b44-3d9a-46ba-bf5f-ffc81e50af2e.json | 132 -
 .../5e1aa809-ef20-445e-a05b-eccd585d5991.json | 132 -
 .../7c2be651-ca56-4285-afc7-1bfe1c8ce11e.json | 132 -
 .../cfe4ea72-ddb9-49b5-9599-99f215e112e5.json | 132 -
 .../81d63d8e-88dd-4b16-b9b8-d07604878f8f.json | 132 -
 .../81f8208b-f7e7-4685-bb84-321d9e097470.json | 132 -
 .../a0c9a434-9b8c-47c5-b511-9daac7901686.json | 132 -
 .../28b60eae-1b38-4404-8db1-3fb2997583f4.json | 132 -
 .../746862a2-a90c-4612-91d0-f989b9eed1a5.json | 132 -
 .../715ee057-9c9a-4e04-991c-7040b1eef65b.json | 132 -
 .../4dc1d103-3458-4b8c-9e63-b98effd69667.json | 132 -
 .../070ff2a5-9a5d-48cf-8517-1ad9b6642d59.json | 132 -
 .../8406a5b8-a87d-489b-b75b-00e9f675f09f.json | 132 -
 .../11e8f9b6-32ab-4b83-a601-e5644c0b2c39.json | 132 -
 .../6b542f5a-ea62-45ce-8e98-436a4d058877.json | 132 -
 .../9b280640-bfee-4730-acc3-386a54b2434c.json | 132 -
 .../eff5171b-6119-4013-8aa8-8a4f0215b045.json | 132 -
 .../471c5fed-f155-4521-9d9c-b5370ca91bec.json | 132 -
 .../690be099-3ace-484f-b01f-2fe6b324d12a.json | 132 -
 .../71fbd15f-5eec-40d9-84e8-07323f3ffac6.json | 132 -
 .../eb93dd3e-3d13-4234-bb66-f6177648aa2b.json | 132 -
 .../f7ec1ed7-cc30-4879-8ab1-4909011553d5.json | 132 -
 .../3e100704-dbd3-4d05-b325-5bb4bc90e51c.json | 132 -
 .../12f003ef-1098-4d3f-aed7-7343034157bc.json | 132 -
 .../9de2e564-3a30-4f1c-80da-6432a245a64f.json | 132 -
 .../dd5aaa3f-b24b-4a5b-852b-b80f4a6bf366.json | 132 -
 .../8d8b9fd2-43f6-4edc-8340-44d20824a7e7.json | 132 -
 .../7fe45c20-a2c0-4acf-9425-651a1ec3b0d0.json | 132 -
 .../baf93ef6-56f3-4809-93f6-32dcf4730388.json | 132 -
 .../f6df14bd-207c-4fea-b789-c9f9aef749b3.json | 132 -
 .../97766a7f-cf5b-46ae-b51e-5c5702ae000b.json | 132 -
 .../d5cd2a1b-3def-4b33-a8fe-4b02e090db27.json | 132 -
 .../275d4bf0-566c-4b50-86b9-38c7f45df143.json | 132 -
 .../aa504db9-81f3-424f-b7d9-683ebe31f5d8.json | 132 -
 .../2cc209b7-ef10-435d-a840-b904ab741491.json | 132 -
 .../9b9390ac-fd65-4a58-9834-5352aa340cdc.json | 132 -
 .../4efe5cd4-6b8a-4951-a63a-4c7dc390bbec.json | 132 -
 .../4bc5a0db-1c88-4c61-9343-1d340305ecc5.json | 132 -
 .../74527f51-dcec-4b82-8ba8-075c933404f5.json | 132 -
 .../ac31bc90-3854-4d38-925d-ef8dc7e75d24.json | 132 -
 .../88583cff-1adc-4b1b-8e68-07f0074d0ae2.json | 132 -
 .../fadbac9e-7224-41d1-abfa-7039cbcba9f6.json | 132 -
 .../1fb90540-0fa0-44ca-ad67-1e3503f6b729.json | 132 -
 .../047784e2-c1ee-40d9-a60d-e43504825801.json | 132 -
 .../ee60453d-2d51-46f7-8a18-c651d590f0e7.json | 132 -
 .../b0ac4b11-f7b4-4753-baae-310a92f08259.json | 132 -
 .../324db8b3-38c7-4a2c-82e8-7bebfa38e760.json | 132 -
 .../54dd9033-61b9-4f26-9cde-e04c7136524b.json | 132 -
 .../d0973d6c-373c-41cd-9e62-52470c044dac.json | 132 -
 .../da15da67-b316-4c2e-86a5-c1f88eece9cb.json | 132 -
 .../b0c34174-bfd0-4556-a3bf-92ec0ddf5ec4.json | 132 -
 .../bce7b15d-1670-46db-bdff-24fb38bc3fd9.json | 132 -
 .../15e5e02f-27b9-4063-b601-42c2b17180f9.json | 132 -
 .../51b0c546-0dde-4668-a8b8-3b9753a31aa0.json | 132 -
 .../45842b1c-cf68-44a7-928f-2da454cdd13f.json | 132 -
 .../c15cdefd-dbe3-432e-aab0-3c43540cd320.json | 132 -
 .../1f489afa-a01d-40f3-836a-9e386c502d1d.json | 132 -
 .../94bcc87e-eb06-4321-9b72-2f99168cf92a.json | 132 -
 .../c0bc9811-4d7c-412f-a12b-3e6eab2e5a6f.json | 132 -
 .../b5a8b278-69e9-41ba-89ee-8fd6b2d90a1c.json | 132 -
 .../a3ad7f0f-64bd-42a1-bc7d-d7d4cbbd80fd.json | 132 -
 .../f07c3a4a-2a8e-45c4-a726-be95726df2db.json | 132 -
 .../f36d56b8-cd77-4d69-a51d-39025bcfcdfd.json | 132 -
 .../65acabdc-ea5f-426c-820b-2b79f2b20b44.json | 132 -
 .../96b00cfa-1383-4b36-a043-17eb39678ffc.json | 132 -
 .../3b8a796e-6bde-4506-8335-bd3cc72482e1.json | 132 -
 .../a93e99e2-ca13-4cdc-9904-7ae5cc82c623.json | 132 -
 .../65d9e237-2757-459e-94e7-e382213e4eeb.json | 132 -
 .../c3f44524-4c75-4cd0-9f5d-79c8b08f6f77.json | 132 -
 .../2e7d3674-d0b0-4b87-8bd8-8202114b7665.json | 132 -
 .../30d21295-beb1-4179-8c6f-7bac79b29474.json | 132 -
 .../e2fc95de-b9d9-4043-b55c-aa2819d4f52f.json | 132 -
 .../7fbd7f97-baf9-4acd-ba0c-90ffbf0c47a5.json | 132 -
 .../336effcd-d8fc-4477-846f-70fc40bdc111.json | 132 -
 .../28f87820-d587-498e-b713-7c0af0cdc324.json | 132 -
 .../f1b671ab-ebb3-43ec-86fa-832982d04cc1.json | 132 -
 .../327cde83-d107-4455-bc03-7e03026c52e6.json | 132 -
 .../7497b8fb-9a7d-46dc-868e-1a2bbcdc7860.json | 132 -
 .../92c8afbe-7735-40c8-af0e-29da687c2070.json | 132 -
 .../bca052ac-6556-49d8-94e3-f4bda560a5d3.json | 132 -
 .../5f74fe6e-8575-4cea-959b-e6ba03c7e273.json | 132 -
 .../b0f696f5-ed70-4293-999d-a9121192c137.json | 132 -
 .../18751a6f-062c-4915-bbe0-ae222cf9ae0b.json | 132 -
 .../398ebe04-638f-4a11-b99d-6778ff3ff97b.json | 132 -
 .../b4f197f2-3456-4221-b222-10dfbbb50f56.json | 132 -
 .../0a2fa86a-f9b3-4a49-b215-4cd3ee9b4c22.json | 132 -
 .../1561ec50-1cb9-47ce-9db1-09efe9c3fc61.json | 132 -
 .../496525ff-394a-4b7b-9d93-f5b38d2a1ee3.json | 132 -
 .../37071760-d24c-43cc-9965-d8c7873c0ee8.json | 132 -
 .../91a71a49-5dd4-43b1-9e1c-fd9492236712.json | 132 -
 .../d1d48abb-6dcf-4905-958f-c3a3e75feac6.json | 132 -
 .../68282f29-f56f-420b-bd1e-9cc54783c1a5.json | 132 -
 .../cd1c84dc-6c6e-4789-add7-0e3ca783b0ea.json | 132 -
 .../22a9d3b8-ac45-4433-8926-5d28681af922.json | 132 -
 .../57c4b9eb-dffd-4623-a2d5-b2374d3c9109.json | 132 -
 .../24adbd8c-df3a-4b58-94e6-61a3dfa6828e.json | 132 -
 .../6ed62f64-c2be-4bca-b17d-bd0184a3d498.json | 132 -
 .../db9e4d03-03a8-4a10-8739-16bbcfbb06d4.json | 132 -
 .../7b0fc4fe-51c8-4f01-b07b-5bca05b40859.json | 132 -
 .../6f286418-d8e3-4c11-8941-cfe5a18b1037.json | 132 -
 .../b0a83b1f-3af2-45e8-9d88-d7302a529112.json | 132 -
 .../0462fce1-51b4-48d8-8278-a90048ffd637.json | 132 -
 .../e02f597c-c368-4223-ac90-c99d82c90634.json | 132 -
 .../32e63ffc-c64e-4562-ba99-14873f5bac2e.json | 132 -
 .../6af4faad-05c2-488b-9685-e11ae4e1cbf0.json | 132 -
 .../8aa7701b-7019-44a0-851f-cfc9108fdfbd.json | 132 -
 .../a2f95fad-5ab5-47d0-b9aa-33358c673caf.json | 132 -
 .../aef73a77-9df7-4d4f-89ef-50905d326198.json | 132 -
 .../e9ffdfb6-6f91-4bac-89d2-40b1eb43f3ee.json | 132 -
 .../8ff39438-907c-465f-ac7a-5a25cfd8d824.json | 132 -
 .../83d831c5-a74f-4699-9961-664a7a51b7b8.json | 132 -
 .../83fb88ec-f640-4c1e-b71c-53a123fc4c2e.json | 132 -
 .../3811cc34-45cb-4932-b862-39bf042331e0.json | 132 -
 .../5b2a16a1-7a2a-40b7-add6-b99378b6af00.json | 132 -
 .../1dc2a5bb-40b6-401e-8f1c-6110cb4c0f0d.json | 132 -
 .../742e0a1c-7496-4076-bdbf-ada0a8e528c2.json | 132 -
 .../f0664035-3256-444c-b848-ef603e0d46b5.json | 132 -
 .../9159aaa6-8663-491f-901a-74da4c343d20.json | 132 -
 .../5179b145-9fdb-4ab5-8cca-87966ecf6519.json | 132 -
 .../da872193-1d25-4e8e-bc22-9138a9d121ba.json | 132 -
 .../967fdd26-1f8a-40d6-8f7d-ca731c7ef2e3.json | 132 -
 .../dd615b4c-189e-4361-bcf4-879fd59b28a2.json | 132 -
 .../0aeee3e8-00ce-4f95-bbd9-307d93a194a4.json | 132 -
 .../8c583b51-4349-48af-98d9-8eaaf43d60b6.json | 132 -
 .../34aab556-5e97-4ea2-9ada-d17dc3624be2.json | 132 -
 .../fbd9d5e3-15f7-45ce-92fb-368b3bfcc526.json | 132 -
 .../b177e329-ce6b-4bc6-aeac-1c01306e6b1f.json | 132 -
 .../7f371c11-e8f0-4233-b359-aac39c0a1110.json | 132 -
 .../9f758d4e-d121-4688-8ece-8dc67a499811.json | 132 -
 .../903b8c71-d54d-4ce4-9845-71eb8ca8733a.json | 132 -
 .../9bdc17bf-7b81-49c8-81f5-c6dfa31b449b.json | 132 -
 .../28109e00-87c1-4809-a4fc-dddebba52621.json | 132 -
 .../6a21381b-426d-4a5d-ad6d-2aeb57ed14c5.json | 132 -
 .../03a8091c-473e-4fbe-af70-35f791a23a0f.json | 132 -
 .../ed75e9ed-841b-4783-a201-bc72651afd0a.json | 132 -
 .../38cd418c-9770-49d2-8b30-ac47e445cee3.json | 132 -
 .../d49b6a48-ae81-467d-87c5-b17f9ca306f8.json | 132 -
 .../39b7e250-9f71-4833-941e-85692a48b6e6.json | 132 -
 .../c0d102a2-ff8c-45ac-a825-31472b98b871.json | 132 -
 .../7c5674a8-6a1c-483e-be9c-b0a6d00d3ac4.json | 132 -
 .../d34b899e-b067-4c9c-9fa2-439f8b2d589d.json | 132 -
 .../8c7b2332-510b-42d3-bcbb-e177c35d27d5.json | 132 -
 .../685f107f-e431-4dba-a117-8d6f1dd2c296.json | 132 -
 .../e1570804-85b6-4518-a099-5f21ab27d12c.json | 132 -
 .../a779ebec-76ab-4a1e-aa4f-d1a6adfe2d5c.json | 132 -
 .../1ed7f6ed-d04d-4cfc-a36a-1ef0f72d4814.json | 132 -
 .../c901a9ee-069a-4e3e-ac52-3017d67d8800.json | 132 -
 .../08317b59-ff74-43c8-bea5-2a266c38816e.json | 132 -
 .../4106d4d3-344a-4c1f-b9ce-a3140d435013.json | 132 -
 .../2b308fad-8494-4056-8b84-82733cd2710a.json | 132 -
 .../93c867d0-4f10-440c-838c-91d1633fe584.json | 132 -
 .../1a4a69c5-4acc-4ad9-adb2-bd9cf0fa2875.json | 132 -
 .../151226ba-9744-45bc-b923-30df57f7aa3e.json | 132 -
 .../98363657-0793-4eb3-94de-28961afc92ea.json | 132 -
 .../a32b4ded-6bff-441e-afbd-736e6d8cce5c.json | 132 -
 .../326bcf4a-02e9-4218-8bf2-55a94a79435e.json | 132 -
 .../145facc2-ab11-4c68-b841-762e0ad9bd5a.json | 132 -
 .../d3e6aae6-9284-4309-8d8c-02c9e797a58b.json | 132 -
 .../6ee8537c-90e8-4455-83ca-c8c375a5ead7.json | 132 -
 .../6efbfb38-57e5-46c7-b765-f7d0356afb97.json | 132 -
 .../f4d418d9-1089-452d-9c7f-4cc4712e6ac7.json | 132 -
 .../1c9b325b-92b3-499a-a3ea-026269c63c88.json | 132 -
 .../c546ccde-cef3-4de2-a49f-24517d76dde5.json | 132 -
 .../e85d3ccf-f48d-4e5c-b893-771a107773d4.json | 132 -
 .../b8d22ade-874e-4ff3-9fcd-dbe14220d48b.json | 132 -
 .../97e8e7e2-74a4-42a5-a0b1-250e47d3c3e6.json | 132 -
 .../b2d56bb6-a726-4e47-8bc6-c016a51aac5c.json | 132 -
 .../3366f6d8-41bc-4c2c-a72c-bc0fd7dc8dd2.json | 132 -
 .../7ba52efb-3890-4691-8740-9f051f1f645e.json | 132 -
 .../7b192b49-057e-418a-b47d-44b0ec82a6b6.json | 132 -
 .../f2120d53-bef6-44d6-84a6-a6f8e3537188.json | 132 -
 .../f5408aa9-85c8-46e5-b225-0480b2e18e97.json | 132 -
 .../c1918f55-286c-4b29-ac53-2ee8f9d36d9e.json | 132 -
 .../52659d37-67f8-45b8-88e4-11917dc90488.json | 132 -
 .../556ae77c-effe-44ab-ac4a-1ad7cbd7c363.json | 132 -
 .../048fc971-3baf-4740-a132-2f9476d01b7a.json | 132 -
 .../abd28d25-01e0-474d-be35-08d816d281f5.json | 132 -
 .../17f49724-6553-4baa-b354-45ffd0f2c844.json | 132 -
 .../3e60d982-d7d5-432b-962e-b7734cc90534.json | 132 -
 .../79a0fdf3-b432-4598-be62-f9eb57fa5a43.json | 132 -
 .../662566e0-2af3-40d6-90de-9b361bcae355.json | 132 -
 .../d81c0035-a0b1-426c-9080-8ccbf745642b.json | 132 -
 .../100bc243-158c-4e5c-918b-1439bf26fee8.json | 132 -
 .../45e32080-1464-40e0-a232-310fdda967eb.json | 132 -
 .../e89b279f-d548-4aa8-b5e5-0bffdd98b840.json | 132 -
 .../777a53f9-891c-4f9e-99a8-bb1988f61f19.json | 132 -
 .../f15846b1-8eaa-411b-88f7-25064161af4e.json | 132 -
 .../e803fc85-fb98-4db8-aab0-a63100dcd5fc.json | 132 -
 .../50620749-5ecf-41eb-a131-611675560e07.json | 132 -
 .../2d40a551-6440-4d71-87e4-639d486c1c5e.json | 132 -
 .../22235942-2e3e-4ef4-b7a0-5800f507571a.json | 132 -
 .../ac06867d-3a34-42f6-9e2e-226cf86748f6.json | 132 -
 .../394f1fc8-dc2c-4ff9-9ad0-7b3a8a8ddeb3.json | 132 -
 .../03e52d4f-78d7-453c-9685-844dd1636904.json | 132 -
 .../3ce136d5-be81-4b8c-a7dc-4e1346935d35.json | 132 -
 .../fb35accf-0c5d-4f72-8d73-ba366a41a76d.json | 132 -
 .../75e5ca5d-cce1-4463-b398-553399ce6833.json | 132 -
 .../c426bae7-b98d-4343-b419-ac8206196a95.json | 132 -
 .../b17de9f2-6f94-49f6-b908-fa983e8f8f9b.json | 132 -
 .../58ba7ca1-8cca-4668-836b-824491d9cf01.json | 132 -
 .../23da100a-13b9-42a7-ba79-234be551d0e4.json | 132 -
 .../2d0c12b9-cff8-4366-a3ce-7772e4c098c9.json | 132 -
 .../4b87eea2-169c-411e-9d15-caf6b7826590.json | 132 -
 .../62a3cce2-4ff5-4dc9-beab-a06001fd82d9.json | 132 -
 .../0e5961e1-af27-4eee-8b9b-c82ee4ab61b1.json | 132 -
 .../b62352d4-e3b0-4b4d-8d68-e2d973d820c1.json | 132 -
 .../7fadc486-767e-45ef-979d-74ecb858cb99.json | 132 -
 .../d0628e6f-a6f3-42eb-b9fc-e880ae8c0688.json | 132 -
 .../0999a066-1151-4445-b130-00d8fe4a516e.json | 132 -
 .../1efc09d8-6a5c-4d48-b76e-2e04ef97b676.json | 132 -
 .../1a59412f-fe78-4ecf-8951-8f2996dd374f.json | 132 -
 .../b5403311-2069-488d-af98-27da14496c15.json | 132 -
 .../6c10c176-b2b6-4216-91c0-1444944612f7.json | 132 -
 .../80ebd92e-d9b6-46ce-b77e-973c3f3f6051.json | 132 -
 .../0418e36f-17ea-46a2-bfeb-91cc0ff719bf.json | 132 -
 .../4f5ba3fc-694a-45b1-ae9d-2c7d33e41519.json | 132 -
 .../8b0d1556-bbd5-49e3-b881-32224bc1aa9a.json | 132 -
 .../524e634f-280c-4f3a-9f1f-bdda19fad740.json | 132 -
 .../cb82e92b-f207-4fbd-9bfe-43184769cdbd.json | 132 -
 .../0b674103-4e55-41f4-accb-b7be73671801.json | 132 -
 .../fa0290e0-723f-4502-90b6-c77007fffc1f.json | 132 -
 .../c3827ecd-d02a-4464-a098-110f4fb54516.json | 132 -
 .../af9700fe-20c0-4b7c-9f3a-c4d78fab7911.json | 132 -
 .../959a4e4d-211c-4e45-94f1-f8f877e0b36f.json | 132 -
 .../96a8b3c0-d6bc-41fe-8967-0d798669aa8e.json | 132 -
 .../ed5d2ca8-d551-493d-8877-348204ef91cc.json | 132 -
 .../04e20a14-8346-4801-8515-189861c857cb.json | 132 -
 .../eec2da56-ba0a-418f-afe1-8a46882b9839.json | 132 -
 .../321cf68b-9220-4ada-89da-061341a20a9d.json | 132 -
 .../86fda025-2345-4a40-9094-223b96b21f13.json | 132 -
 .../3c734233-9868-4ba6-83c0-2b63f2ce8980.json | 132 -
 .../7f5eca48-0ab9-4ef2-85c2-a7f1fe713afe.json | 132 -
 .../f5e0e809-08b8-43dd-a44d-875f365610c3.json | 132 -
 .../8d267135-a7e6-4ec5-ae09-66478804bb66.json | 132 -
 .../4940ed0e-2c1e-4408-9806-49ceed30a69e.json | 132 -
 .../5f6f7b7c-ef6a-4468-aae5-d7dfc25c5659.json | 132 -
 .../5244ee3c-7d65-434a-acfe-cdb277ff5264.json | 132 -
 .../eba4644f-d455-4a23-a16f-8ecb038ffe7f.json | 132 -
 .../fb270319-7010-4946-b60c-409aebe41aaa.json | 132 -
 .../d57bd77a-11cc-497c-b0bb-31c1ffa63dc2.json | 132 -
 .../0220984e-fe8c-4e72-bc3e-92b949ffe769.json | 132 -
 .../16482634-ec03-463a-9deb-2230ee955800.json | 132 -
 .../4c1db32d-96fc-4a66-b083-530a3e75ad6d.json | 132 -
 .../c0c5c846-395a-47ac-9e8e-e598939f317d.json | 132 -
 .../6b3f6b59-a8eb-48c2-acbc-92e8f34b2dd6.json | 132 -
 .../d017e3bf-2abe-4b84-810e-e0eaf973adc3.json | 132 -
 .../62a3ecb8-f6d1-429c-807f-5545b2a5897f.json | 132 -
 .../748557ce-1a49-4b3a-9c38-9007dc04aafb.json | 132 -
 .../95d43d01-a75e-4af4-a2cc-b60f832071d3.json | 132 -
 .../4dc7c889-7839-4047-b48c-33be5b688e72.json | 132 -
 .../751851c8-9a7f-4135-a106-eab4efbd0734.json | 132 -
 .../2930e30c-9f2e-4248-ae3b-ed7ffbd12f8c.json | 132 -
 .../c1acc460-aeb8-4a99-8ca5-376ab60fb74a.json | 132 -
 .../33b8b64f-7da5-45aa-bf80-7145ef704229.json | 132 -
 .../2662d257-49e2-430d-b44f-b0b347c61271.json | 132 -
 .../870b639b-ee7a-4b13-872b-52657539c836.json | 132 -
 .../6ff20678-a335-4fa8-8126-9f96ce247f34.json | 132 -
 .../19c4ea89-896a-4577-a386-c2470eaf743f.json | 132 -
 .../22eb2479-16ff-4a56-b9e4-e8835da7ca0e.json | 132 -
 .../aca3f1fd-9c46-47f6-81c6-dc56a702c1de.json | 132 -
 .../071ca686-5950-4af4-80f2-969b1008e370.json | 132 -
 .../78977c34-33f8-4037-86e0-dfce1d01c3f8.json | 132 -
 .../480e4294-c8d9-4088-9b8c-7a239d57f683.json | 132 -
 .../be9b21e8-90ce-451a-bcaf-2ebc7c72bc34.json | 132 -
 .../b0054dd8-e62c-4d0c-9b18-090851c3a7e2.json | 132 -
 .../985e479b-658a-4548-9b5e-c9c04b8838c1.json | 132 -
 .../d0ef8af4-156d-456d-9e33-b2cdb3f8c04e.json | 132 -
 .../5050c787-2f95-4a17-a4b0-c094860627b5.json | 132 -
 .../bb5c8274-4324-47f2-94c5-d0c831ce0de7.json | 132 -
 .../8113a26a-5941-4f3d-872a-bdde5456ad97.json | 132 -
 .../5b60047b-2e85-4a47-a31f-4c07f4bd2c30.json | 132 -
 .../88d79858-3a35-43eb-8da6-95b80b5deef6.json | 132 -
 .../63266a49-01ea-40f1-83ef-778f391aff2b.json | 132 -
 .../f0da069a-833f-489a-a923-c79542a3a9a6.json | 132 -
 .../205b9da8-d561-41ec-946e-1d2f9a43e437.json | 132 -
 .../2ea4da56-4b95-4222-a4e2-f57c73e0ee4e.json | 132 -
 .../c086f693-cef1-4212-9c17-669b210f4caa.json | 132 -
 .../290995f2-9982-4f29-ac74-dc646905206c.json | 132 -
 .../c60e65e6-d771-4c53-80d0-c1e09aa39377.json | 132 -
 .../fcff202d-3b4f-4ba9-b3f6-1122d8abcac1.json | 132 -
 .../5f0fa37a-e829-402b-b2ab-c68ffa248b6e.json | 132 -
 .../a0b4a345-3530-4da2-8403-87259bbd1405.json | 132 -
 .../3548f0ea-f3ab-4a0e-9c77-5ae62014ed44.json | 132 -
 .../707270e3-334b-4eba-84c0-2795ae53d79a.json | 132 -
 .../c827bee3-a181-42bc-9387-ca132d59c8ba.json | 132 -
 .../d3e8949b-f6f8-459f-891b-f4900ff806cd.json | 132 -
 .../35d5f5e3-74eb-4eea-9f78-b7b8969830a2.json | 132 -
 .../4cf4479a-622a-4bc2-86f2-aa526216f24c.json | 132 -
 .../6ed27890-3e61-4c7d-8c94-a78c0b34ba32.json | 132 -
 .../87b5e360-7867-4edd-b45e-e7bb92a91b69.json | 132 -
 .../d93116b8-28ff-41ea-8273-56f7ae11cf18.json | 132 -
 .../ba8c2c17-64f6-4cdb-b3b9-8977ce1bdbe2.json | 132 -
 .../5e5602cc-b4de-4247-aa6d-940817fc849b.json | 132 -
 .../cc5f27f5-36d8-49bb-9c9d-7879598bfe71.json | 132 -
 .../aec03bd9-808a-4c3f-bbde-40bcac5775fb.json | 132 -
 .../b4ae6f0b-8a6b-4c60-8eb2-3e202877bcf5.json | 132 -
 .../c68deb4d-73a8-40ab-b4e5-1773b7ec4ed8.json | 132 -
 .../a93c5674-599b-429c-a322-3c6bc7248f45.json | 132 -
 .../5e6374a6-56bd-4bd9-b04b-30ec9cf234bc.json | 132 -
 .../c3d2fc86-a5c4-4e92-bcf9-26096ca32ad4.json | 132 -
 .../1b49cb06-3ee1-4945-aaed-12c868d9e45e.json | 132 -
 .../65853bb5-ff3e-4880-8c32-ce9aabcadd7b.json | 132 -
 .../7fecc176-debf-4bf7-b3f3-479d05678a1e.json | 132 -
 .../3c965626-a264-40db-93e1-cd7659d0662e.json | 132 -
 .../50fa6f0c-d689-4380-b619-253209b5badc.json | 132 -
 .../adb25c88-6113-4307-bbf0-d377f757bc18.json | 132 -
 .../b9ac5e03-c878-4e46-a89c-1906f3b91dce.json | 132 -
 .../d6a6badf-4472-44b5-af9e-4282e4406a8e.json | 132 -
 .../92e62d3a-3091-4538-b6da-ba705e11687a.json | 132 -
 .../04f5fdc6-f1cd-4b2d-947a-86fee67b3b62.json | 132 -
 .../5013ccfc-6bc5-4862-898c-1ca781f92572.json | 132 -
 .../38fff98c-72b1-453c-a2cf-cf077dd19d10.json | 132 -
 .../42911928-ef64-474b-828a-02ce3383773e.json | 132 -
 .../7989d7d3-c5e9-43c6-80a1-6de51533f9bf.json | 132 -
 .../5b9acd52-7eb6-4099-98be-ecd6cae07835.json | 132 -
 .../666bef5a-2d62-4743-bff1-07365716ab19.json | 132 -
 .../85de411c-2308-4824-bd6e-3327eeb6fe3e.json | 132 -
 .../df28c4c2-d6a4-4ab0-a1ac-faf00a93de99.json | 132 -
 .../6fb37ad0-b41b-4ad7-91a2-79bbb835d445.json | 132 -
 .../c41df02e-5aff-4de6-a1c4-d45b5585e29d.json | 132 -
 .../aa587b4a-9c19-4231-ba72-9b66446460f9.json | 132 -
 .../be14e75e-4fb1-41aa-b168-1ec23eb305e0.json | 132 -
 .../73be4a2b-28c9-4208-8107-3734fea25008.json | 132 -
 .../0bf2fa4e-3bcb-46ff-a068-f4c796123c6d.json | 132 -
 .../9f8fc05a-8658-4ed3-994a-965e6882d242.json | 132 -
 .../ced11f6e-490d-42e9-8f3e-00e22cfc2910.json | 132 -
 .../70ba788b-fe8c-4667-a859-0fb122de22b9.json | 132 -
 .../e93f2d5f-7ffc-44b8-b2dc-d07b73de44ab.json | 132 -
 .../15cacfe0-bdfb-4b87-a813-bfa70ff71984.json | 132 -
 .../cff00e2a-41e3-40d2-aab3-4bb3bd7d0d0e.json | 132 -
 .../e1eab0cf-2c6d-44b2-8aaf-a75347741529.json | 132 -
 .../ed221db8-cf81-4257-8785-db9381eec5b7.json | 132 -
 .../b314468b-401a-4318-b022-c966bf3366aa.json | 132 -
 .../a0dbb2eb-66c7-48a3-a85c-725b49141edf.json | 132 -
 .../812a36ec-4928-40a9-9aa8-ee39d7bb02f5.json | 132 -
 .../77af2424-0a23-49f3-97b0-316d04a33547.json | 132 -
 .../6f422676-2d7e-40ed-a5e3-4afc25564cfc.json | 132 -
 .../43923dd6-838a-4259-a938-7766dfd9c07e.json | 132 -
 .../dba94a49-02b0-4e92-bd6c-c6bfc9be3cfb.json | 132 -
 .../16a782dc-0795-4281-aad6-4f664a0940ab.json | 132 -
 .../5d24d4ad-9f37-4634-ba23-74fbc74fd298.json | 132 -
 .../043cd315-fcb7-4871-ae79-dee3fdefaef0.json | 132 -
 .../3c377d7e-14bc-4c82-9ada-7560552abbe4.json | 132 -
 .../43bb650b-8bb7-41b4-866a-cb2dad1499d6.json | 132 -
 .../bdf8f907-37ca-41ca-9a4e-f4dd446f895f.json | 132 -
 .../14a1872c-7afd-4cd4-ad87-853e4fc0847e.json | 132 -
 .../887e4ca9-ed48-4b33-b933-f8534a8d0377.json | 132 -
 .../c585488d-4043-482f-b1fa-4a61e96f7f0f.json | 132 -
 .../d64541f6-19ef-4f04-a991-93efec6fe24f.json | 132 -
 .../1c13e194-8bee-4456-a249-f71e7e34b0eb.json | 132 -
 .../1d3db737-20e7-4da1-a311-e60de0b41c93.json | 132 -
 .../7b73d50e-358b-4961-8b58-63765ce5a82a.json | 132 -
 .../81dfd69c-cf01-4114-8157-fd09af6f490c.json | 132 -
 .../f38240ab-35e4-431e-b4d5-b1b0e1d57c5f.json | 132 -
 .../01863b4f-9550-49c3-ad83-74c0bb535eb9.json | 132 -
 .../edd25437-38bc-443c-9da3-bc041270447e.json | 132 -
 .../31836d43-5022-488f-ba9e-379195809069.json | 132 -
 .../2a5a3ed6-7137-49e2-a141-497ceba88757.json | 132 -
 .../0b1c6aa6-b94e-4400-9b0d-c39aa1bcd808.json | 132 -
 .../69423132-adc9-4b97-b799-15f37de1d7e5.json | 132 -
 .../54d5bf0f-7c4c-40b1-bca6-5484ef8e2a04.json | 132 -
 .../cfe8f9c7-e9bf-4a17-afa0-d5b8f46d24e7.json | 132 -
 .../7fbc0323-1c78-46b6-a08a-6e5870c64e53.json | 132 -
 .../1c769f0d-b99d-4b82-a529-f5264f7b3349.json | 132 -
 .../a9365685-e299-48e2-931a-c63e123a9e00.json | 132 -
 .../bdf2d61a-daa1-4b1f-9245-43ff263540fb.json | 132 -
 .../f0b4eef9-dab2-48e2-87f8-ad83ec33ec23.json | 132 -
 .../29e10491-8c34-4b7a-a0bd-77f6ca0dc54c.json | 132 -
 .../c588d86a-80c4-46d1-93e0-b7fa8491f3b3.json | 132 -
 .../0b11eb9a-61c8-4af1-8335-24bef2597e5d.json | 132 -
 .../7d31e5fd-700a-42a8-bea8-8989e8c52603.json | 132 -
 .../f993880a-3c7c-4af9-a3ce-3c27207b9a3c.json | 132 -
 .../2fae7e4a-8c28-4be8-9391-ca79077e32c2.json | 132 -
 .../436e651e-6f04-44ff-ab3d-db8ed0d639bd.json | 132 -
 .../9fbccac2-c840-494e-a24d-a6f0c9a07b88.json | 132 -
 .../a4ee6a33-df51-4a4e-a13d-45488a094fd7.json | 132 -
 .../a3923f10-e64c-4556-9616-4fe7072eff60.json | 132 -
 .../ca15d972-9075-42df-884b-5d069f6ff425.json | 132 -
 .../905909a5-abef-46bf-9392-c97873e229df.json | 132 -
 .../95bd05cf-8f59-409d-a99e-d249bad6c561.json | 132 -
 .../76b12246-33f6-4992-a0ab-38704dcf6345.json | 132 -
 .../e4415806-0ec0-465a-b28f-9c8741436fb4.json | 132 -
 .../98e62ab5-d35a-42dd-904b-bed9c50f3745.json | 132 -
 .../8fb3596e-224e-492b-bdb6-a95a16656eb0.json | 132 -
 .../154203c4-d86e-4c36-806b-c45c5cc568ce.json | 132 -
 .../e42c01f7-2869-4103-bbfd-81aa5a15c140.json | 132 -
 .../323d2f94-5e04-4627-9f74-129217f53eea.json | 132 -
 .../6bcc284b-8973-47d5-b5b1-1abb7a3242ee.json | 132 -
 .../691cace3-5316-4f5b-8693-67efb24a0a06.json | 132 -
 .../d387b3dc-9e76-44a6-9a9f-132a4fd762b4.json | 132 -
 .../f6f515d3-f5e9-4362-be51-bb8fc05527e6.json | 132 -
 .../2e1e215f-b622-439f-a13f-531441e25ae3.json | 132 -
 .../d50d66a9-a0c4-4b82-922c-9d012f1b50a1.json | 132 -
 .../ea7292a8-3f07-47be-b8ae-7d352ed1ecb6.json | 132 -
 .../4eedd6d4-279f-4660-8d71-708a27bb53e0.json | 132 -
 .../9c0f67d1-f95d-4ca0-a234-2e09ac788f55.json | 132 -
 .../e5c0fbc9-f424-4b04-839a-8335adaf89cc.json | 132 -
 .../d91107fa-eb8d-4d01-90a2-fc9831f337b2.json | 132 -
 .../926999bf-1ba6-4321-82b2-fcced4336739.json | 132 -
 .../57d481bf-0db9-4208-afda-dcd20df13964.json | 132 -
 .../eb417e47-fe63-4dc5-b3e5-28782f3782da.json | 132 -
 .../b0f516dd-7185-4906-87a5-3c6f019894d0.json | 132 -
 .../1e562944-a205-4ef7-aff1-3776595d131c.json | 132 -
 .../6ccaf08d-1b0a-4ca9-941e-a71e2dce5cb4.json | 132 -
 .../2064938d-9f05-4740-a4d4-2a2da0eac21d.json | 132 -
 .../43240184-8245-43ff-a971-678523918fe0.json | 132 -
 .../b3b854b6-700c-4297-b335-6acc3c385f84.json | 132 -
 .../a9d79c6a-f99a-4b60-8e37-ee2cdfe75f30.json | 132 -
 .../88e1dd78-d3bc-401b-88e9-d963bac181db.json | 132 -
 .../a41bd607-f319-4063-a6e4-813f43e40568.json | 132 -
 .../8629aef1-c673-4b17-a9cc-b361a53bdaa7.json | 132 -
 .../532c927a-dc0c-4e65-8ab0-7b9ddd889d89.json | 132 -
 .../843f9927-9865-4066-9cc0-f0522d3b914f.json | 132 -
 .../eeecb2cb-e286-443f-84aa-d825702a4ad8.json | 132 -
 .../36ab4f5a-b2cf-4d01-8283-9eaf2c90928f.json | 132 -
 .../c4e810f1-ffb3-4ece-b445-64e339761530.json | 132 -
 .../025725b6-0034-48c0-a720-5fc210e5e24b.json | 132 -
 .../7bdd8928-c336-494e-9c87-de9ecc2749b8.json | 132 -
 .../ff7369dc-3ff2-424b-80b0-e06a141b54f3.json | 132 -
 .../a6dc7253-75fd-4897-be85-8ac89fc11f8e.json | 132 -
 .../296ceacc-542a-4000-bf9b-ae59b33a53ce.json | 132 -
 .../13870577-7579-48b4-9c92-202318ca6ecc.json | 132 -
 .../6ebd2806-2623-4773-93bd-1036ff01cb8c.json | 132 -
 .../99d6a44b-d556-4674-8ade-a5b30cf99255.json | 132 -
 .../605118a3-316a-46b5-9719-f596e361a2a8.json | 132 -
 .../271d2829-fbd4-438e-9f09-59539af68c8b.json | 132 -
 .../107bc549-75c1-4272-b567-f8ab9f6cd675.json | 132 -
 .../dfb451e9-c1c1-45a1-8082-155763366129.json | 132 -
 .../b2d80977-d079-42ec-b057-5aac530b9d70.json | 132 -
 .../16b33b80-3b4b-4edb-b89f-3d93dca8969c.json | 132 -
 .../63c94e0a-4572-4b8a-bfe0-7f88bb847d7f.json | 132 -
 .../538f2b43-328c-456d-8a40-ff2b37924453.json | 132 -
 .../fb7a68e6-716e-48c6-96c0-d227735f9a7c.json | 132 -
 .../3593d4b8-5602-4cca-935f-a76e342f060a.json | 132 -
 .../72d503fc-b221-498e-811a-a806769175d6.json | 132 -
 .../ad7d9698-d9e6-4f2d-9767-987835626c8c.json | 132 -
 .../98899942-fcf0-41de-8587-44d7429bea47.json | 132 -
 .../bb51eb59-88f6-49c2-814a-11b2c80313d0.json | 132 -
 .../d8563f36-e299-4186-a5dc-9dae51824e1f.json | 132 -
 .../43bc0528-7bc5-4eac-8848-c9995079450f.json | 132 -
 .../ce19893b-a7e1-4f8e-96f2-eb9cee2afeac.json | 132 -
 .../24629e14-d197-4a5b-adff-7840af652f22.json | 132 -
 .../9c3ea35c-2cf7-4c31-8b83-c69df3cd9448.json | 132 -
 .../46548403-6eb5-4f7a-874c-1327420f4cab.json | 132 -
 .../0bd9c061-b7ee-4bc2-9deb-ea7eea012c49.json | 132 -
 .../aa2fe858-111c-45e8-b0d4-0048d7fc7ef7.json | 132 -
 .../ad03cae6-b126-4157-a225-9576e4d651d0.json | 132 -
 .../0d57b65d-3dd4-4185-b8cf-531105e94b5e.json | 132 -
 .../f8882044-6e71-4788-b2ee-f51f85e67ecc.json | 132 -
 .../3c8f96c5-af91-4f41-a0b4-6e1b7d55d8ad.json | 132 -
 .../e26743b9-4caf-46f8-bd5a-7e4445c850b1.json | 132 -
 .../febd4016-3a30-4b26-93e5-f7b556781b9b.json | 132 -
 .../ae82125e-94ac-48ca-8240-807e4b7ef9a0.json | 132 -
 .../5321fa0b-b010-4e1d-9f20-a97b56f4f937.json | 132 -
 .../d25a4602-ea50-4a53-952c-112ba250123b.json | 132 -
 .../232e3fc4-5cd2-4515-9e15-acd7d56bc34d.json | 132 -
 .../975f54fe-a581-4ce1-b0c1-7becb7605f09.json | 132 -
 .../92ae4461-48bc-47fe-a3ad-ea4c3452d395.json | 132 -
 .../638e1cc0-9baf-4555-a278-4b21c46af86f.json | 132 -
 .../cef4161a-4e1c-4a92-bca8-b07f957a13b1.json | 132 -
 .../715b556b-2bc0-4864-b4b1-b7413a5d45bc.json | 132 -
 .../7552ad5c-5d1f-478b-a931-036083b2954e.json | 132 -
 .../7bb3ae9f-9bb3-4bf2-9d97-d7f4f30697ac.json | 132 -
 .../821d67e5-da8d-4383-8825-3bfa72a91fc9.json | 132 -
 .../c5bddcba-4a40-4fbb-93e8-aebd06a70a66.json | 132 -
 .../dc35237c-606d-4609-927a-566bea767312.json | 132 -
 .../3924d1af-e167-4186-a34b-d9b4b8c26d59.json | 132 -
 .../f733c4cc-90fc-4b31-bed3-c57dba6d4b6a.json | 132 -
 .../08f933a0-b096-4271-890e-0df7e20d1d20.json | 132 -
 .../8434e448-ed77-45f2-9c31-39128912f842.json | 132 -
 .../d801037b-1eb0-4058-9096-429e5237e015.json | 132 -
 .../e0c46f18-598e-402f-8955-68e71fab67cd.json | 132 -
 .../4b987cb5-cf7c-4866-8cf0-9926f78c2de9.json | 132 -
 .../ec658058-1075-4918-9dc9-fc79d0dcf897.json | 132 -
 .../b68baa86-3e1a-4888-98ba-2ecede79b4a7.json | 132 -
 .../0b11c8ab-2cfa-425d-9d81-d999f94401db.json | 132 -
 .../a3e48db8-3679-4f19-853d-82a73ef49400.json | 132 -
 .../7dbf35b2-80c1-4181-80f9-850ea51cead2.json | 132 -
 .../231f47db-1662-4313-9ff4-f32883f5615c.json | 132 -
 .../c79df898-14c6-4f00-9f65-0d01cd34ed61.json | 132 -
 .../2c52917f-c396-410d-bc78-c93c433797fc.json | 132 -
 .../0f1d2925-4e1c-495b-94be-f3515fbd53d7.json | 132 -
 .../5cbb1972-9895-4689-9f6f-7e0037829a78.json | 132 -
 .../6bc42e37-1f31-47cb-97e4-9d0b28b53691.json | 132 -
 .../a1573b95-59e6-4ae0-bc12-6ef6fee90b76.json | 132 -
 .../78c61b39-3c76-4af9-8d5e-fcd67d6c8779.json | 132 -
 .../e4c06400-da86-4448-b421-23476f50bdb3.json | 132 -
 .../48f4c2a7-e819-4789-92ea-e02c5e92d3e4.json | 132 -
 .../cd9cbbac-f1ca-4193-88cc-e5968cc1bb62.json | 132 -
 .../ab3685ab-1795-4a0e-8ee4-4f509616d1b8.json | 132 -
 .../9018f443-a63f-4e07-b10b-272f66d1eb0d.json | 132 -
 .../548d1536-b941-43a9-a60b-ae5448b70933.json | 132 -
 .../99853109-17d9-46fa-a502-e4c977c1fb8f.json | 132 -
 .../e171a0a0-f46d-404f-84e8-539155284e17.json | 132 -
 .../eadd93e5-5770-4d4a-a1b2-6e732a82ce34.json | 132 -
 .../151cb8c4-0a7d-4886-80ea-560902e1f932.json | 132 -
 .../1acb97c4-a9d2-4ec8-9486-77eb6857646c.json | 132 -
 .../1d803ac5-3ca6-4cb0-bcd1-779eaea1562d.json | 132 -
 .../81562e50-23c5-4ef1-b98c-b40625f3b8c6.json | 132 -
 .../95fa292a-ee64-4844-9646-ce3cc7f730d2.json | 132 -
 .../4d14c584-b5a1-41cd-9605-78088dfebd7f.json | 132 -
 .../1415d3d9-d7f8-48ef-8a2f-aa675c4c14db.json | 132 -
 .../4b0ab369-e72f-4229-b449-3a21ee9d2c95.json | 132 -
 .../478b6c1f-3329-4c9b-9d90-59b8b551c1af.json | 132 -
 .../212f8dd2-3c61-45bd-a3de-2326334feb73.json | 132 -
 .../9251282e-f72f-406e-a2cf-e7063516f624.json | 132 -
 .../91a3c739-7e16-4d21-8879-bb2fd4d4c6ad.json | 132 -
 .../aaa78d8f-6050-4b5d-bb67-da6c9d1ee065.json | 132 -
 .../1f0430fe-24ff-4ef6-8577-ee5bfa74f18b.json | 132 -
 .../f374772b-2685-41e2-a455-9002e48e3739.json | 132 -
 .../6db801f8-5253-47c0-b87e-6779bff42f6b.json | 132 -
 .../0d704671-c0b6-4296-85b5-eaf972d6be6a.json | 132 -
 .../7e31545f-0865-4843-914b-a71f8a84314f.json | 132 -
 .../431c7130-5a19-4a71-8a92-fea9726769ac.json | 132 -
 .../ca850c4a-14d0-4145-9977-0d33e6e3e362.json | 132 -
 .../7389caa3-6d8f-43e3-b3f2-d9320e56f621.json | 132 -
 .../1e822b0f-0d80-4613-983b-ebd2e6fbfcd6.json | 132 -
 .../1206f592-e6f7-4e7d-83cd-cbe82b37ec58.json | 132 -
 .../e4085c6a-bc16-4328-a724-4b9838b55faa.json | 132 -
 .../b929b955-1fbb-43d0-add1-4d58fdc4097c.json | 132 -
 .../df723a0f-9a32-42f3-9421-780159f7d821.json | 132 -
 .../c1046d2c-0b5b-4ab7-b173-8d5b5ecbc07d.json | 132 -
 .../60c02070-7554-4764-8a02-841ca75a0d5c.json | 132 -
 .../d243f226-149b-4824-837e-e80ab68bae9d.json | 132 -
 .../4f9361d0-2ad9-44da-a1d9-876d43451ae6.json | 132 -
 .../6c6e9ebc-f83d-48d5-b69f-be43d4167a0e.json | 132 -
 .../7cd2c0da-15b8-4ad6-8cad-feb68631c079.json | 132 -
 .../36b84cf2-d221-4e9a-b728-37dc2bf7e1d6.json | 132 -
 .../1fd0d1db-1d75-4b10-bae8-33023c2c7466.json | 132 -
 .../c6c02512-6c91-4818-a084-c48915fd83de.json | 132 -
 .../326affa2-9ea4-4fc9-b60f-d2abeb7493c3.json | 132 -
 .../b3a190d1-5b86-4439-a21e-1f118239db82.json | 132 -
 .../b37a7db5-b26f-4a82-b27c-6c3a2ba72fda.json | 132 -
 .../05a59445-b816-4982-9b1a-1c2394ffbaa9.json | 132 -
 .../ff952579-e92d-4af8-9497-f49fed5efba0.json | 132 -
 .../b541ede0-6de9-4557-8280-43567fd3dd96.json | 132 -
 .../8514f601-0bb2-4639-90cc-29e96088e7de.json | 132 -
 .../57e6d0cf-943a-4b83-a1f4-4f03b5066523.json | 132 -
 .../ec205127-21c0-4edf-bb3a-ec8ccac4fcdb.json | 132 -
 .../14b260e6-4300-43ec-b7af-587a2f5b03fb.json | 132 -
 .../53de1fc9-7097-4103-b731-588a7bf39f80.json | 132 -
 .../1a1031c5-3ec2-4d12-93eb-e0a3b0448ed4.json | 132 -
 .../51b62d59-f39c-49ca-af0a-73df6440e29d.json | 132 -
 .../622a0ae1-0eb5-49f0-bc44-d396c7233e27.json | 132 -
 .../71291a41-283e-42ca-b192-7b759e3c3712.json | 132 -
.../7e504fef-b304-4c1a-856d-06e56a8869d7.json | 132 - .../f8258f5e-8826-4fe1-b9d3-61708e79d4ab.json | 132 - .../099ce031-1e11-4a07-bac1-03bef9b915d6.json | 132 - .../75ff25fd-e5f7-4380-b192-cbc8a8ee95aa.json | 132 - .../cbc43c7a-d8ac-4b03-a383-703f7fa51757.json | 132 - .../72d7f252-1bff-40ad-9ec8-1ac2a2e02a8e.json | 132 - .../5eb10878-11e6-43ad-9bb5-658a3495129c.json | 132 - .../23b29cd4-cfd0-49f1-8959-c3aa8be9722f.json | 132 - .../03db2532-f8e0-41e9-ac0c-ff2913f4b12a.json | 132 - .../273f0d50-aa4e-4469-8360-2ce0a2e1a850.json | 132 - .../79a48e79-d59b-4f86-a8f4-3af174a9ee0b.json | 132 - .../9da9a0e6-257a-41f6-b3a3-e3279a4924db.json | 132 - .../dfed058c-48b2-4e1e-9a29-624771e3e9dd.json | 132 - .../bcb53a8a-1670-400c-aab6-bd8ed2ebcdf4.json | 132 - .../8438a108-0d5d-48b6-b73a-981d13329daa.json | 132 - .../88616292-1e38-4481-af30-6b60e28fb097.json | 132 - .../44094907-0b09-4706-a117-116a7e10a6e5.json | 132 - .../d19e8078-87e9-4760-9b91-6b5f478820e1.json | 132 - .../896464f1-01bc-4370-8d90-3368323b2908.json | 132 - .../9889f0b9-9051-485c-bd44-32b1e56b865c.json | 132 - .../6563ce79-6df4-4c78-89e2-064f1250d898.json | 132 - .../b1778755-e6e6-47e2-925d-44d786c4ff62.json | 132 - .../3ae923b8-e9f4-472e-8d5e-54fa5f42ce01.json | 132 - .../40831e23-0a9e-4bdc-a365-9399b6b82ff9.json | 132 - .../4a60fa82-34dc-4b0c-9102-65adac5039e4.json | 132 - .../75ff2c43-dd19-48ae-9ba3-f99cdbadda1c.json | 132 - .../d7962833-660a-4b9b-9836-8a2f3251f38e.json | 132 - .../ad8ecabf-a868-496e-892b-582efb54fa6a.json | 132 - .../49f25d3d-80c9-4723-8fa9-1501d44d70aa.json | 132 - .../70ea520c-3e0c-4412-9dbe-40a00801335c.json | 132 - .../8e7f8bad-812b-4f6c-8dea-1cf44584c300.json | 132 - .../3b39a8f0-c5ba-4f74-9d27-bf5b389e038c.json | 132 - .../702a14d5-a7fd-4926-ab26-e4c3b7f5eda7.json | 132 - .../20e5d087-7b20-4a39-81da-7334354b61f0.json | 132 - .../4c5a769c-0472-402c-8e97-d24e5b302bac.json | 132 - .../96166735-ed03-4931-81c9-d3daed1913d9.json | 132 - .../06d9b1e3-d054-4fa5-bf1f-9d6149e5111c.json | 132 - .../776fd8d8-9846-4359-97d4-2340425d1315.json | 132 - .../197ae1c5-c9b1-4912-91a3-8ccacddc1be6.json | 132 - .../1fffd3d9-1c6b-4965-84e6-980bb0a13af3.json | 132 - .../57e8aaf0-f10b-4024-9f93-7b7f13f3ab10.json | 132 - .../304d5bee-df2d-40fc-b4a0-e3d99178f4bd.json | 132 - .../6126d30d-e2dd-4b8b-9cb3-acdc76084bbb.json | 132 - .../fc7284d9-a73f-4562-a781-5cb87247183f.json | 132 - .../26ab447c-a850-4197-983a-a0dca4532029.json | 132 - .../ee9e2131-aa99-49e1-9814-f0664614354b.json | 132 - .../23c472f7-f060-4a69-8f72-12490675825a.json | 132 - .../04172bef-c06b-4c08-b2af-9e1fe4d97664.json | 132 - .../3436355a-d2fe-411f-a764-4cb8284deb4c.json | 132 - .../265655c0-2ead-4dd7-8c7e-4bee69d51bce.json | 132 - .../645cae82-9e7b-4d1b-b944-e3783089c1c1.json | 132 - .../ab658117-7c6b-428f-8f60-bf88a1d8a5bc.json | 132 - .../03c4b5ce-3b22-4d9f-bf60-b626b52a114b.json | 132 - .../ce7e3a31-c65b-4521-b685-fcbd067c75d9.json | 132 - .../adb53e2c-5dee-4840-8eae-e0186c6e103f.json | 132 - .../ba89563d-f53a-4bf0-91e1-92ac950523d8.json | 132 - .../3fc0ad8d-4bb2-401a-9baf-b94b39b7e1aa.json | 132 - .../ed816bcb-bbe9-48ae-a6ac-3603779a985f.json | 132 - .../f347ed24-066a-4cba-8478-f03628cb2b5b.json | 132 - .../ffddfea0-d17e-44e7-8931-a9601e9cb26b.json | 132 - .../ec351fa1-78c2-48c6-83f0-7c2a9b2f0731.json | 132 - .../a0038c34-130b-49dc-a93f-94706a3dad50.json | 132 - .../cbd5ea42-1e5b-4984-bdcf-e60fbfb9d692.json | 132 - .../b902e2b2-a0b3-4467-b076-b98717c40d74.json | 132 - .../4c749665-59ff-49df-a193-0262f66e6003.json | 132 - 
.../c99899c6-95e1-4dea-ac12-f8df49728a3b.json | 132 - .../13deca9f-073e-444b-bf79-35e816f7c312.json | 132 - .../c8adc0a5-f4bf-4f88-984c-aba506eae6a9.json | 132 - .../b146daaf-ce1f-4520-bc19-21ce8679b220.json | 132 - .../45e1d037-1ed0-472c-a311-c651fde270fc.json | 132 - .../3f4ce54a-01f3-4c23-a4ba-22d47e0344dc.json | 132 - .../470d52be-9dbd-4714-b004-f65cc82d245f.json | 132 - .../c836fd05-1969-439c-91e1-fd0cab816f6c.json | 132 - .../14774c6b-eb03-4abc-92df-1e7a196ca8a4.json | 132 - .../5293ae0c-8022-44d4-b2f5-4f5390dff93e.json | 132 - .../9020f91f-a8f0-447d-af68-247aa81a25c6.json | 132 - .../0cd6837a-8c3f-4529-9ea0-8755e1725467.json | 132 - .../7cb17011-cf77-4e86-b67f-84e6ff4b8086.json | 132 - .../086831f9-c677-428b-a997-4da58733633c.json | 132 - .../d71893b8-b82c-490b-a700-b579d64e0610.json | 132 - .../9893689f-c27d-4148-a27f-cd07b07e98b7.json | 132 - .../90f2df23-a9ec-44be-ade5-89b59cb7368a.json | 132 - .../afd545da-390a-478a-b0f5-ea819f088f27.json | 132 - .../ce776f68-856f-4aee-b7e4-e55d15e8d714.json | 132 - .../9b015729-524c-44f3-9c2c-c42981d7a61e.json | 132 - .../56a54ffc-4692-496c-95df-8e4ad19d4d95.json | 132 - .../4b105969-2ce5-4c62-89ef-efd392c2ca89.json | 132 - .../31af79b1-48c1-4399-9d16-8582c92996ee.json | 132 - .../59a67f29-cb7d-497c-b7bb-1764a665ae33.json | 132 - .../fe57367c-74b7-483e-af54-4f404cbea75b.json | 132 - .../fda2277b-1513-416e-b586-ed05920a0bb4.json | 132 - .../b3dde216-f80a-4664-aadc-b5f5dd3e5895.json | 132 - .../07ed6241-fd1a-46eb-91fd-92a4a8f6bd15.json | 132 - .../ba76c356-cd6a-4636-8ab1-18bb9df69881.json | 132 - .../c6ae54a1-2821-48d1-b689-bbb85aaa70a6.json | 132 - .../6f296f0e-80ca-49b7-94e7-cb45b795c715.json | 132 - .../b5509e11-820a-4ad4-8c6a-0294762502a8.json | 132 - .../90d73665-8d83-4e74-ab7d-29b1d3b6181b.json | 132 - .../72387647-cbac-4b72-9c22-db7029a39457.json | 132 - .../6219ec01-4b6a-4acd-aee1-96c3e8e48643.json | 132 - .../5c323d7c-25cd-4718-8a1f-54d986cadaf2.json | 132 - .../adfab21a-941b-4efc-8b63-fdfb3074ba9b.json | 132 - .../350d00a4-7501-4130-a069-323530bc9729.json | 132 - .../ea809d28-178e-4a0b-ab5a-34739077c5ff.json | 132 - .../243d5ccd-58f3-4da5-8718-553f3f456490.json | 132 - .../a45537a7-76a6-4855-b83b-abe965f13460.json | 132 - .../9be911b6-b9f4-47b1-849d-62eb20c9e944.json | 132 - .../33d7d5f0-cbee-4a26-b5e8-48bdd12492cf.json | 132 - .../4355fbdd-ac72-4f26-8e07-b7e8d774d238.json | 132 - .../4bffc633-e20c-4874-b7db-d1b7dabb8070.json | 132 - .../2d5c844d-d950-4254-bac2-0a986659c541.json | 132 - .../f6e74b3c-9ee4-40c3-bf92-35d965503a04.json | 132 - .../8f1d2600-7347-48b8-9759-11570598459d.json | 132 - .../cd653bfd-2c06-4224-aeeb-bf591995a69e.json | 132 - .../cdf1fcc7-429d-44bd-b76c-d26ee743f6fe.json | 132 - .../4828bd36-5453-4383-8985-08d04a7ebecd.json | 132 - .../4c2baa59-c2f1-4779-9d21-1f69c0821968.json | 132 - .../555c1079-c4d0-4b9e-9d2d-769e7ba32429.json | 132 - .../58a4a1c6-0ee4-4524-9ca1-b40870f1d600.json | 132 - .../eea2a38a-4f1b-48d0-894c-09974894f264.json | 132 - .../3d8063ab-0ad5-43e4-83ff-90b46dee766f.json | 132 - .../da5e0284-7c44-42d4-a110-a23880de277f.json | 132 - .../bef017bb-47b1-48e4-93c4-3b222a16af7a.json | 132 - .../401c83b0-b7d2-4987-9e46-f127fdbb595f.json | 132 - .../c6fde59b-73ed-4179-a907-076be068b262.json | 132 - .../90997fea-6c67-493e-bd8e-5327cfb33ea4.json | 132 - .../08957d63-7462-44ff-9dd8-060a5801a31b.json | 132 - .../a434f569-e7d6-4464-afa8-6104be43fa06.json | 132 - .../e32ed251-e817-409f-b4c3-8f168f1ff822.json | 132 - .../1d9a65a3-d2bb-48a7-8a00-8e4a79c36db2.json | 132 - 
.../608398da-ae2a-4be2-aaf9-6ec8899aa63d.json | 132 - .../80e04641-be7d-4351-a4f6-1318981ef834.json | 132 - .../e74222c6-636c-4075-8d4d-30c73fa70fda.json | 132 - .../aed80361-9304-44a0-934a-52976d7f1bf3.json | 132 - .../709bd280-b03e-4908-808f-34566bc968f4.json | 132 - .../66c495b3-4b09-42ad-b742-4d753c3bde7a.json | 132 - .../e24f7be6-3051-4990-8b93-121aec5402eb.json | 132 - .../0321571b-4246-4490-bd6c-7b106eb8e15a.json | 132 - .../54dbf947-ab18-40dd-9cd7-a496289b2e72.json | 132 - .../d841e204-ed6a-439d-8408-d5cfb3b38dae.json | 132 - .../96b57891-83e3-4948-ad48-64a2a370e166.json | 132 - .../30301818-6dad-45f9-acfb-a68ccc7c0609.json | 132 - .../50743107-30de-4c5d-bf83-cc003af8a5db.json | 132 - .../625ee1b3-e0a1-4a86-83a4-6e66b380f864.json | 132 - .../89fda762-1989-4850-837c-f79ef538c58c.json | 132 - .../1de1f906-0e36-4f79-b159-16ef8ee33ab3.json | 132 - .../d8588222-9e4b-47c1-9f86-92f47c9c8e38.json | 132 - .../15e6e6e6-39fa-424f-ba12-5f209cd4b2cc.json | 132 - .../81225b85-1523-49c1-b770-897112d2e6ae.json | 132 - .../254deaf7-a253-4d41-a10d-1143f86b288c.json | 132 - .../ba0b66f5-724a-4a6b-ac20-a36d530a8b4b.json | 132 - .../eed0b3b4-e277-49ee-aed5-f3599b2d5653.json | 132 - .../96a21b6e-ed47-40fb-85cd-15924330e60d.json | 132 - .../f41f5471-6384-4510-85d2-41f236082583.json | 132 - .../2728eccc-525f-4350-901b-dbc352c78014.json | 132 - .../3e7ae935-46c3-427c-8713-41c659c1828a.json | 132 - .../66782676-c942-4aff-b754-b96cd96cf1f9.json | 132 - .../941a9e27-2ac4-4dab-a6d0-cb9319c79a27.json | 132 - .../caf93f75-530e-4f4d-9cc0-2cf9b0a7f2ff.json | 132 - .../d3ca0458-ee97-4a4c-a6a9-066880ffefb5.json | 132 - .../615bf89b-9357-46f4-82ed-f49b0021da01.json | 132 - .../06398630-23ad-4000-8ea2-fcca230568d7.json | 132 - .../bdfa30f8-da0f-418f-adaf-caafda4c81a5.json | 132 - .../bd5e550c-5355-4e01-bafc-2ca89899253a.json | 132 - .../f842ad5b-24f0-419b-9d65-5a6ff1f5e04b.json | 132 - .../3a09590f-28f3-4161-8a93-d42cec62aa90.json | 132 - .../0f6b76ca-c4b8-40b2-a3af-2ea1c3650933.json | 132 - .../f276ad54-4e3b-4718-ae1f-0479565e4565.json | 132 - .../dec20396-6555-4773-bf02-2cd1fcedda89.json | 132 - .../eebc33e1-0016-4adf-815a-72653a34c01b.json | 132 - .../803c3898-c1a6-4832-ac3a-a86139489810.json | 132 - .../bfaa3d3e-66fd-4477-85af-4b83f13ff05b.json | 132 - .../99debdd2-1dea-4eb6-be5c-c144656cfe20.json | 132 - .../ad67bb88-7f74-4eb4-b771-0b3b60be4416.json | 132 - .../af2f579d-1e8a-47d8-8e44-a599bee83e37.json | 132 - .../763c840e-ea73-453e-8e54-5f4fd6fda9cd.json | 132 - .../4fb40ac4-a637-4b9a-b69d-ba551c0f0938.json | 132 - .../ffc4ef41-4a28-4816-be54-8ffd8e153073.json | 132 - .../f75fe902-f1c7-4e6c-87d6-128688db8d94.json | 132 - .../dbd3098b-4532-441b-a81c-072c52579be6.json | 132 - .../438e4aa3-5e02-446e-bd3a-07ef724d24ff.json | 132 - .../027fdc55-61eb-416c-b6ad-4408912d151b.json | 132 - .../37a4895d-def5-494d-9b62-d8c97ba9350b.json | 132 - .../0d53c27e-962c-428f-b540-35ab027883a8.json | 132 - .../6f7b2d91-24d6-442c-93a5-9afc88e9a308.json | 132 - .../21793520-7d1a-4040-bb96-fa7fe98ae580.json | 132 - .../59d53c40-5b16-4a70-a693-5fb554cf7614.json | 132 - .../b28a569c-6bdf-4547-a2ce-c3e224764be3.json | 132 - .../2de129c8-2259-4367-a619-85d9e8f61e06.json | 132 - .../c242030f-fb2b-42dc-a5d1-687273b17282.json | 132 - .../3b3fdb16-b6e1-40c8-9ac0-02f1f2207eb7.json | 132 - .../ef6e8e0d-7ba4-45ea-aaf7-617f68f2e97c.json | 132 - .../f8c131a4-1fee-4694-8753-88853418ef4b.json | 132 - .../27dec9ff-fb18-43dd-949f-7c0587a5858f.json | 132 - .../060df34d-ab67-43e1-bd56-ebaceb77abd3.json | 132 - 
.../a6357673-3daa-4593-8593-2b65a7d5477e.json | 132 - .../121d4877-1955-48db-a23a-6b0ad0623b9e.json | 132 - .../1f1eab02-219e-4ad8-af50-e103541e1c9d.json | 132 - .../b4cccfb3-1c17-48a3-a211-a26c44de757f.json | 132 - .../05e97a86-681d-42a2-8a47-beade25d8fc9.json | 132 - .../6c0899b4-f066-45f6-827d-11c535ef0634.json | 132 - .../f9660557-b9f6-4ecc-b260-c245f0e62b5b.json | 132 - .../89168032-5840-4c2c-821e-b3d717ade46f.json | 132 - .../10d0aa63-67d9-4dba-9bdc-db7ab3b4547d.json | 132 - .../6f66ae5b-8cb6-4263-98a4-4a1eddfaca10.json | 132 - .../5e715199-7030-47b4-89c6-83ba0968c07c.json | 132 - .../3fca39e8-443d-47da-a858-83a68c18eec9.json | 132 - .../b7518bd2-d3af-49e6-823a-f8d507e8e60f.json | 132 - .../fa399f16-1652-430c-be19-afaf5ab96be1.json | 132 - .../cbe5032b-122c-4a0b-a099-50e998a4bc77.json | 132 - .../fd8c3209-dcc0-4d27-a3aa-d0f76ef86f8d.json | 132 - .../1a18d49c-ad7b-4823-abbc-7191e9d659cd.json | 132 - .../9e2c614e-1104-43a6-9e8f-b7851562e01a.json | 132 - .../7d4b83ab-9c9d-46e5-8cbf-b8afcf781230.json | 132 - .../a42b5d7e-be7f-4cde-aaf0-001e2cf05a44.json | 132 - .../21f6688c-be52-4352-9c95-d37c0a5f6c94.json | 132 - .../e92ba586-7bee-4a9b-b388-e35efde3d36f.json | 132 - .../45ed0bb3-efbf-4a32-9735-d814aa08790a.json | 132 - .../eff28375-89a7-4970-9342-428b07d0c6f4.json | 132 - .../23877e30-b8fb-45ea-a803-47df757ea909.json | 132 - .../8bc25d04-9cc5-4551-a9c5-ce185c7ad974.json | 132 - .../d2d4b5a5-109d-4d26-a166-3d97b341584e.json | 132 - .../ac404d92-7a06-4758-ab1d-fcf840c2b995.json | 132 - .../95ea7fbf-d3f2-4fc1-ba17-05549f6e4d25.json | 132 - .../c101e272-24d2-44db-9b0f-2ed4d17cec41.json | 132 - .../2cb789c7-dddf-42b2-8fdf-4cbd5132946c.json | 132 - .../a414aefd-ce24-49a9-b431-0c6014ebfbd8.json | 132 - .../91fcb6a3-d351-48c8-87e8-e2a06642e925.json | 132 - .../3cd90efa-ddf0-43c4-884c-84337ded14b2.json | 132 - .../c66c21e9-a332-40f9-ae87-bdd78a25d753.json | 132 - .../0b4def91-29df-45d9-8dd4-c4097ec47ba3.json | 132 - .../2cbf258c-369e-4b1c-863f-43cf97c3a7a4.json | 132 - .../8372889e-f9cd-4cf7-aec0-8e18d5c627e3.json | 132 - .../ce4cc270-57da-4d08-9130-62508b409cb2.json | 132 - .../4cfedb8f-0e47-4008-9bc5-fb15e4afa607.json | 132 - .../de3c949d-bab5-4430-bdd1-48e1b7860934.json | 132 - .../011e53cd-409f-479b-9c3d-bfce75a1277b.json | 132 - .../1ff40e45-5be4-4625-9f66-5599a829903d.json | 132 - .../fed97d94-2949-4383-8f25-fa79bd413508.json | 132 - .../f4820bc8-7dfd-4439-af95-21b6cc9367ac.json | 132 - .../36e576bb-de50-49ec-a91f-f134c11bbe38.json | 132 - .../0edd388b-7a1b-4334-9b72-52d84653ff67.json | 132 - .../b3199674-328e-41a0-9aa4-bf39aec735bc.json | 132 - .../52db4d79-7040-4525-934e-0f33e4acec63.json | 132 - .../ee34821e-9182-433f-a8b0-745711e23738.json | 132 - .../10ef0990-5356-432f-b24c-dd107188ec5f.json | 132 - .../47de680d-33b1-4441-92da-4b97a5fc513f.json | 132 - .../96ac0351-2ade-4d76-bcf9-bc0f633f8694.json | 132 - .../31aae266-c14b-451f-8bab-62ee7d5d382e.json | 132 - .../f6edb102-e867-46d1-afdc-3c45166bd510.json | 132 - .../8b7756cc-9af3-4f98-84ac-7fef4c1bdaa0.json | 132 - .../dcf33a22-5e57-4476-a2cb-ebd60407a920.json | 132 - .../15659480-be0b-41c8-a463-873be444b194.json | 132 - .../0444c1bf-a3d3-4d23-bc6c-0a98c4dc1e9d.json | 132 - .../93aa3a13-5069-410f-a1df-6944e0231e0e.json | 132 - .../427ea7d0-c1f1-4cfe-b6a7-555262a7a317.json | 132 - .../c6dbe372-7a3c-487c-87c0-fb324c39f8c9.json | 132 - .../cf8d99c8-8790-4bdf-bfc2-1a6d1fe35916.json | 132 - .../5b5d42d7-8012-46f1-826f-32d839806048.json | 132 - .../5e1bf2cb-55c4-4806-89af-cb9953c7c1b1.json | 132 - 
.../21ee4b33-9829-4cca-9603-c30fd4a1f7ff.json | 132 - .../c6c14a8b-0e9f-4b97-b9f3-27c7250fb8f2.json | 132 - .../6586fa94-9f43-4814-8c8a-8ed244ac94e7.json | 132 - .../df7d7db2-867e-47f0-9abf-d71b79e97630.json | 132 - .../e2502e7e-3a10-49f3-b5c6-b20496fed998.json | 132 - .../51cde18f-09b0-4b66-a962-811ee49e192f.json | 132 - .../4ea48b42-8026-4799-b35d-46757fd2753f.json | 132 - .../52e9b4ae-9119-4f26-87e4-6532d1148ecd.json | 132 - .../4bda68c0-cc09-4945-961b-48776b7b5fc8.json | 132 - .../18ea0ad0-a216-4906-a96c-c8b040398dbd.json | 132 - .../1e2321f6-93bd-4acf-9f5b-c82807a40233.json | 132 - .../13032961-52a1-43cf-b69d-1802c43e1bcc.json | 132 - .../9d444061-2c29-499a-8906-77ef58aba34d.json | 132 - .../1ffdf6b0-b3a3-432a-a0e4-69b4d447bb76.json | 132 - .../8ce733ea-e6e9-4f9b-ab28-f93202507265.json | 132 - .../0e88aa91-609c-4d2d-9296-25b06eeb0342.json | 132 - .../3e235ea0-3f04-4d99-9db2-7cafcbdbac6f.json | 132 - .../5e31a55c-f222-4192-b031-27bb40ba56fa.json | 132 - .../11fd4b70-4ea7-4bee-8caf-8921d4c89f24.json | 132 - .../8e721067-898d-45ca-b4f5-9f523c4ce3d3.json | 132 - .../be5d5480-ce4c-4ade-8c6a-c08cd2826909.json | 132 - .../54dec074-29f8-4863-be37-2c08f6f2c3cb.json | 132 - .../88a15025-556b-469d-be77-c773f2c61038.json | 132 - .../b4f4596b-17e5-40bf-ae60-0b17492ba9f8.json | 132 - .../97ce858e-a64f-4881-b6d0-0a2c0814336d.json | 132 - .../1becd83e-e9b8-49c1-a137-80c5a8dbdf0d.json | 132 - .../337bb321-9c6e-4751-9c9b-d8ba0120dd07.json | 132 - .../cfa95cc9-5bb1-4921-97c7-078f2f929a2f.json | 132 - .../6d5ba3c4-a0c2-40cd-9766-68d36d21c5b6.json | 132 - .../6cc4404a-f3e1-47b9-b56b-34e4269e1261.json | 132 - .../8d820e43-ff42-4247-9ad0-4ed8e70672b4.json | 132 - .../d858ce8e-6a4b-46b1-8d51-03ebc2d8aaec.json | 132 - .../9813dd88-ff70-4d9e-86c5-9b73444275c5.json | 132 - .../ac677432-e7d1-4439-9c05-426059c285ef.json | 132 - .../018f270f-3cfe-403c-a236-483038a0b04e.json | 132 - .../718a40ea-26b1-4cf4-9584-57be798640ae.json | 132 - .../207a28a9-ae24-4a31-be95-96296b2e466d.json | 132 - .../72efedb8-d456-41ed-b1ae-4887cb6c18f8.json | 132 - .../ac91fb37-5742-4a3d-b93a-86c63b90cad5.json | 132 - .../c71d025d-e954-4420-b397-e07c3644d1f4.json | 132 - .../968c3759-de5f-4255-ba95-cafc7a3c70a7.json | 132 - .../5e23b2f7-33f7-4e49-b73a-a02b8650ee0d.json | 132 - .../1b6c64f6-acf8-4cff-bcae-6e8b3725c6f1.json | 132 - .../7908f572-8886-4add-ae84-b4ec0ec17c26.json | 132 - .../9e04ec5c-2208-4569-9b63-4768ed4262b9.json | 132 - .../ee2c8beb-6566-4b19-91d0-8e48c12a3fdf.json | 132 - .../c7579616-0c21-443a-a149-0c51a0ae92ac.json | 132 - .../ef7a1429-db2f-433b-a606-339a9d868e7a.json | 132 - .../f531e13c-79ed-45da-a246-857fd2c884c1.json | 132 - .../0f525d93-663a-442c-9a51-1ad3a5054172.json | 132 - .../15af21e1-3193-47fa-a3fc-1f087216d4d9.json | 132 - .../67b270d9-3422-4770-9957-7bde65acca0a.json | 132 - .../e2d38bcc-9133-4051-82d0-4e4fd66e00f8.json | 132 - .../4ff256af-73c7-4a5a-96da-19546a786c59.json | 132 - .../225cbeef-1d0d-40fc-949d-4ba6696fb690.json | 132 - .../24fcd662-5abb-4bf8-b8df-1c21b048cd92.json | 132 - .../7badcb45-7826-4fd1-b964-c697fbda76cc.json | 132 - .../bfb532f1-3319-46ff-80ae-0ca783a18bb6.json | 132 - .../ea304515-b41f-4e96-a0ec-78c897ebf9a4.json | 132 - .../1fe79ea5-1922-4a5e-8857-1c832353b0a6.json | 132 - .../9098d70f-cbcd-4f6c-bcba-0b1da743396e.json | 132 - .../df4ed9e0-30bc-4a3f-b7a2-8955cbb38d31.json | 132 - .../f68957d5-20a1-438f-9931-6a787aaed467.json | 132 - .../416e0c04-9119-4230-ba71-b0f47e2d4997.json | 132 - .../d57780e2-154e-437d-ac2f-0007e1f9140e.json | 132 - 
.../027d464b-1375-4de7-aa57-e1473d16ba89.json | 132 - .../a81f20fa-57e8-498c-a162-6d8a9be09ee6.json | 132 - .../d72ddbff-8ff7-446f-a74a-10a46bce6e3e.json | 132 - .../f681d612-f574-4641-b34e-95b6de97f9e8.json | 132 - .../cae1adaf-e424-4dcd-943b-5bbb708aca57.json | 132 - .../969ac825-92f2-448c-899a-226e69dee377.json | 132 - .../e108ad28-c155-4162-852c-0f588a136bdc.json | 132 - .../93cfeba9-7d31-45b4-a6e2-99a5f318f5b3.json | 132 - .../c1b16b84-9392-48f3-b483-0a9786925506.json | 132 - .../b0c6e08d-b426-49d5-8a66-ee3d70131b62.json | 132 - .../6a6651a3-b34e-404d-ac25-42c151fb9ba3.json | 132 - .../da63b789-5571-4ed8-976e-146d385b18e2.json | 132 - .../87b900e7-3bab-4e60-b0ef-349667cb2656.json | 132 - .../c9fd4740-4990-4174-b782-9b63c34d6407.json | 132 - .../2582a049-e940-408b-b2d9-7a7bdf470e49.json | 132 - .../99310118-d2ec-4647-85db-fcc22aee9161.json | 132 - .../bedd12e4-da18-4ca6-ba51-6d13e1c80bae.json | 132 - .../6767e14a-bbfa-4a0d-8120-1f48a565474e.json | 132 - .../70260aac-1bbf-4913-9dcc-58633d055314.json | 132 - .../fba6e1a2-c197-4731-91ea-f6d059ba8b16.json | 132 - .../22e74d0c-70d6-43c5-be4d-62842d93fedf.json | 132 - .../f7c33065-1da1-4da4-81c7-f2c9307b6e9b.json | 132 - .../ecdb4661-426a-46be-aefc-7e04483cebc0.json | 132 - .../236976b3-af46-45ac-a8a5-f5897e3468a1.json | 132 - .../fd175296-a5f6-4914-80e9-b8b75bc659de.json | 132 - .../d910bbaa-d55c-4b00-9320-856a8a6713c0.json | 132 - .../99a5f123-5d2e-469b-884e-c9a64c6bc197.json | 132 - .../ed17a715-f0ae-461c-9618-ac952c450ec5.json | 132 - .../3dd2a474-9ea8-4e26-8986-5bcc67c78c39.json | 132 - .../b39e14a6-c05f-4e88-b2d4-63a199aa61a1.json | 132 - .../39893637-552a-48d8-9b83-433415eb26c3.json | 132 - .../f9549713-f487-4e26-bfeb-ec6d394b7014.json | 132 - .../02579c41-f117-4412-9c00-ee7db3e9ab97.json | 132 - .../bfa1d761-00aa-4438-a5de-972d934c63d5.json | 132 - .../20a84d88-05c2-4e02-8c84-2afa84cc659f.json | 132 - .../84eedce3-3a93-4630-b914-aa281fd2efda.json | 132 - .../b3b7b62f-ac82-4ef9-9634-afb81645ec19.json | 132 - .../283c5166-b9c5-4d20-9653-0cd0346d87c1.json | 132 - .../478b54cd-6410-41e5-8a53-4e46bcd9d7af.json | 132 - .../de2ae7a9-93eb-4149-b3ff-b5b7dfba29c4.json | 132 - .../ef5aa9db-804b-4a53-9c22-9c99f6c69eeb.json | 132 - .../553fd36d-08dd-46a3-ab04-77b9039e7921.json | 132 - .../e2bae853-cc0f-456a-a635-98d5f87ac47c.json | 132 - .../d6c5f196-c97b-4a0a-81b0-59143ec4b10e.json | 132 - .../5d92e02f-b590-4b6b-8c64-30690f79e916.json | 132 - .../e10f38df-b5d5-47c6-924f-563c6f8a6616.json | 132 - .../27257dc9-750c-4673-8865-986434bc5c0e.json | 132 - .../e599f3f8-e5eb-4bfe-a102-efc5a967434d.json | 132 - .../8e56f2dd-49d0-4eff-beea-53d01cd96f0e.json | 132 - .../f1a2b5d0-2c8a-4bbc-8bc5-0484485c2dad.json | 132 - .../2c12ee67-0c77-4cb2-9e88-1c731ed55c3f.json | 132 - .../567f8f54-225f-4d9b-be06-f24091adc1e6.json | 132 - .../ebb59730-9522-4c45-8f42-c0d941fd728c.json | 132 - .../2c44fa8c-ebd3-4ea6-8578-61da38965c09.json | 132 - .../3ef26b8c-6bfb-457b-a160-a65c3cc8b0c6.json | 132 - .../0ab721ba-fbda-44ca-a349-1d3abfaabe62.json | 132 - .../2fea1128-4f0c-40d8-be87-72c42c0648fb.json | 132 - .../db9dc9d2-4aa2-43d0-9f2e-15fbd05af62c.json | 132 - .../28399fd0-840c-49d3-8179-407ed83d3bfc.json | 132 - .../d7108c13-e14a-4366-9a39-204f853b1bee.json | 132 - .../56152d05-9273-4701-8c0a-723e2cab618d.json | 132 - .../55d2f23d-cb6c-42d2-8b57-837451d3c6df.json | 132 - .../7479ae87-e795-4e20-848a-291614176def.json | 132 - .../04ceb40e-bde8-487b-9d29-dc8f681af9be.json | 132 - .../e26b00b0-d9df-4ce2-a649-b19f8957b8ce.json | 132 - 
.../9954194c-69b5-4eb4-8b32-859845548cb0.json | 132 - .../2afbc279-242a-4276-85f0-facd29c2d89b.json | 132 - .../ba307ad4-3647-4785-9bf1-cd4dacf3c71f.json | 132 - .../d03c73ca-7364-4517-aea4-f0ac564c49df.json | 132 - .../1dd4b82a-ca80-4c9c-8800-f97ab2b9cbe7.json | 132 - .../f2363099-c39a-4874-bf77-ccc0fa087680.json | 132 - .../596eeee8-3600-4f8a-8888-978b610eb2ca.json | 132 - .../595ddba1-c450-4b69-85b7-0e3118c8c6c7.json | 132 - .../64890314-bba0-4fb2-8c21-38b413cff4c8.json | 132 - .../470b8b0d-fbaf-408c-a28e-57d1b294f8a8.json | 132 - .../00a1579e-8636-4eca-9a63-c0b067a5f3dc.json | 132 - .../a52cc4c9-6d60-4083-ac77-591e247d86c9.json | 132 - .../ac5c321a-d35a-4e0f-a1be-bcc0b7109f91.json | 132 - .../c4d11b01-ae5b-4198-b102-07160f100a41.json | 132 - .../19405ead-2263-4613-8053-43beeafb4bfc.json | 132 - .../6c698a60-a813-4be7-b55f-b684029b492d.json | 132 - .../b67c4a44-7787-45e2-b88c-5d7e8e496fa3.json | 132 - .../a20a529e-c52e-41b7-a8ee-909167048bfb.json | 132 - .../2735e6f4-839f-4ab1-8ede-3447891b1b26.json | 132 - .../e74e7e7f-8550-4cba-97cd-2626c82d6b29.json | 132 - .../14f4c00d-8915-413d-8e85-79f395127682.json | 132 - .../9119b586-d3b2-4ce0-a243-d584e2087184.json | 132 - .../629f3f1a-f8ee-4d1b-b604-7bbd35c6517b.json | 132 - .../a6ac828c-904b-413a-a5fa-a5ed06a28143.json | 132 - .../251a3ef9-c7ae-4d79-8a60-4bc021a3f001.json | 132 - .../962b48a3-23d7-4104-b34d-4e5c2af31d58.json | 132 - .../e4b0be31-6f9a-4a57-b433-e561da9bd827.json | 132 - .../9a31f208-b7d8-4baa-b96e-99926ecb35af.json | 132 - .../8d933df1-60cb-471d-bfc3-b11c93150203.json | 132 - .../35315c3a-ec06-433a-b3fa-ae7a4a59b7ea.json | 132 - .../3530db9a-0d61-4cf8-9fff-b15f6488c845.json | 132 - .../7d9901e0-eafe-4d49-a5bb-fab059708bcb.json | 132 - .../ee7f9025-bb2c-4902-b8e2-bfac2b63d2fd.json | 132 - .../6157f79e-2673-4ad6-99d7-e5cf5e4e1db2.json | 132 - .../0aa7572c-1aa6-4997-a2a2-3b557fbde639.json | 132 - .../6f5df760-2d3e-47b1-b55e-4031a5f11d41.json | 132 - .../ac676b03-c3ce-4ff1-83fc-5c8db82f1497.json | 132 - .../2229cdf8-3ecb-4f11-8824-9c3bfbf6f968.json | 132 - .../95ebc5b8-a541-4fca-9e7c-692720e73362.json | 132 - .../09a2508d-a171-493f-9ff2-e7f375815c91.json | 132 - .../12a4a921-5859-4fd6-9d64-677a7d8ef696.json | 132 - .../b79f12d0-cdfc-4c9d-a88b-40612dcbf64d.json | 132 - .../d162cf7c-3ef4-420f-aab4-789a98b1195a.json | 132 - .../7e49018e-5e2d-4cdb-be5b-2ac04ec84bf5.json | 132 - .../24677f2a-ea89-4289-bcb6-13699de9782f.json | 132 - .../3e09df3c-2224-4a29-8e55-18a485db2b25.json | 132 - .../cc0bd236-8fc4-43d3-a18f-4b2afb112946.json | 132 - .../5afd4c0f-b61d-452f-8c48-d298780d91d5.json | 132 - .../eac52141-4fd8-4e21-9c78-920ab8933e5a.json | 132 - .../8449837f-64ac-4293-b1f8-210e62779202.json | 132 - .../ab8a665c-8234-484f-a8a9-8ee79d73edff.json | 132 - .../a954242f-41a6-49d7-a71d-3bfe940cdb92.json | 132 - .../6d1c518f-3f42-49eb-9208-b30e27e7e87e.json | 132 - .../87931db7-42a4-48df-b5a5-8bd934061dbe.json | 132 - .../54088dbc-04cc-4b35-b4e1-e495b7cfd47f.json | 132 - .../7129efad-8ab2-4f7a-b6ed-055989b3e131.json | 132 - .../cfc6f85f-e4b6-4164-b7eb-4efb888e1ba5.json | 132 - .../0f053a45-cd79-4e51-9b4c-ae5c51006c17.json | 132 - .../d8002b35-1454-4635-a31e-b419c7000b53.json | 132 - .../4c08530e-d529-49a1-a3fe-2351c422981a.json | 132 - .../d16879dc-7ed7-49c4-aca6-4c9cd3b3a350.json | 132 - .../70656b13-e0a2-4ef4-af43-0d9995d57af6.json | 132 - .../6544f1ca-02a6-4e58-98f0-e19cc6082682.json | 132 - .../5cd3796f-fb31-49c1-a974-019c5c5b20ae.json | 132 - .../49eff9ad-90c9-43b1-a1f5-cf371ac4b39b.json | 132 - 
.../59720f7e-7e09-483f-8332-8dc7aa19ae78.json | 132 - .../a3a89e4a-0589-4776-a1da-227552482e94.json | 132 - .../b3c04d1f-80e3-4d86-9779-c5e4bbce6f35.json | 132 - .../448fda35-bfdc-42ae-90f9-d44383e0a454.json | 132 - .../0d97542e-82b6-4f27-9822-62b67e7690c2.json | 132 - .../2725bd69-839d-4427-8e05-0e289fff70de.json | 132 - .../adb71488-adb8-4848-bf1d-aecd04cb6718.json | 132 - .../c7736577-c4c3-4233-9308-a4bb9b2dbb89.json | 132 - .../76fe52f4-9fa5-4ccb-8c92-7bd9eb9886ee.json | 132 - .../1d92e45f-c5a5-4dd6-a61f-8e0f7246117a.json | 132 - .../5e1513f1-4375-4380-85fa-b96a419c013b.json | 132 - .../fadbf3b2-283a-4f8e-9acf-463d75924b97.json | 132 - .../c04ffe5b-c313-4249-83bb-bbe07ad6fc69.json | 132 - .../a9aa164e-386b-4987-9f49-2dde64ade45c.json | 132 - .../e4c1b3ef-e1db-4eca-b818-f3b1680cc5f0.json | 132 - .../1ab95edc-ea3c-4d3f-9f59-dc7f7468adb9.json | 132 - .../80a81bbc-6edf-48b9-afb7-e4e0a03753d8.json | 132 - .../afb24bf8-3c47-4278-9b84-19b05017745b.json | 132 - .../4f8cda4d-959b-41ab-a79d-d2b35968eb89.json | 132 - .../2818aa8c-5c73-4de9-bcbe-fd8f68e8bc6b.json | 132 - .../6a683ead-0f3e-449b-9ae1-8afc9f1ab33d.json | 132 - .../38cb02a8-862d-40e1-922a-e65f537df87e.json | 132 - .../f816e2a7-2629-4abe-9ed0-3d1299e95194.json | 132 - .../286fae5b-544a-4033-9092-d633fc80f47b.json | 132 - .../93477bf6-ea00-418b-8a2f-975a9554263e.json | 132 - .../3d7c6576-f99c-4bb3-94fa-4f713e2898f6.json | 132 - .../d1e9a242-941f-4461-b75b-7043c2c01ef7.json | 132 - .../e39661af-ad93-41d7-8892-1230064f1a1c.json | 132 - .../595b61b2-5220-48f6-91a0-3aa0d37c63d8.json | 132 - .../3173263e-2a42-4e8d-956e-8175ef464e76.json | 132 - .../f77f8291-1573-4fb6-a984-1cc099c09621.json | 132 - .../c4681e14-513c-4e5e-af8c-88ca11849176.json | 132 - .../0c220edd-2563-4fec-99a4-ef8c210ca5ce.json | 132 - .../bd7ef5a7-aa75-4eb4-8860-aec63f8bf9d1.json | 132 - .../85c20522-03c0-4dac-a1c8-2945e4bf0e0e.json | 132 - .../f180fddd-077f-43f9-b2d9-38c5f33be44d.json | 132 - .../ef384329-8406-4767-ac1a-3eba3131f726.json | 132 - .../2ddeae27-77d3-413c-a6e1-9de0f3980c4e.json | 132 - .../38b2dbbe-be86-4ef0-a39b-89841f662141.json | 132 - .../999a8091-22bd-4c08-bee1-772202e7edde.json | 132 - .../fda91d98-d259-430c-929b-78852cab64ec.json | 132 - .../535bfa4f-ab63-4832-9f17-7b245ff2b2af.json | 132 - .../681a6cc5-5519-4b13-8b50-93adcab4a3f7.json | 132 - .../141dd12c-6901-4a96-a051-f35647ddcc73.json | 132 - .../5b095779-aacc-41f3-9a3f-83f64a1c0d4c.json | 132 - .../7a88c95a-b253-4f36-8fde-1b0158bbf0b6.json | 132 - .../7938a00e-4e11-4223-a900-fa53df168ab7.json | 132 - .../8f966b4e-1baf-445f-9f10-4ba6b47aaf9b.json | 132 - .../a334d998-21a5-4108-96e3-9935507a9f8f.json | 132 - .../941e27c6-81da-4ce1-b1c8-544c1426cd11.json | 132 - .../e409a374-685b-482d-82e4-2436dca37309.json | 132 - .../84713625-97b6-4fad-982d-41b5c500d73a.json | 132 - .../b7edd9ab-a018-4b2f-9b01-b56cbe98abda.json | 132 - .../ec896115-21ef-4337-9fdd-32a04c574a05.json | 132 - .../d8e5f49b-7bf3-41d4-a91e-c566219609f6.json | 132 - .../ce1a92a3-6bec-410f-ab42-c567c5d23856.json | 132 - .../0a125470-b50f-4ca0-90dc-1f6b69c3ccd4.json | 132 - .../aeee0165-ac7e-4da6-8102-ba60f43587de.json | 132 - .../b47b8666-2556-45df-ba5b-9a5e94186784.json | 132 - .../0bde5d57-39be-4497-a2a8-d08d3c8d65f4.json | 132 - .../86599961-3ec2-4837-89a4-809f1dd7226c.json | 132 - .../dc3ca25e-41b2-4206-afaa-7d2d10fd27a7.json | 132 - .../cd77d407-3be3-4b84-8a73-34a15744de93.json | 132 - .../1cd20db5-0225-4724-b1f9-7c32eae456e1.json | 132 - .../dfc45dc3-51e6-454b-aee9-ea6b0714f0ca.json | 132 - 
.../3da2a408-672c-47b8-be32-61f56a15e9f3.json | 132 - .../94700c3c-f18d-4f96-a794-65bcf483fca9.json | 132 - .../6f3481d4-076f-45bd-8564-d485109c7a63.json | 132 - .../9f5ca3b2-747a-4fd0-b382-bf7ef503ba25.json | 132 - .../f1932041-263a-4841-9c8b-c6cc9fa50c21.json | 132 - .../691bef38-bc9e-4f8d-b774-9d7c62eec72b.json | 132 - .../5795f693-9ebc-47c6-9d2c-185dd0d32044.json | 132 - .../eb83f474-0d3d-488c-bc0f-93e5d1dfb2f3.json | 132 - .../f93b2053-11c4-4868-860f-90fbfe8288fc.json | 132 - .../8984fe95-9fd3-48ff-aa5f-18df63ecd6bb.json | 132 - .../a0f6f5de-578c-4290-85b5-c51aed985074.json | 132 - .../8ccc76ff-25c9-4706-b6a8-31b49f8be813.json | 132 - .../924f8b31-506d-4df2-8a7b-d0cd66d55f6d.json | 132 - .../8e7dfd9f-350d-406c-811d-453f1744dd53.json | 132 - .../b713d1d2-351f-43a1-b77d-27723e1d4267.json | 132 - .../322a9442-174f-4223-b839-6f8f9664d5e5.json | 132 - .../b12e71d1-c435-4172-a28f-38e26791dadb.json | 132 - .../ad33b0e8-39c8-4118-81bd-bc86b482f122.json | 132 - .../db8a7864-293b-45e9-995b-5301071c902d.json | 132 - .../31e3beea-28dc-4b47-a5e9-5fafc89226db.json | 132 - .../49315a95-394f-4508-8e6c-7c1d5547c257.json | 132 - .../375d3a94-97af-47ef-82af-afd7581663d4.json | 132 - .../77cfe896-4aa1-4bcd-a39a-f437c3f7e738.json | 132 - .../3d69ec7d-9999-4e16-8dc9-99fad35e156e.json | 132 - .../d2a7459b-8a12-4529-b978-c7237979f16b.json | 132 - .../e7a228ad-69de-471a-9f31-6bdc7221999c.json | 132 - .../9196ae39-adb0-4d53-8399-0ccd4d628065.json | 132 - .../ea318f99-a1ab-41ed-ae5d-39c62ac40e1b.json | 132 - .../05f69fd6-a77e-478d-ad86-3e83e615e892.json | 132 - .../5b8e9508-befb-4674-bd84-9c722a0864ce.json | 132 - .../8beb3730-23e8-4b89-933d-2d3f1a1d1365.json | 132 - .../07417712-1933-4920-8964-67ba74bf6d01.json | 132 - .../ae4cc05d-a65a-4f18-a99c-f133603686d1.json | 132 - .../54df4d3e-0ef0-4e30-aa46-b47a4589a34c.json | 132 - .../a717d466-9157-4991-8459-f39847d914a2.json | 132 - .../15a8789b-27de-49d1-b3e5-9b1fc9b5694e.json | 132 - .../921562fe-cc21-4ff3-93de-a62e1d4bf7e7.json | 132 - .../863969d9-e567-43cc-a0a9-7f80eaba374a.json | 132 - .../2987fa45-363e-4a07-8e9f-db01586a135b.json | 132 - .../3488de21-d9a6-49e8-ba8f-d9beee9bdabe.json | 132 - .../0cacf042-6b62-4b67-8821-97cd703788d0.json | 132 - .../9f0dfceb-1332-447a-bf6f-6c6c40686a6f.json | 132 - .../c1308f95-6d55-4ff6-b14e-1bd09b467d99.json | 132 - .../4ab16120-8d39-4dea-aa76-5c249506848d.json | 132 - .../f9647ea0-6464-4aa0-b1ea-a994a7bcca3c.json | 132 - .../c5ef47ab-2e73-43d6-b9ea-1ee7e50d9df8.json | 132 - .../9ef7a4a0-b751-45ff-ab1f-d50687a3f4c3.json | 132 - .../8b303795-557b-4fa1-bbc6-d36bd77ee739.json | 132 - .../7fec288e-0b0d-45c0-b0e6-17b905cd7ea3.json | 132 - .../5a09783b-82da-43ae-a607-2cfea550d931.json | 132 - .../6c2d191a-a2d1-459c-b2e2-5766bec62ce7.json | 132 - .../121cb5fc-2fa2-4718-b325-c40014802e40.json | 132 - .../8bbfa040-b16e-4116-ad3e-b3e4e58a7de6.json | 132 - .../c8891914-c9fb-4b4d-9592-826f04520e7b.json | 132 - .../e77ffcb3-c7d8-4700-b4ea-fe4e5ba94223.json | 132 - .../da237415-f34e-4cbb-9a94-3ff621f3df8d.json | 132 - .../479f3bfa-d614-46a9-88c7-9891852b0d8c.json | 132 - .../f5f0c7da-fb03-4023-81a7-801b0729a19d.json | 132 - .../40f51424-2922-498d-bbbc-d500667a8554.json | 132 - .../4f25d177-6bcf-4864-87a4-1beb21a7373d.json | 132 - .../b160ab1f-be6b-4dfa-8fa9-36fc65a64782.json | 132 - .../d497a7e3-11c2-4e0c-8788-091caabede56.json | 132 - .../4a55bcf2-e1c1-4fce-8f79-472dae869b26.json | 132 - .../5b00dd5e-0ad3-4ea0-aa0d-2327d610e6a6.json | 132 - .../1c80d383-1ccb-4f32-a63d-dd3954fe5f6b.json | 132 - 
.../75065074-7ef6-41ac-be7c-496cc458640a.json | 132 - .../49a0287b-48d7-44db-bf20-a084919d332f.json | 132 - .../7b2861ee-58f9-4ac9-99ee-2ec663e1b157.json | 132 - .../628542f9-fac6-42a7-8ec5-5cd93f977a7e.json | 132 - .../5b0924ae-cf52-4245-a687-91e4b1742c16.json | 132 - .../459c2b98-c3af-4334-a4bc-13334efe49b8.json | 132 - .../b2780aa3-d299-4180-8441-dd54e94255cb.json | 132 - .../f55d398d-0555-4e89-a37c-def04741a0dd.json | 132 - .../63caf8f8-9e55-4ef6-ae76-ee7184a50675.json | 132 - .../f82ccde3-bd3b-499c-8b8c-182822392cea.json | 132 - .../8a52fb4a-d6ae-4c8d-aed0-2137e0a83ea1.json | 132 - .../b7cbc2fb-2c52-4c13-9266-52103421f2ee.json | 132 - .../f4474361-e897-4dbb-a89e-5451a4724474.json | 132 - .../de257b5e-4629-4f8a-b08d-d2ca372593e2.json | 132 - .../a37aada3-104a-488a-898f-245ff257de46.json | 132 - .../d9d655d1-d94c-483a-a3a2-ca196e1391d1.json | 132 - .../77bf7126-0cb9-43ef-8d23-5f1395f91642.json | 132 - .../73f410be-3084-4994-8406-f8ac70880626.json | 132 - .../24caad7a-15fa-4820-91cc-0f544a34d173.json | 132 - .../e087b221-f813-4688-8d98-17980f98ac5b.json | 132 - .../f4d03bff-3b34-497f-a17f-0379bc562f11.json | 132 - .../2ca21612-ea90-41f3-b618-3ea81c09c3ae.json | 132 - .../d4dc2088-9911-4966-afe9-022df89dd522.json | 132 - .../ad03a075-8f24-46f6-ae04-5a04eb7061c1.json | 132 - .../2d1da226-e65c-48a0-aabb-46b1cf670a82.json | 132 - .../7fb3a035-2b83-4a58-818f-16fe6d9a8ab3.json | 132 - .../87018726-9f81-47b1-883e-609afea7fb37.json | 132 - .../292b9333-96c7-4fc7-bf35-78bbce9f10d3.json | 132 - .../b44224c3-ed2c-4120-9e2a-e6286358a4da.json | 132 - .../f7a2c9af-c55c-4307-bfef-1ca709525d82.json | 132 - .../d9655f35-edfd-4c53-b359-559870e8019e.json | 132 - .../afdd962d-652a-4395-92f7-c16dc874a779.json | 132 - .../2594e917-3ebd-428b-8f36-cb0da668695d.json | 132 - .../91a86644-ad96-4c66-8691-1c0b531b572c.json | 132 - .../331f56ce-5e45-46d8-9143-3f66be20b699.json | 132 - .../6138ebe0-8483-4cfb-8d95-b334bb09e831.json | 132 - .../4d16dd47-42d1-4ea6-8f1b-dc50648bceab.json | 132 - .../a6b0f2bf-08da-472f-b858-8be967a44cdc.json | 132 - .../57c7553d-f3e5-4a31-8c16-66aae570d8ec.json | 132 - .../58c31bdd-f86f-4fbb-8549-191bb9f46f02.json | 132 - .../dd25c1dd-0edf-44ca-b18c-633dbd47368f.json | 132 - .../2a030613-b5f7-4393-ac39-d2d072c913dc.json | 132 - .../f8c73290-c400-4f1f-a00a-516592497b0d.json | 132 - .../b31908fc-5e7e-45d6-835f-4e86a05b23fb.json | 132 - .../4320cb98-7f9f-4510-bb88-448ce231bae8.json | 132 - .../28b986d1-2e67-4462-9165-6cb8f260b6c6.json | 132 - .../fe1e21cb-7934-4022-a74a-777172310021.json | 132 - .../90871638-b828-484d-8822-95ffceb20909.json | 132 - .../04a98dfb-8e96-444c-8df4-ed7cf72a26ea.json | 132 - .../8c5c22af-f230-4d34-b80d-f42ef27e1675.json | 132 - .../f3466a90-541b-4a08-a9c6-d5a79b2299b0.json | 132 - .../ef9ee5ae-d92b-4143-af1b-d62a7c3c7fd4.json | 132 - .../859af708-ac37-4749-bc06-73d92338d1f5.json | 132 - .../e274380d-e0f7-47c3-afc3-e603e6cecf9e.json | 132 - .../19810be8-ea81-4db5-9854-1830b05a5732.json | 132 - .../1258c282-3672-4b42-9d4d-117568e17bf5.json | 132 - .../9b9f6e01-238e-4893-b398-4e1c83c44dfa.json | 132 - .../b267621b-dbba-4c4a-bb9f-fa85734d0f59.json | 132 - .../a7e4e787-8e95-48a0-9d50-53ba9f05cd1c.json | 132 - .../3d39dcab-55df-4ad3-bdc8-03ae684e4390.json | 132 - .../1b499881-9edb-4626-a919-977393d6bef1.json | 132 - .../84b8970c-6c29-4ee1-93b8-c97e4a7c4950.json | 132 - .../2e070663-2622-4a8e-bd39-7f0ef9df399e.json | 132 - .../047fa91e-2dc7-4881-8254-3dfbd4a2ff1b.json | 132 - .../6d73016e-078e-4ffe-b2ae-5b829d1456df.json | 132 - 
.../0b68b5bd-d22c-4194-9ddf-f22e9181f84d.json | 132 - .../03d51d90-fd15-42b7-ad5f-c7326cc642a7.json | 132 - .../d3e5c939-c53a-49d6-80cd-34420dbb176a.json | 132 - .../ab321358-26f9-4577-a5fb-1f5d4b8784b4.json | 132 - .../a43aae68-f12c-4a6d-b846-c498cf35f6cd.json | 132 - .../b84615c0-43c4-49ec-83fe-5d3f8e6026af.json | 132 - .../7e687d24-9e12-4ecf-b283-e222efb9473a.json | 132 - .../4aea143c-28fd-48bb-b911-37ac3fe58220.json | 132 - .../34a8daec-bfff-4cf4-9011-0542b30c1d10.json | 132 - .../3e919d7b-53db-41fb-ac93-224e2768b9c6.json | 132 - .../66becca1-d92b-409f-ab56-44d05cac66fd.json | 132 - .../6293b269-7c4c-44da-bd85-e51954c173a1.json | 132 - .../add3b058-e7bc-4b7b-bb98-0d7039979072.json | 132 - .../db0b6b3f-e5a9-4367-ab87-e58d5c6ccd81.json | 132 - .../54b055d0-80ae-4bba-b729-bd77b3ec7502.json | 132 - .../5c22d0b3-5082-4c6e-865c-71da03cf9378.json | 132 - .../f8e5ee9f-519d-4ed8-bd2a-88897075f401.json | 132 - .../b74c3215-7bd5-42d1-9193-f4c9c6a8bec2.json | 132 - .../27df1e06-463b-4519-87eb-a1666ad3f98c.json | 132 - .../9d975b05-7bee-462d-a33a-afa0d5af94d4.json | 132 - .../9ef9135a-473e-43a5-a460-fd3ec50226f9.json | 132 - .../c57cae01-328e-447b-8945-e3cd2c4b8a7b.json | 132 - .../494c86cf-7f37-49d8-8160-b81859552c87.json | 132 - .../6de5e76e-4297-4bcd-b06e-f63fa28da0e0.json | 132 - .../9b10cd14-82f3-4b36-a4be-5092127d68c3.json | 132 - .../bbd94181-0523-4543-80a7-056b041e03b7.json | 132 - .../e10d8573-e201-460e-a931-49a1b13ceeea.json | 132 - .../e2ca9477-2414-4b8a-8d22-68f9ced54ae5.json | 132 - .../831246b8-5433-48e6-ba11-8a4239373106.json | 132 - .../8277994c-8bf5-4ece-9f34-4fe9a4310bbf.json | 132 - .../5aabc7c5-eb3a-42e0-8b40-0a08004f6e1a.json | 132 - .../cbb73c83-ad94-4973-9bf5-a5e7ca4d1653.json | 132 - .../3ed06a16-d5fe-43d3-a369-f4ed29fb3a5d.json | 132 - .../fc817789-2f44-4d2b-b40e-2422fe33d104.json | 132 - .../5e1c8723-7c43-4d8f-8c7c-386c2eb6b9cf.json | 132 - .../b6740747-19ac-4a9c-892f-6556013ddc8b.json | 132 - .../3263ab46-09ae-4c24-9332-b6874d0d0330.json | 132 - .../a8706a7e-5693-4768-a955-a448549d2e77.json | 132 - .../3c932329-0440-4799-886f-10bc4a5aeb09.json | 132 - .../b1e42d9d-827d-4109-8d1b-182694033b21.json | 132 - .../0c6f0d92-3ee0-48d7-b3fc-70149911a51d.json | 132 - .../73b07681-8e10-414e-8922-650908f9cf6a.json | 132 - .../8b1549f8-0602-4538-842c-abe9dca7baff.json | 132 - .../ad395ad4-0f9f-4b49-83c9-b89fa6b6dd89.json | 132 - .../14c01681-fbef-49c4-b737-a7baaa02d393.json | 132 - .../3ad495c0-da8e-4776-8d05-bc7dce1fe120.json | 132 - .../0762ca9e-f0d4-408e-9992-e91a10e0e65f.json | 132 - .../ec6c1d05-cea7-445c-bed3-9eee1e1ff03d.json | 132 - .../1fc39812-77fb-4d0c-b9fb-706e94c40afe.json | 132 - .../fdc3c502-53ad-4bf7-85ce-51eaed72754b.json | 132 - .../3f74c1c7-f349-4193-95cf-b0033112fea0.json | 132 - .../36a803da-83ab-4c49-8855-9344aaa7a68b.json | 132 - .../df986996-249e-49f9-b074-91e8dcdf62e2.json | 132 - .../90f007e9-e323-4a82-b276-ac1b928030ca.json | 132 - .../2b627f93-5cc7-4a5e-b682-d129396362e5.json | 132 - .../2fde07ac-d218-4cc6-947e-8ceb87eedbee.json | 132 - .../2a141bfe-4632-4058-a232-1f2c5540c41f.json | 132 - .../fa2d74a5-e8f6-4a1c-9310-a9b16c2e59d1.json | 132 - .../c7c0ceff-9273-4cc3-8f8e-bd93181590ba.json | 132 - .../c439478a-1734-4038-aa8b-bb2d12ec022d.json | 132 - .../4a36f73a-9495-4ea2-863c-220b8ca6bf99.json | 132 - .../faa9d3b9-343a-4a9e-82c5-6bc81bc87b9c.json | 132 - .../a55bf380-d567-4228-b30c-57e9df31e844.json | 132 - .../dfd92311-4f3d-4355-8ccf-a59f29914b8f.json | 132 - .../d98e190e-5b5f-46eb-b701-e32d2dbef3a0.json | 132 - 
.../32edb764-2a42-4efe-ac86-9eda81942b84.json | 132 - .../36855ebd-2030-4d5d-9c42-ca049244e694.json | 132 - .../9651a0a1-4004-42f3-ad8f-2aebb38ec967.json | 132 - .../a59e55dc-e2b5-43be-8469-49eee0e98d55.json | 132 - .../a956e306-f184-4dbc-ac7a-3793ae735801.json | 132 - .../c05cc6ce-12fd-491d-b41b-57cc14b6d34a.json | 132 - .../415875b7-fe10-47e7-aca0-029c2f51c067.json | 132 - .../c505ee64-3d3b-48e2-9c8a-f59609a758e9.json | 132 - .../00003185-c291-40c5-bba1-f87eae0afc08.json | 132 - .../328f61d7-677b-4a06-b464-0da42153f9ae.json | 132 - .../9cb5b8fd-062c-4161-9301-640980d21b9f.json | 132 - .../09284b75-a2f9-40ea-8135-7aa61c626fa2.json | 132 - .../e2502331-6ac3-43bc-8218-259b44333283.json | 132 - .../8dde454d-aa48-4ee1-b5c6-f3353087d492.json | 132 - .../662c8ed2-2407-4606-ac1e-ec7ade185d2d.json | 132 - .../332aef8c-7c62-463e-ba3c-07ae0205d457.json | 132 - .../cfdfcf21-e445-430e-a295-946cb8c3fce9.json | 132 - .../a5606b92-aa2d-44e3-a92c-47d0b38fef9c.json | 132 - .../465d473c-ef28-4725-8cac-02f2a031b22c.json | 132 - .../2c636544-8676-4eee-8bcd-d623be0275be.json | 132 - .../8b332fac-1cfa-498b-853a-52ec5492ddc7.json | 132 - .../2bf1b38b-e90b-4fa8-b19e-47d93ff9ab4e.json | 132 - .../69bb0243-75b2-4858-ba6b-5e70cfb516a7.json | 132 - .../4bb7e325-8741-4c09-81f6-9efdb30ef5a5.json | 132 - .../87878b74-22ce-4554-914c-03e486d13de3.json | 132 - .../5030f8d4-f216-4f78-84f1-dd03b0324bb0.json | 132 - .../c5e244fd-e85e-4fbb-9703-b8e733fb91bf.json | 132 - .../38261a01-62df-42b2-9b1d-f924598e70ef.json | 132 - .../5736f0b5-3903-4774-a84a-c3db260d36e4.json | 132 - .../70134d58-972e-49c9-8cde-4ba2691d3dc3.json | 132 - .../d4bb1440-2064-4752-bcb3-c9cec234fd1b.json | 132 - .../d9e6059e-d20b-4465-b7ba-2ee3a72562b6.json | 132 - .../f8b02d65-c8a0-43eb-b48e-d1e1f7f363d6.json | 132 - .../7bf23db0-877c-4700-95c8-e35dee5e57b4.json | 132 - .../07f8351e-c7c6-463f-9e91-ee1d3bb2b35c.json | 132 - .../8535ffae-f39d-46ed-89bb-a1656885db91.json | 132 - .../5e832121-9a67-44d9-973d-fffdb1b37975.json | 132 - .../92d3f67d-a026-49e3-a440-68c10fb358ae.json | 132 - .../9d0baaef-bd31-4a96-bb2a-e92b62b748d2.json | 132 - .../489e8e84-5e30-46fa-a421-f52308f051e7.json | 132 - .../a208f807-c930-4e81-8ebd-dcbb4db76442.json | 132 - .../4956539d-a255-4c56-877f-257e463fa3e4.json | 132 - .../3451eb65-020c-4e34-9128-7410e6b293cd.json | 132 - .../b5cd0061-e4dd-4049-a51e-b16490e69120.json | 132 - .../c4686af6-0b7b-4df3-9152-14a3ef087b7f.json | 132 - .../155885ca-11e7-4cd2-b26c-53e001e2a6f9.json | 132 - .../d9ca5411-def6-43b3-a522-595131d8e5e6.json | 132 - .../e54553ab-0897-4cb5-9213-5bb72758d2b5.json | 132 - .../eed48cdc-18db-4c03-84bf-d2d50e3328b0.json | 132 - .../d7952aef-37e2-4c15-a1a4-598690773bbb.json | 132 - .../5e1e1376-bb22-4fc9-a1d6-3f2fe7d302b9.json | 132 - .../cfdae559-f3f1-4a78-b4cc-fbfb8bb37b16.json | 132 - .../a12208ce-e9e1-4476-8054-0d565efad92c.json | 132 - .../f46e1eeb-8b8b-4d47-9510-445109b5518b.json | 132 - .../7dc4970f-ce35-4ffa-9052-2ab40abb1e55.json | 132 - .../823e886a-1431-4078-81a3-4b941983461d.json | 132 - .../583609f0-de5b-43cd-a667-bb2c36679fd2.json | 132 - .../2d2cea8b-167e-4d63-b01c-537f372672f9.json | 132 - .../f584f596-3a17-404a-81a2-3033ad38cad6.json | 132 - .../ebb0930f-92be-4e1b-a2a6-779f69d2151c.json | 132 - .../b8926567-e208-442e-8ba8-c6dd4ecc5c4a.json | 132 - .../4bf6efe1-81fc-48f6-96ba-8df9ffbef2f2.json | 132 - .../05ffcb7a-2694-4276-bf45-73e1110bc494.json | 132 - .../dc3b944b-a57a-44ab-87ac-8e1882b7bcce.json | 132 - .../154f70b4-d77c-4d1b-b85c-bc81fe8162bd.json | 132 - 
.../998316d2-389a-4ce0-b0b0-0430c1361de7.json | 132 - .../ce803cde-6e23-433c-a4d2-38c5cb5ba14b.json | 132 - .../2519485b-47cd-497c-a349-9e69db0266f3.json | 132 - .../56d86e26-4ee6-4652-9b7b-a538238a24d4.json | 132 - .../416b89e4-5e8a-4131-9403-e8967a4127b8.json | 132 - .../347a90e8-d8b7-4266-8242-ceac865796a0.json | 132 - .../389f7ab8-b30e-4d0c-b9a4-625e74a1f73f.json | 132 - .../6ae33b7f-53a1-45c5-8b0b-d462188c3f9d.json | 132 - .../d96fb0b2-7cba-4cc4-a5f4-b8a451754857.json | 132 - .../f8d362f6-eafc-4d11-bc40-d169d69d3a95.json | 132 - .../4bacd3dd-44c2-42d8-98c0-3eeb920dc0f0.json | 132 - .../de073f45-0d14-4f8a-9d3b-d4fd961186b8.json | 132 - .../fd88d234-b3f9-4f48-896c-af58f1a69880.json | 132 - .../273745b1-3761-463e-b9ab-7860968064eb.json | 132 - .../101d84d3-e741-4eb2-bd8a-db6c12022fe2.json | 132 - .../9c82deca-1998-4506-b038-c5dd592324d8.json | 132 - .../da620a94-4c0d-4c50-9619-10e12001fb5d.json | 132 - .../51dade8f-34e7-4237-8691-22655249bf76.json | 132 - .../cdd59385-0a54-4ca1-b24d-9316a70f2875.json | 132 - .../514a3103-e8a1-49e8-b9da-a85963f5b3dd.json | 132 - .../daafaafa-1e00-4433-95f3-91c169598ebd.json | 132 - .../50e53ad5-8693-44c1-b5c7-45b91d7e0ae4.json | 132 - .../bda5d02f-7973-41a3-8f8e-4e33a12b74e0.json | 132 - .../99ff5ca5-4409-4d9c-9ec0-4cf392afeff2.json | 132 - .../362f5875-4dbc-4e68-90ce-789f692bb533.json | 132 - .../fdb5faf6-2cdd-42bb-b154-d6e93b2348bf.json | 132 - .../93f829b8-b8d9-4389-a210-2a38c3a30edb.json | 132 - .../6ec3554d-377b-4bf6-88ef-8a4c9e70f485.json | 132 - .../70d749cf-2e92-4847-86de-7964fc8eb990.json | 132 - .../623f2b04-6cd7-4ea0-8844-badb0ff6c9c6.json | 132 - .../e1aca741-2765-4e47-b6a1-49f3d9532432.json | 132 - .../4f42366e-e6aa-4974-9a40-5781e350616d.json | 132 - .../4ec2231d-c012-4ad3-830c-8ff86c977202.json | 132 - .../1d2e5513-bd0c-4795-8487-f5266c6e368f.json | 132 - .../104172b7-86f5-410a-a454-63e1cfbeb87f.json | 132 - .../d28e04ac-7d18-43fb-80b8-82c0662fec79.json | 132 - .../20bb3819-9d85-4d84-99ba-65e33965f0c5.json | 132 - .../3a4bdf58-0137-4d85-b567-59b3fed3dad5.json | 132 - .../04f843ba-947c-4732-979c-2aeae7d34e5a.json | 132 - .../173a31d3-7d12-4ab1-a963-005a81aee767.json | 132 - .../d0555736-b614-43ca-91d7-8264e3566872.json | 132 - .../4b7b13b7-4aee-4462-87e6-aa6c15068236.json | 132 - .../4b1f9ce5-bb12-42e3-b0e0-afaa784b0c4c.json | 132 - .../acbcd5a5-bcd8-4209-b35f-425feada7e8b.json | 132 - .../cb9a415f-1a02-46ad-a731-bf825ddd78ae.json | 132 - .../92cde6db-47f4-43c6-9ad5-643c35faa226.json | 132 - .../5e88a037-f9bd-4b39-944f-f0781bb7884f.json | 132 - .../d4b08f5d-5add-49f4-b8db-c1a12e0a5313.json | 132 - .../ac5adf39-f0a4-439b-9873-9141e0a554b1.json | 132 - .../62965c92-cdf4-4a3b-b035-990abaab615c.json | 132 - .../3866ece8-d70a-4061-9e86-0798ecd98bd6.json | 132 - .../ff484d0e-bb14-4a80-ae29-2351b03cf278.json | 132 - .../06ac1718-fe71-4e05-a47f-1200e067336c.json | 132 - .../4ddb1616-7889-45ef-96de-823fee338e1d.json | 132 - .../487dd91b-5bc4-4355-90d3-c82ecc789ab3.json | 132 - .../a74e86d9-8b94-4f60-8f0c-73cc4b04d905.json | 132 - .../9a9239ab-9e0e-449b-bd1b-6ec280fad505.json | 132 - .../2c710cd5-75a6-46b7-8356-212da7bf864d.json | 132 - .../377d5240-73b5-48d0-bbdc-0960ad1d9069.json | 132 - .../9f31a6da-c5bd-4143-b2f9-715c0e9f7b74.json | 132 - .../104a0157-c614-44cf-b6cc-9f15dab4b187.json | 132 - .../bb379093-c169-44bd-ac86-edb8ab8fc225.json | 132 - .../e29001c0-17c0-4deb-8ca2-ce9ad06d8cb3.json | 132 - .../43d87bf5-2620-4f8e-a8b6-f86fc157d987.json | 132 - .../735d9d75-d9d1-4553-b7cf-f8e7c2e65218.json | 132 - 
.../0c6dcc87-343c-4973-a589-3e3393829184.json | 132 - .../7c1d1657-e9ae-433f-be9d-523431bfc7ae.json | 132 - .../0b2d9a65-c028-4f4b-a280-dc0c35ac9516.json | 132 - .../e87e1d3f-1476-499d-a9f3-b6463b429262.json | 132 - .../246e8450-3c53-4bde-99bb-5663f751e88e.json | 132 - .../496b9e45-2f64-456e-b35e-12a94c5643b1.json | 132 - .../05890047-a95a-433e-b6b6-fb037592cdd1.json | 132 - .../4a30580c-1d25-49d4-984d-2d28ef3a5656.json | 132 - .../696d7966-d140-4f43-91df-54f02247b34f.json | 132 - .../fdf10ab8-e3f9-49e6-8fd0-ed116868c217.json | 132 - .../9ac16d1f-d894-414d-8a14-110e971d0ba6.json | 132 - .../2eb01e0e-8f7b-4956-9a2d-b32ecaa936f6.json | 132 - .../3b221b0e-6158-471f-bcd2-b09514f28bd7.json | 132 - .../c8af8428-aab6-4d19-b185-2b437c0334fa.json | 132 - .../c617d12b-c37f-47ef-9704-e19774c67aeb.json | 132 - .../577f31e2-1808-45e2-a528-5933019cfa85.json | 132 - .../7bd7f5c8-be9e-473e-be18-03ad22a195ee.json | 132 - .../5036a549-5583-4775-935a-1a12b6de3e7d.json | 132 - .../5c0ffff9-542c-424e-88e9-89584e686e12.json | 132 - .../5c6a045d-2c90-4938-9185-9c1a0f82903a.json | 132 - .../02480176-2058-4e71-a970-9698be8d235e.json | 132 - .../4be1e5b4-254c-4287-907d-cc845042de37.json | 132 - .../21b51852-5cad-414e-92d5-31878f025d67.json | 132 - .../9eb07d4a-1f01-4696-9137-d477ffca43be.json | 132 - .../4236485b-aa92-4bc4-a652-17ed3231ecf4.json | 132 - .../9c0d6b71-8c6a-4294-961c-972a002b847f.json | 132 - .../d1e906d5-8f0d-49c2-88c3-cf71774de600.json | 132 - .../798e4f83-6262-4d5b-a854-6ff114167209.json | 132 - .../dd2603d5-e99e-4778-95d0-159c788626cf.json | 132 - .../41c71990-e79d-447f-b082-63c96fd67a1f.json | 132 - .../b9e25948-2871-4b6c-933b-8a731e48e81b.json | 132 - .../7c70df74-2bc2-40e0-b0f4-77be1a7e044c.json | 132 - .../ea71bdd5-3aa1-4d26-9256-5aeb2f79fa8c.json | 132 - .../b0e9c0ca-cd56-42c8-96ed-477884bfd9f9.json | 132 - .../7395fcde-49dd-47f4-a8ea-463eda40f5e3.json | 132 - .../a130087f-566f-4405-b662-1102f1664c49.json | 132 - .../3be58cf3-4761-4459-9f3c-eabf812a3c19.json | 132 - .../dbdd71ad-db5b-4b4b-8856-68b55adbe127.json | 132 - .../da159a16-48a0-45e3-ad4d-bdc9e8b5288c.json | 132 - .../77d5f51e-5ad2-42a6-a32c-060cd844b949.json | 132 - .../724cc582-cc83-474b-9606-70dbc22f3581.json | 132 - .../8a1b2aae-d717-4b49-8ed2-a7ee2cee1940.json | 132 - .../0dfb062d-a6ec-42a6-a9f9-6f6424bbdf0c.json | 132 - .../ab2512fa-2335-4817-9a76-3259690bbc67.json | 132 - .../fe7f1442-b7db-42d5-bc83-b8afd1d0c802.json | 132 - .../0e14484a-69d7-423e-bf6c-33d0992f408c.json | 132 - .../881eaa2c-af5f-4e84-8807-d0835c10ebd2.json | 132 - .../ef8a7079-9d13-42b7-ab2d-b72df5ae5d95.json | 132 - .../db8d3fc4-58f4-4f07-8c27-c73a4a4719fb.json | 132 - .../0c44a429-e705-4794-b702-1a731e52df90.json | 132 - .../92b3d2c1-61f4-432a-82a7-43b4367f7ef0.json | 132 - .../5703e81d-055c-459b-8202-80ec382a8d5b.json | 132 - .../f6260b6e-52a2-4142-93ba-5393807fa0d4.json | 132 - .../83b84506-4826-48de-a6fe-2af6ae5d425a.json | 132 - .../7483e260-9853-4d3f-aa10-187796d96de9.json | 132 - .../f9925806-4252-44e8-b67e-917737572bd4.json | 132 - .../70470e6c-8d66-4249-b762-a5a2e3589a53.json | 132 - .../d3abfe3c-ebfe-4dfd-b0db-93c14d32c585.json | 132 - .../a35b06bc-d759-421a-94cf-f408a98e9273.json | 132 - .../bbac659c-7cf8-41d4-98d4-ded4c471bd98.json | 132 - .../0c73f3a0-0a92-4b1c-abfa-6eb77138dacd.json | 132 - .../a7ab6f16-717f-4567-8057-a4a18e1a1e77.json | 132 - .../2abe2c9d-032d-469e-852b-114eca5e84f8.json | 132 - .../2e8a83dc-c760-4f42-a361-e02cf3a65427.json | 132 - .../743dfe64-e7cd-493e-817d-8d5fcdc2ea24.json | 132 - 
.../4e37c90b-65a8-4b71-bfc2-d63541fb8962.json | 132 - .../2e34d74e-1b69-4daf-8bee-77e5357fd439.json | 132 - .../0646e2f7-d2e6-42d3-8f09-f8daee302709.json | 132 - .../c66b1ff8-9c04-4f9c-b83e-088f31f79590.json | 132 - .../1bd2affc-9970-4149-b52b-51549b1f0029.json | 132 - .../f0479d74-4684-4b41-a63b-16d7fe0e3290.json | 132 - .../95deb890-a15d-4c71-8151-ed45c3dfb87f.json | 132 - .../1c07fc4c-a773-4e03-bb14-7144e7815c01.json | 132 - .../e7e8388e-db3c-4881-b67c-5177c60562b9.json | 132 - .../c4923208-2a47-45f2-a74a-4483e4b99bee.json | 132 - .../b5f06a78-5b57-45a5-93be-4f3c1b36f208.json | 132 - .../835f19d3-515c-4bc4-ab96-5cb5bece45dc.json | 132 - .../7dd96382-6fc1-4a39-924b-d9034b5b0839.json | 132 - .../77a666a2-a9b2-43cc-8e64-67172f4ab6c8.json | 132 - .../e3eae267-46ab-4433-a8f3-2a2f8448299b.json | 132 - .../e31308c4-8eb2-4a72-8127-18049d58b814.json | 132 - .../c7098a7a-e865-4ecd-b511-abeb2c0872bd.json | 132 - .../b3a8c734-e63a-47f7-af2c-a3b6518802fa.json | 132 - .../35937965-2791-4f75-8954-5a2280381c91.json | 132 - .../4ab806fe-738d-4f5b-89e4-004134d2f7fe.json | 132 - .../a937e27e-b757-4de7-b679-01ac29d8bb22.json | 132 - .../1d906aab-33a6-4ffe-8a63-694482d83d09.json | 132 - .../9e101298-6482-4ae8-83e4-b948ba8fa550.json | 132 - .../3818710d-80a9-4e7d-90e3-f06afffb71ac.json | 132 - .../a18ec0c4-6f3f-4904-b69c-e40770df169e.json | 132 - .../529c2bd4-6b8e-4e3c-8737-c0b794444d13.json | 132 - .../9e994362-a1d1-48f7-9db1-dd9d532b9f35.json | 132 - .../cf35b7db-f675-4362-8916-36b0582b64f4.json | 132 - .../79ee7e34-36cd-4024-8978-86c1b059ae5f.json | 132 - .../9ec4fb99-ed4d-416e-9342-0c036aadd35d.json | 132 - .../8788e4fa-04c5-4f7c-bb4e-523287901f71.json | 132 - .../18097bf4-5149-40e9-9850-558c3f143ed8.json | 132 - .../b5942721-5c30-4c49-a6e1-fb5419539652.json | 132 - .../76d27de3-0309-4e4b-8d0d-0e402bde0a31.json | 132 - .../5c0553ff-4910-45a9-aa8d-3a76af098403.json | 132 - .../fd97d1d9-a1b5-429d-b73d-1ea92ae1d61c.json | 132 - .../f77aa103-5a09-409c-ad72-7992b6049f94.json | 132 - .../0afdaa1d-c1e7-4283-a2b3-f459c09df4a9.json | 132 - .../044ed79b-0c54-4a7a-94ba-a3f999adeb0d.json | 132 - .../ac6b884d-62ea-4ff5-8eee-cfce08869030.json | 132 - .../8ffa696e-adef-4808-ba0e-bb04921a433d.json | 132 - .../8a2cfa62-5f13-447e-8d0f-2503e4962ac5.json | 132 - .../4f24fc46-3686-41fa-bf25-a0e39b252cc9.json | 132 - .../b1375cb4-b0d5-4cb4-ad43-394ebd1a481f.json | 132 - .../4ce062da-acfc-4684-95c2-679cbe5a697b.json | 132 - .../3d785765-befa-4e53-8672-769f7bb87dcd.json | 132 - .../ab0d3a24-19db-4d00-892e-bcb7c0f2f30f.json | 132 - .../31f0b186-1805-42ff-86cf-d8455a66d538.json | 132 - .../ed6b3e7e-d294-420d-b9b9-460a52cd0239.json | 132 - .../91dec0c0-9854-4790-a0a5-e17d19636f17.json | 132 - .../599616fb-26c1-47e3-a98b-9ad922a95c08.json | 132 - .../aeee4365-c34d-46b9-8c98-29976010bb62.json | 132 - .../1ec68708-94c9-4561-bb99-7f211d7a9950.json | 132 - .../0b53e7b4-0e91-40a2-911b-cd0d415e9fad.json | 132 - .../91bcd646-fe3d-458b-a426-a6a8863d69a0.json | 132 - .../2e0458cc-e092-4770-bd80-00dff169d754.json | 132 - .../d56ef415-0edf-4fde-8277-ae44b4bb4ed2.json | 132 - .../a0a1beb8-ee9a-4e88-b939-6e0104ed76a7.json | 132 - .../f9b7c3ee-ea8b-42f0-a55a-6171d4e3d0ea.json | 132 - .../2c8c6c6a-ce95-4d11-a33a-d547859fee11.json | 132 - .../47858744-3378-4ed4-9101-8acbc3a53cda.json | 132 - .../2aaeaaa7-89ed-4666-b0a5-8c1320ec4ec5.json | 132 - .../23ae6a72-5a1f-4961-8662-feb4d8ad8a26.json | 132 - .../312ec315-6175-4f99-8741-97d97eb26b47.json | 132 - .../7869bbe3-fd17-4e6d-9546-94d3df5e83ef.json | 132 - 
.../68c9fb85-f90e-442f-aa96-458dabe30b39.json | 132 - .../6891d1dd-0e1a-42e8-9206-64a4c71854f9.json | 132 - .../c62eb6b3-2a3d-45bd-acdf-bad717e51766.json | 132 - .../55d4a6ae-44e5-4a1b-9509-299fbc6c3a36.json | 132 - .../227e3e19-29d6-414f-b538-9f6f89d47677.json | 132 - .../e922ac2c-e8d0-48f2-99fc-da70c925136c.json | 132 - .../59f93c1c-3712-4ee2-a3d2-999e5acc2ee5.json | 132 - .../a98dcf1e-6abb-402b-9e0c-da7c23b74bde.json | 132 - .../a889f561-0d8a-4345-9131-0a897ec215ac.json | 132 - .../6402facc-6258-43a4-a0fd-78e21765c504.json | 132 - .../29fbd2e0-e08a-48f4-905e-d2aa54886915.json | 132 - .../313e0379-d3ea-4f5a-8e06-4b0a94317487.json | 132 - .../f326fbd0-5f92-4324-a587-1f08cf7da208.json | 132 - .../d61310e9-5267-4a87-8e24-ae25172cd64e.json | 132 - .../60953e5e-523d-43c0-ad00-f746308030b1.json | 132 - .../5afd8861-d7cb-45cd-af1b-6db966cb56e0.json | 132 - .../c3972df1-4414-4c71-b473-fb9459cf085b.json | 132 - .../b89d54b7-2329-4608-b9f6-07017e63f1cd.json | 132 - .../50389350-af23-41ba-af46-5ffe338ff9d2.json | 132 - .../b8f8f045-2306-43ad-8fa0-6a8bdb494db6.json | 132 - .../7cd59011-75d7-4497-956c-322d5d609c5f.json | 132 - .../1313d865-9c5b-45d2-ad64-629c65f07f2c.json | 132 - .../0efc2583-bf21-4b60-96cc-716928768eb1.json | 132 - .../be0a2737-19a0-4401-998a-a03663467133.json | 132 - .../71720e07-2de0-4402-bdfd-102150c61765.json | 132 - .../38c84c69-5cdb-4f24-820d-4b39c5b118ff.json | 132 - .../de9d274d-f213-4037-9711-3e9d3dbbcc96.json | 132 - .../92381da4-b9d1-43c4-a5c9-59f375017e11.json | 132 - .../44ab6a50-027d-47df-a518-5aa944eb2a61.json | 132 - .../2a1947d7-74e0-43d0-931d-b2862348e90a.json | 132 - .../3677b71c-387d-4182-b15d-c3525bc7bc36.json | 132 - .../6b125a8e-5b53-48ca-8875-926249879f39.json | 132 - .../af851d4b-69d4-49a9-a160-a180146c3963.json | 132 - .../7aa6ce37-c0e4-48ce-b9db-f158ac47d366.json | 132 - .../1bce093e-27c0-41ad-aad6-b656f6773ed5.json | 132 - .../5c6cffab-ef72-4e12-808c-c26ee8ec6999.json | 132 - .../e288a874-f750-4a90-be07-616094c220cf.json | 132 - .../0607da8d-3f4e-468a-91a6-b975261a87c0.json | 132 - .../be2cc2fd-c8e7-4421-b8c8-d3b937272d0d.json | 132 - .../15ffe64e-72fd-4e65-8632-babf137a386d.json | 132 - .../ce1c0d4f-f5a3-49e7-ab77-65ff51bbd0ca.json | 132 - .../b5afab38-13ba-4abd-9d04-a433c41061c5.json | 132 - .../a862c2a5-f66b-4d09-ac57-6cbe565f9f35.json | 132 - .../d8254f6c-8110-44d3-800e-101fc731d779.json | 132 - .../ccbcd5a7-2b98-4d90-ace1-3ad5971a5f18.json | 132 - .../c208b19b-4ecf-4fad-b931-54f65d4b711b.json | 132 - .../debaf4a0-c734-47ea-bea0-2ddc65dc397d.json | 132 - .../0eeb5962-ccc0-407b-92e6-7cf17c00941f.json | 132 - .../4b60e863-482c-4f91-8cd1-6c993d3c5988.json | 132 - .../f5f0bc72-427d-4703-aab1-1bb1bea73895.json | 132 - .../aae7f543-7b5b-435f-a506-e3ab901a8c5a.json | 132 - .../6e6ff4c3-3cfd-4790-80c4-544d9cbe47e2.json | 132 - .../3ee76278-89d4-44fb-a449-717534b00161.json | 132 - .../fa2854d3-9e2f-4f79-ac8c-e1cb5a638745.json | 132 - .../9ddaa721-bf3a-416a-9be8-291188793cc9.json | 132 - .../d659077d-7261-4c69-862c-d61be21662a2.json | 132 - .../e87ba227-c55e-4666-949d-b45913f8336b.json | 132 - .../077f683a-af6f-4a71-b599-b9b269546b7c.json | 132 - .../54808b08-d10d-4a06-ab60-8d99039311b8.json | 132 - .../138e6fdb-7092-4ee6-be82-7bb86c1fc759.json | 132 - .../1b27423f-62cc-4189-a293-5af84ef1f2c8.json | 132 - .../f5468512-d2c7-4486-9d31-bef61225af52.json | 132 - .../0e0ec1a9-76aa-4d7e-9c0e-946d6b000a6a.json | 132 - .../07b87b98-0d61-4479-937f-7447565b4631.json | 132 - .../85b11b91-d686-49e9-8db0-971dd7cafb75.json | 132 - 
.../21bac032-a092-4afa-8d29-ebdefb3a0650.json | 132 - .../29e3a687-429f-4f33-ae5f-48db85127364.json | 132 - .../d98493a6-f237-4565-8508-9e4cc3188d2d.json | 132 - .../2def6fbd-7488-4e9f-a822-2405d4f7a315.json | 132 - .../819143d4-9538-48b9-b7af-128bc15c518a.json | 132 - .../c29d47af-a9de-4edb-acac-6763c0d44ca3.json | 132 - .../22bf3fb7-9235-4a57-b8fd-c85b12047b0e.json | 132 - .../2bea7014-460d-470b-918f-468b58d70fd6.json | 132 - .../3927a5dd-002b-441a-b769-ba68547cd5f3.json | 132 - .../476fc734-dedd-4192-aa59-eb2f9dabf16b.json | 132 - .../817e2fbe-0866-489f-b987-391228a68c53.json | 132 - .../f25f5eb1-ff22-4be3-a639-a9d25207078f.json | 132 - .../f71d1c31-184b-46be-a288-bdc92f0ebe09.json | 132 - .../0d9547b3-7bef-4815-9c44-7d714fe81bbb.json | 132 - .../22dbc5a2-0ff6-4566-9bfd-e5ce314be597.json | 132 - .../afedb249-f1a5-42d6-b6c0-54b2cc303f64.json | 132 - .../61b1bf5e-6aa4-4e90-af2c-dcf5fc9903f2.json | 132 - .../c0adc04c-1e02-4891-a5a1-1fab0ddf18ca.json | 132 - .../cc57e6f0-ab55-4ab9-983c-63d74632d016.json | 132 - .../0d3c5fdb-c4a5-4436-b9d4-f0f42cb4db96.json | 132 - .../a6ec2934-e9fd-481d-8f00-932603bc6e0a.json | 132 - .../e2553c93-60df-4126-9e64-ecd4a5003389.json | 132 - .../e7c2fb42-e82a-4dac-9cc3-a9f41ab54e0f.json | 132 - .../a807ee8c-509e-4b6d-a414-df24444d8a0a.json | 132 - .../2199024b-7944-4950-8335-32a536efad02.json | 132 - .../97919c86-6161-4548-95b9-d44263a29f8a.json | 132 - .../c40c1a46-2e30-4cf1-bcf3-a316a793fbcd.json | 132 - .../c1294268-b5f5-4d64-b91a-147f58a21a47.json | 132 - .../2b029e6d-a0b8-4b6c-b62d-144b8dc4f739.json | 132 - .../b926ca6c-60c9-4353-9671-0453b46d0222.json | 132 - .../44db30b4-2010-4f96-a39e-9ccc8568374f.json | 132 - .../2210d673-d417-46be-aeca-de48cd846e01.json | 132 - .../892d27cc-dfb3-40c7-ae0f-a7cd06784808.json | 132 - .../49b3f293-721d-4d44-9748-88d1ce275050.json | 132 - .../70fb41fe-46af-49e3-8270-5882e12f710f.json | 132 - .../13e2489f-9d96-4f68-8e22-c937604c2145.json | 132 - .../0c386ea0-4706-4a6f-994c-b6ee21dbce92.json | 132 - .../a8d5a193-6c87-4b5b-8ea3-b3ab78e73104.json | 132 - .../4018f4bd-492a-4814-9a7a-1f0c376f2d2e.json | 132 - .../568072cb-118d-41af-bfe8-fa14cb4c7348.json | 132 - .../a6d08766-8c36-41bf-8bbc-acdfdc3f8e23.json | 132 - .../2504fed5-c8a1-4ffc-8ce5-9559aa8c4325.json | 132 - .../359dde31-d9dc-4c22-b829-77df652dcc73.json | 132 - .../34a79823-b993-402a-89a7-538e126ee02a.json | 132 - .../f392c5c3-9bee-4111-9a22-6a1b706fd2ad.json | 132 - .../73bbdd22-4e5f-496b-b39f-290d8e0d2aa4.json | 132 - .../72a66eae-9c94-40e3-b3c9-211303e5cba8.json | 132 - .../ef7390b5-599b-4354-805b-9486e4ce34fa.json | 132 - .../57f964c3-0504-4b60-9539-ce0e369816ea.json | 132 - .../4e6c0336-5d94-4417-a194-92a4d6f38481.json | 132 - .../fe38dea8-92f4-4fb2-afdf-c5932d7c9e27.json | 132 - .../5ced7497-5a05-40d2-80cb-cae63ca62022.json | 132 - .../52a66aaa-193a-48ca-b693-4dcab811eaa3.json | 132 - .../e0e4bcef-cb73-436b-9353-b18ade293e8b.json | 132 - .../1ae45791-7e47-4083-bd72-4530fa26893c.json | 132 - .../b2731f04-a9bd-4e36-a545-85be5b66f5a7.json | 132 - .../ed6de552-d04b-4d51-8456-610e2cb41d85.json | 132 - .../3e08a589-d2b3-487b-900e-85725522a2e4.json | 132 - .../b2717503-d081-40ee-b1ed-fcadaf239049.json | 132 - .../9915eb01-5c45-42b6-82a3-ad782411642f.json | 132 - .../190eb7ca-46db-4e1d-8b71-9bb20af74ede.json | 132 - .../86b9077d-9ec3-411d-84c5-326ba97742c1.json | 132 - .../18bfa50c-20be-4027-8ee7-f6cd1411c882.json | 132 - .../eb1a099a-48c7-412b-b62f-143537c41f06.json | 132 - .../e530a4b7-c2f6-4bad-bab5-2895e950ed63.json | 132 - 
.../52ad7152-feea-46a6-b2d8-20e1a70514ce.json | 132 - .../a61162a6-ef3e-46f4-8aa2-241547fadea2.json | 132 - .../9f208aef-8544-47c8-bb1f-a3841aff208b.json | 132 - .../da237ab6-df39-460f-9efc-e1649e1ac202.json | 132 - .../c81b3193-9d01-4590-8b72-da97aa3c9dc4.json | 132 - .../1a9ffe50-69ae-48bc-b636-89431391eb37.json | 132 - .../b0c67359-1da0-4f55-aa1c-f54f88038bd7.json | 132 - .../c700798b-583a-41be-94dd-382669bb495f.json | 132 - .../3c0b9735-2ef1-4f27-b94a-f246eb57b73c.json | 132 - .../e8c9501b-c985-4b78-a902-a1a030c72e60.json | 132 - .../df978fce-3373-4073-8c44-d6a83df1d9d1.json | 132 - .../e46ee8d9-81af-4259-8fef-3d3113fb6168.json | 132 - .../aa6ab404-89ef-4336-b811-7c8064e26107.json | 132 - .../a14e6c79-4a78-4c02-a7ca-35e783f32be1.json | 132 - .../ba1fb85b-bbc0-46ac-95d7-e61b91f65c2b.json | 132 - .../f6312fc7-c7a8-45dc-a57c-91f56b4ca28a.json | 132 - .../335f5c32-f3f0-4a16-8c9d-8f07b2aae54a.json | 132 - .../b7c7a907-7ecc-4d5b-bc6f-8b8d82954b21.json | 132 - .../112f01a2-f0fb-4257-86bf-61c9a184eb92.json | 132 - .../2d9410d6-7162-4811-bf7d-9de2c2b48fd2.json | 132 - .../16ff8fa3-4676-473c-99ad-908ddb59d8ed.json | 132 - .../9b153ac9-f95b-419b-b7f9-beccd769ddad.json | 132 - .../8a5df3c2-eb71-4e12-b013-fb43685f2916.json | 132 - .../35fa3213-5c08-4b19-ae76-237fdd25444e.json | 132 - .../242ce55f-1471-435e-bcd7-d28b5fc87fc4.json | 132 - .../95f509f2-5e67-404a-968d-f7488d684e32.json | 132 - .../bcbcdfe9-0663-417c-9a29-60906e63db8f.json | 132 - .../d95a7493-2f99-4c10-8067-711c7388af7d.json | 132 - .../789848a0-6d8a-4583-93c3-a72df74d0071.json | 132 - .../14af87df-0fc5-46e1-9d0b-c25c8b6a7ce7.json | 132 - .../379f559f-9bfa-444f-b477-562c25b4c299.json | 132 - .../effb6a3d-c98f-4c3a-be77-902c61cda21b.json | 132 - .../6c1c1405-afa4-412d-ba1f-49dc1cac4509.json | 132 - .../6f4ed7c2-c775-4fd2-8600-4cea523f53e4.json | 132 - .../5fd5206b-186a-43b9-a4f4-07e75aa0293a.json | 132 - .../b707ecbf-0658-4226-803d-53456d16d54b.json | 132 - .../dca1ee57-5e86-4532-a2f3-ac6a619ca576.json | 132 - .../1233476a-7839-4a22-a7ca-1d0f237d8888.json | 132 - .../5c4bdeca-5ef8-4002-8f82-67d49b5ff722.json | 132 - .../18f5fd6c-2b79-4d48-b7e9-18845db16271.json | 132 - .../a9039374-fa5a-4b8b-800f-5f4651cf812d.json | 132 - .../3f9704b4-bf25-40da-b6dc-b927c3569f40.json | 132 - .../a8f858d8-a792-409f-b79d-948a19e2aa87.json | 132 - .../5c34a168-b8cf-436b-a3b7-a2d1feadffb9.json | 132 - .../77092cfe-9820-45e8-94c5-31d27f1daa7c.json | 132 - .../cab8fed8-de68-4fa5-b4fc-d9483fc56571.json | 132 - .../a8103350-b208-4856-8e7b-8ea8918ba0d1.json | 132 - .../e849c03c-c569-4059-8fc5-6a98cf391342.json | 132 - .../f1d8bffa-61fc-47d5-85cf-48cebcb31af5.json | 132 - .../97bdb352-2e9d-4cc5-8b70-55348ef3a217.json | 132 - .../78053a33-24c8-4e9f-8791-f127f21eec1c.json | 132 - .../03082966-87ba-4560-a784-5d8677003500.json | 132 - .../97f26b20-db66-4a30-ba2a-c18a31081271.json | 132 - .../85f9ccda-8c47-4fa1-9d47-e9da4730b077.json | 132 - .../2a57d6f4-643b-4b30-8d67-03032d454887.json | 132 - .../d333f360-c1c3-4916-8480-4a1fc490875a.json | 132 - .../37a41261-a7b0-44b2-916f-770cdfa0ad39.json | 132 - .../c46cd6cc-b56d-44c5-a03c-b49381ba3462.json | 132 - .../612b6226-c25d-42e0-bcd7-be7faa844530.json | 132 - .../2fc7a4d6-88e0-4f11-9110-dc53942870a4.json | 132 - .../34665752-58d8-48ee-81a6-f1a068c23026.json | 132 - .../cc0767b5-4aaa-4418-8f68-72a721323e9c.json | 132 - .../ea507a41-1654-4515-94cc-ce2e38800c61.json | 132 - .../c44e773f-4cca-4780-bdd4-f486e65c18e0.json | 132 - .../f8a46bda-d53b-484e-8832-7939f7d0762d.json | 132 - 
.../c3968a2d-4a9a-4f62-8bea-a3b4b6dcd378.json | 132 - .../da18242c-d6bb-4a0a-a2f9-2e42099f4e8a.json | 132 - .../ac078124-85d9-4715-bf7c-1428b1063732.json | 132 - .../9c1dcd75-8491-4890-ac6f-000868099a3e.json | 132 - .../7850fc57-49c7-4124-b7c6-e1e7bb2bc726.json | 132 - .../8f38374e-f373-4639-9278-24441ebd0325.json | 132 - .../c007938e-3427-4896-8493-1500abdfbd2b.json | 132 - .../df81dc0d-6c72-49e9-862b-02e9b6642cb6.json | 132 - .../46c96d8e-568c-48f8-a74b-9dd4b4195037.json | 132 - .../1f4f7181-8a81-49f4-9e81-925d5d69a37c.json | 132 - .../3ea343b6-93f6-4c61-a164-3db95d13cbdf.json | 132 - .../a9ea8bb5-05fc-4da3-8e00-f53ab8ea6af5.json | 132 - .../0ea74ce5-43c9-43eb-92bc-3d928062d9e0.json | 132 - .../6896faa7-7204-4091-8f4e-9cc0b53d673a.json | 132 - .../88064453-fd8c-4bd9-adf1-39f43972bec1.json | 132 - .../a18ade45-acba-4059-b969-445e529a82e2.json | 132 - .../6c0e4132-71e7-44af-95fc-83b0a6be2a82.json | 132 - .../5d9ab422-4f4f-460d-bd39-51266b43d7e5.json | 132 - .../cda03c45-0782-40cc-a17d-67d808657b83.json | 132 - .../50f5451b-41c4-4ba5-8bee-ee8a2deb7e79.json | 132 - .../cf758994-6e94-434d-bf68-74cca188b5e8.json | 132 - .../611f9549-0788-44e9-8125-18df06cd80d6.json | 132 - .../59cf23ba-027d-4bac-a0e1-526376396b4d.json | 132 - .../1f02bbd3-ddaf-4db6-b7f8-31bad8ffac66.json | 132 - .../1e737e28-d926-43e8-9e4c-e39fa91d7977.json | 132 - .../43ef8eee-5d8a-47e7-ac71-1a898421370a.json | 132 - .../d8d03c71-942f-4aff-8a5e-5c265c639b44.json | 132 - .../96262938-1146-4993-92a1-a2ddb2519f8a.json | 132 - .../292d7cfb-3e3c-47d8-8cca-33507f9ff081.json | 132 - .../3f29c10f-57ef-435b-85df-2cae30ae72fa.json | 132 - .../d7f022fe-86cb-4e4e-a672-62c2dc8cffd3.json | 132 - .../baa35c90-c494-4dff-af28-cb549e40bed8.json | 132 - .../2fdc3186-6791-4550-ac4f-a1a5a5a1d514.json | 132 - .../f687df8b-42b5-4d94-b741-1b516d9221b2.json | 132 - .../c3a8a952-6869-4eee-a59f-4ae33ac72986.json | 132 - .../a7a74117-71e4-49b2-bd65-add82c9165d8.json | 132 - .../04ee694c-0c89-4f25-b10f-315a24743ba2.json | 132 - .../47fd4acb-acc3-4f12-8af5-c425d3754c38.json | 132 - .../e19577f5-d1ba-45ad-8500-d18ae2b14440.json | 132 - .../e86443cd-453b-4ca0-8e7e-054764fe4bb9.json | 132 - .../24cd9977-f3fb-4619-aea1-59e1a36b2a5e.json | 132 - .../1401f0d9-6f4c-41d2-819f-eb9487c5c1e6.json | 132 - .../4b1f2aab-ef92-4231-9bdd-96918b26914c.json | 132 - .../4956e127-14a1-405e-a0e0-76fe94ea727b.json | 132 - .../90fb6e40-88f7-4ce2-ae99-308d87e69718.json | 132 - .../cdad0f08-1c60-4493-bed0-9733894b367a.json | 132 - .../8e83b4f7-736f-4e03-8256-2a1fc421b04f.json | 132 - .../f0d6639d-8485-4bcd-b069-046a747dfbfa.json | 132 - .../d1fe36ba-04f8-4110-8c39-81d393c4cbfc.json | 132 - .../5a8ab5fb-ec1e-490c-b643-e3b9d49f5d34.json | 132 - .../de944f89-d2d4-4b01-b4b5-e7cbd1d8d1ae.json | 132 - .../db96601a-2f7f-438f-915b-55fee0e0d1d1.json | 132 - .../27912f7d-7033-4b7c-b93a-af1673ce4a9b.json | 132 - .../da58a484-4a45-4a70-a651-031ada8023d5.json | 132 - .../e8bd221d-8a89-4e3c-8815-0bff27574053.json | 132 - .../ffc21c2a-59fb-4ad8-88a4-930879b6eba0.json | 132 - .../1e506afa-0d08-45d6-9242-b06104aa67e8.json | 132 - .../7d66bb93-cb2f-4be6-b133-1f0325be58e1.json | 132 - .../936f3c5f-7817-4118-96c8-e4061d4560fb.json | 132 - .../7d36ceed-2a1b-4b20-88ae-0a609cc161e9.json | 132 - .../77cace56-503f-4531-a4eb-0178a68cc283.json | 132 - .../9e49b710-2413-42f3-8943-bc9dbf68cb3c.json | 132 - .../9a5b3564-97df-4661-a171-37322386ac4d.json | 132 - .../0fc0450d-cdf1-44b5-a809-202d1dd6b5e3.json | 132 - .../7f06c78c-f95e-4e50-aa57-da0579adcdae.json | 132 - 
.../06e55e47-9995-4fa2-877a-c728e9f9f1a1.json | 132 - .../39af1e0a-d1e3-4372-bc18-d07f3dff09f0.json | 132 - .../f32d59d6-8ab9-4b7d-ad9d-f62ce6d559bd.json | 132 - .../7ddc3aef-c6c5-4d04-8473-3b3bba219d7f.json | 132 - .../ce80ac07-22d2-4883-ac6c-40b080e00b81.json | 132 - .../cbece170-f872-485f-a6c2-5db17ced73bc.json | 132 - .../c1fd751b-c6c3-4350-9618-f4b4840e1b69.json | 132 - .../bfd28b91-3a72-4417-b52b-804d2cbae12f.json | 132 - .../32c26cbc-3697-47a6-bd12-18187df9dda9.json | 132 - .../02280b9f-bc01-4e44-9d09-1e4ae8c0438b.json | 132 - .../a57d2d49-5ccf-48f5-8035-b1d480c80f40.json | 132 - .../6b5a3c69-f8dd-4952-96fc-b6e4dec1ed9d.json | 132 - .../fe0665dd-b976-4d90-b16b-6c2acfef15ff.json | 132 - .../8c6bdc44-fd29-45e7-b161-2c8e07ef2935.json | 132 - .../e7c70ff9-59ad-4d09-8af0-ef9cf16d1dfa.json | 132 - .../26c4c993-ae49-42a0-be0a-f157be9f7d58.json | 132 - .../19adf124-c120-4e97-80cf-49c40a66eb81.json | 132 - .../66bc5d38-8d25-4934-bce8-41ce4ea0e385.json | 132 - .../541eafe5-807e-44b0-b652-a0752210fc71.json | 132 - .../845a2484-9f17-4c0e-b06b-6250992298bc.json | 132 - .../e62b6b26-5f3c-42c9-9541-bb8b23caee66.json | 132 - .../ec773b66-24fd-4b6f-ac9c-ebcd355e4be7.json | 132 - .../a70b8356-94ce-4f0d-b44a-2215076eed5e.json | 132 - .../b182807d-587e-4702-bf30-dab11983b8db.json | 132 - .../c1f0944a-c44c-42e9-90ba-a847509cbd66.json | 132 - .../64bb8530-7071-402e-ba9b-1d15ecbe275c.json | 132 - .../4f1fc265-f8b7-47e6-a9e6-cfa61b89ad4a.json | 132 - .../1420df5c-690e-4b01-b99c-c21c793689ae.json | 132 - .../aa9d0b0e-cb3f-452e-bc85-f7cf172d2b8b.json | 132 - .../dfabd777-8620-40e3-b19c-a9227f57b638.json | 132 - .../08fe3877-ab04-426a-9e27-72ec4ff8ffc3.json | 132 - .../4b264bb0-bd7e-4b15-9591-50b5a521f100.json | 132 - .../a8cfe336-0c3e-401c-a1e9-d951e64918ec.json | 132 - .../5e66c653-41b1-46de-b677-ffd8426ba5ec.json | 132 - .../9f0f0914-1f7a-468e-8a2e-7ae122fd064d.json | 132 - .../cc64a143-4f1e-42ee-ade1-fafc4b316336.json | 132 - .../cf322e64-2682-4a9a-a48f-c4ec47b852f2.json | 132 - .../30b32261-b24a-49e3-ba57-172dc1d03ba0.json | 132 - .../0681c01d-23f3-4b8b-9516-a5cc41761fc4.json | 132 - .../7693ed8a-f76d-482b-92c1-f11810e522ca.json | 132 - .../f8dc0128-c606-490a-b965-59d5377dd778.json | 132 - .../844547f7-658f-41dd-ab4c-dc0569030e59.json | 132 - .../75c291b5-6d60-4bde-8621-f865196a6ecc.json | 132 - .../36d54b12-594f-47fe-9637-a9b740416c5c.json | 132 - .../57733383-9573-463d-a467-068d2685014c.json | 132 - .../eda1ac9a-98e1-496f-bdeb-1e256b52c14a.json | 132 - .../00b8bfda-c6b1-4e1f-b68c-bff7335e2dff.json | 132 - .../0a3b9ad6-b853-471d-a292-413b30273034.json | 132 - .../d61c3ace-e353-4c0b-9472-c9a1928809cc.json | 132 - .../2293a19a-b650-436d-9448-1b641e63d407.json | 132 - .../c15b977c-c781-4b17-ac9f-25c77602c875.json | 132 - .../42c191be-c0ae-4170-8b6f-565053ae7d9c.json | 132 - .../f5cb910d-6e5b-404a-a751-d5cb90668150.json | 132 - .../de806e4c-dbf8-48cc-a0d8-033a61dfc777.json | 132 - .../59150b73-b05a-451e-ba3f-696d04effe05.json | 132 - .../84926b81-360a-480c-b240-f154ec7fe0ba.json | 132 - .../8e6edb04-302b-4dfc-b38f-94b437c921a8.json | 132 - .../db92c564-1cf9-43db-9e25-1f450c7b1e7f.json | 132 - .../e3796243-cbba-4ec2-ad7c-89547ad24342.json | 132 - .../1479be90-df8f-4e1d-b9db-03e84000187a.json | 132 - .../d2e6c48c-1c18-45a6-ba1a-b335325c980c.json | 132 - .../f843e45a-f66b-4091-a964-75583c2d7fc5.json | 132 - .../cbc3cd41-e187-4c4f-b207-37bceab423a4.json | 132 - .../0f124566-5e94-4233-9a3f-5ff9cfdf160c.json | 132 - .../98fabba8-7d70-4a1f-b03c-37e1a9ac94e8.json | 132 - 
.../91522dad-529b-477c-8372-793f631e14b7.json | 132 - .../cec22734-493c-4d11-ba86-6c7ae2005124.json | 132 - .../704a6e19-0d86-42a5-b8f5-05a5856e9c29.json | 132 - .../bc54349d-59e0-4ae4-94f9-3f5ae98261f4.json | 132 - .../d20d533a-758b-477c-b4eb-073adaed640e.json | 132 - .../f7c9ad0d-3fea-4bec-8ac3-46f01a3449fb.json | 132 - .../9db1f823-e068-4a39-a5cc-b9c588099427.json | 132 - .../23818b45-bf5f-48a2-982f-1e2a0d35aac8.json | 132 - .../de6eda66-b8f5-4b23-89e1-44bbac600953.json | 132 - .../632974c2-57e2-41f9-8c00-671e07e7594b.json | 132 - .../e86dcf4f-6282-4aa6-b645-00f93a2e9077.json | 132 - .../b20be5c9-9720-4076-b587-728549dd19af.json | 132 - .../5e193803-39d1-4f12-8726-ebbe5f71563c.json | 132 - .../61131a6c-f412-42bf-814b-7d711a840d44.json | 132 - .../535e72b1-17e0-40e3-9d66-d31f8ec70413.json | 132 - .../ea15479e-24a8-4924-a754-a8567c511e61.json | 132 - .../5799f285-c61f-43a8-a6a6-053808cf4e8f.json | 132 - .../36feef44-3d3b-4102-8606-ee6420bddcff.json | 132 - .../fd55f19a-2c22-4f29-82e0-15b02f25b9a9.json | 132 - .../18e5decd-c95e-43d2-9ba2-007ba32e216f.json | 132 - .../85a4996e-8c44-4e4f-9478-19a8c5513617.json | 132 - .../db6d57c8-df0b-407e-b937-67c55b513a5f.json | 132 - .../89ac933d-0a7c-40e6-8fa7-35bb6205e44b.json | 132 - .../c79e690f-3e09-4fac-9412-937a3b7ef352.json | 132 - .../ce74b7e3-8505-4c79-a7de-12d1e6b47155.json | 132 - .../3c562d8a-2df9-4d3f-9699-bfaee4a1ce2b.json | 132 - .../152b0cbe-e27b-4438-8326-e67f4e70e600.json | 132 - .../c733c91f-79a9-49e5-9398-3a424ee1940a.json | 132 - .../32d7b6c6-de5c-4864-a446-97dccce378c5.json | 132 - .../7b22d02b-5bfd-4243-9ad9-c858d0af55a6.json | 132 - .../99650529-55d9-42b0-b812-761a30277e5e.json | 132 - .../81abbc2a-791b-4a39-bb46-97edfa14b9c0.json | 132 - .../c658e535-7098-40fc-bea0-f5734d8f4ca9.json | 132 - .../9e0656e9-9b82-4f6d-b00a-c09cf9cbc105.json | 132 - .../07c36058-e0e8-48ea-85f3-0a2cb2fe3443.json | 132 - .../c41d8925-b56b-458e-b1a9-27dbbcaee149.json | 132 - .../9136feb4-5c3e-48b3-bc70-c7816b8b189b.json | 132 - .../c395ef02-9a50-4696-aad2-bcb32ba05f67.json | 132 - .../93f47969-556a-4fd4-b7bb-4d1c861a8d71.json | 132 - .../349ae559-6c1f-4b2f-954c-e83cba1e603a.json | 132 - .../3e43c3f6-645b-4ab3-b684-b23eb67bc5d9.json | 132 - .../500c8cd4-fe4e-44f3-86b7-b0efd387ab92.json | 132 - .../340a3ebb-bc06-404f-84e7-aeccc016fd32.json | 132 - .../a6426f88-d7cc-4e6a-a2b5-76e59a52a6de.json | 132 - .../bdd05c8f-b895-4c91-9a9f-a608a4259cbd.json | 132 - .../0e1e45d4-2747-480d-9b1f-2b200e250271.json | 132 - .../00f3f9ca-ae7d-4e62-9e7e-6bd202dbed59.json | 132 - .../c9e57ab2-c2a4-4935-b976-4bf24647b777.json | 132 - .../c22436a2-ec60-4220-82b3-123618165eb2.json | 132 - .../1f990438-dd84-44d2-99f9-a10035ecd652.json | 132 - .../f4564f5e-3595-466e-8201-0e2a4c50ff0d.json | 132 - .../040def3a-702d-4868-b429-39697ca36207.json | 132 - .../9e24fd65-56ec-4160-b299-b34d702a3231.json | 132 - .../216bf9f8-9521-4311-a40b-8a847271265c.json | 132 - .../45f8c4fb-3591-44df-a4f0-57093b9bae23.json | 132 - .../d17275ef-8a32-4fcb-94f4-fb24299ba50e.json | 132 - .../61b79e7d-0f50-4cfe-825c-ed5b23d943f3.json | 132 - .../113c3507-b738-4b06-ada8-da93b19c6ae2.json | 132 - .../8835d5c1-8350-4d42-a753-82b94dffda3b.json | 132 - .../dc3bbda7-5007-44c7-b1ba-af0c82d100ee.json | 132 - .../0d24ee06-a6b4-4be7-b3ef-c4f53b4fc414.json | 132 - .../f2415b7a-2cd7-4a05-834b-7da992e1da1a.json | 132 - .../01af237f-40d8-4841-a90d-13dce6db8634.json | 132 - .../d69bb392-fd38-4f57-b567-24566896167b.json | 132 - .../63503943-1c1e-4dac-9c41-4933fbb44b70.json | 132 - 
.../80c5d343-41e6-45d7-8921-62586a3cd270.json | 132 - .../2c27d7f6-60fd-49f3-8666-784f2a16031b.json | 132 - .../cbcc1e64-8455-4382-8999-654d1757bbd6.json | 132 - .../1bea4f6b-7a41-4907-baca-430c7ea179e9.json | 132 - .../298ce89b-966c-4f4e-9da5-3803a395188f.json | 132 - .../ea27a4d6-8c32-4b36-873d-1046ae6240e5.json | 132 - .../73d5905d-7825-43ba-8051-7e1f5639b857.json | 132 - .../956b8589-a048-43be-9cfd-05658d3c57ca.json | 132 - .../36f597b4-8f53-4b40-9c0e-c9284743e456.json | 132 - .../7b67e526-7588-4c62-9293-55e77851c4c7.json | 132 - .../8bc96d6d-0cd7-49c4-8112-7d8fb1c45199.json | 132 - .../6751a200-0bd9-498e-a991-ebe22375633d.json | 132 - .../f41442e3-5aa7-4ca4-9e61-a5e13965a3e4.json | 132 - .../b105b62a-ce77-4387-b679-1adf2782b2f4.json | 132 - .../72180fd7-bf34-4758-b02f-7d11859700c7.json | 132 - .../ac5aaa9c-79ab-4082-b8c5-084fba3e122a.json | 132 - .../2d266d7f-8edd-40fd-adfc-597a7742167b.json | 132 - .../484ccbf2-87e2-423f-9de4-a4bd54291b54.json | 132 - .../4de79504-f9e8-4235-9aad-d38f0799e081.json | 132 - .../b4bde9d8-f50c-448c-ada4-5bc05f302c04.json | 132 - .../5da3240b-b5e3-4333-ba61-925343b56043.json | 132 - .../d6727b7d-cdf3-48d5-8e30-484e86ad60b6.json | 132 - .../15b86bbf-8d3b-474b-98f0-abb3972a7271.json | 132 - .../c0b339f6-4a46-46eb-b2d0-945176afe676.json | 132 - .../79367289-6245-4bf0-99e9-42bc3ff7649c.json | 132 - .../c3ec5505-1086-446a-9739-523810e93d13.json | 132 - .../c6c5e462-d373-4536-afc3-b740fb7e300f.json | 132 - .../b7537abe-8177-4206-999f-5bb7e95c72c8.json | 132 - .../eb2f6159-e37e-46db-9419-6a66cb7e539e.json | 132 - .../0b2d0a06-2907-4258-be33-1591e18ac6a2.json | 132 - .../0284d867-45c4-4fe4-883c-8e3ea169d66c.json | 132 - .../1a2da513-104e-4074-b3b7-601ab11bf6d8.json | 132 - .../189db16b-5e78-439f-9f79-6eec979c3a79.json | 132 - .../d751f1c5-5505-4c12-8d51-091538b49949.json | 132 - .../b6f9144f-57a0-4c18-9e52-ffccf2d8ca9c.json | 132 - .../67dc7fb2-1455-4f60-9dcb-59a8197741d7.json | 132 - .../7f4ab590-29fa-473a-b617-00135dd1d6ee.json | 132 - .../d67db62e-e21d-43c8-8b4c-bfa353e47636.json | 132 - .../85abff46-8ae5-4a75-9522-721793224363.json | 132 - .../1736bbd8-4457-4d55-8c0b-0ae6e001ee62.json | 132 - .../4777e427-8d17-4e06-8cbf-0883c95bbfd8.json | 132 - .../4df0b890-d4c5-408e-8994-88f7383e9235.json | 132 - .../76a5a59d-f5fd-4fb0-849e-7db7772b555a.json | 132 - .../6c8399d0-01ce-45cb-a20f-a49e4e760a1e.json | 132 - .../92c2c5ee-dfa2-4db3-8401-887d02cc21dd.json | 132 - .../b40ef568-f277-4d5c-87cd-53feaa71598b.json | 132 - .../893d5149-c535-41c7-8a1a-26bb6b33e407.json | 132 - .../0b649ed5-5af4-4910-b853-2408e3b58f1f.json | 132 - .../5c8edeba-5c65-4168-b67e-02143acbcafb.json | 132 - .../67e657ef-d602-4f58-b898-874a22f4a009.json | 132 - .../53d2bf07-689a-4e69-a534-b288313c8481.json | 132 - .../34d6a184-d4d5-4609-8305-c0e2ee1c585b.json | 132 - .../39b627ab-3e64-42f7-a88d-abe5764fcf4d.json | 132 - .../d8467b15-8a03-4cde-9fc5-5c08bdabb6c6.json | 132 - .../85bc5976-0d40-4416-bbf8-9b1dbf372343.json | 132 - .../8c7e8e64-672e-4c7e-a808-a49f1792d3a8.json | 132 - .../de8651eb-16d1-46ee-a1df-b8c72caaf205.json | 132 - .../6a744db8-814f-4e8e-b6e5-0d096267dfa5.json | 132 - .../028b7c37-770e-4356-a7c6-0cc74650d5fd.json | 132 - .../3b399c64-922a-48ba-9a25-862102749647.json | 132 - .../d5e46a11-3e81-457d-9d26-9fd17f96f076.json | 132 - .../b3abfbc1-911a-43b7-a338-efb25f746f9d.json | 132 - .../6b471ee0-9444-45ff-92cf-da624aa59bf6.json | 132 - .../b56bd924-0a63-4ca2-8f2f-97b581e47a36.json | 132 - .../bfe9098d-7207-4f8c-9a3f-549a29303b5f.json | 132 - 
.../7856172d-ec3e-4e71-befe-54952478e330.json | 132 - .../a68aada5-61bd-4a4c-a8e1-b9a2ace349df.json | 132 - .../9d19c44f-4912-4c95-ab3f-2dddb055d932.json | 132 - .../6cef3550-27d7-4073-b4bb-0f19a2c5f553.json | 132 - .../08ab8f6a-9aaf-4ab4-ada3-eb4a75f46995.json | 132 - .../622f9379-6a30-43ba-a7a8-fbd08c484fa5.json | 132 - .../24f728e6-de5e-44cc-8b6d-51e0065c1475.json | 132 - .../c3b2bf18-d355-40fc-a862-376c1b988305.json | 132 - .../79474be5-2587-4087-a2cc-1337e3b696dd.json | 132 - .../22ff2700-70c0-459e-96a2-0ce1710947bc.json | 132 - .../7d3a47a3-83d3-4f51-ab72-6a2fa5b5ef80.json | 132 - .../69dc0f8e-16d7-4907-9741-484eafa62b8c.json | 132 - .../e516abc1-9c3c-4921-a385-e2533d45fed3.json | 132 - .../8baa5832-cc07-4a31-a815-0e8151426ea6.json | 132 - .../509fbca4-f405-4c27-85a9-1eea59025070.json | 132 - .../6f45ed56-6bec-4439-9adb-e79fcd74667c.json | 132 - .../512ff924-c1d3-4d75-a468-2bcdcda25cf6.json | 132 - .../86b561ae-c4d3-4293-a884-bcab26df026d.json | 132 - .../516d1972-9731-4234-a4b3-b96423ebba5c.json | 132 - .../274f6e02-c81f-4f2e-9747-e5de5cee1933.json | 132 - .../61638b55-296b-40fd-a39f-cc2276d9f94a.json | 132 - .../11c1b6fe-4815-415b-a4a8-d14073df6ee1.json | 132 - .../88e2cb24-288e-4f37-8753-f0daa825051c.json | 132 - .../8a1a6c44-17fd-402e-a22e-e795a1f612e3.json | 132 - .../1121af0b-61fe-424a-bc66-3164bcb1d833.json | 132 - .../35300d67-7ee1-4874-b351-87f46267cec9.json | 132 - .../6180b7b3-4b21-42aa-a62d-084a91568b43.json | 132 - .../7414d344-0e67-424a-9e16-00de0487ce02.json | 132 - .../f5fcd407-080c-4cb7-a299-7a7f919c734d.json | 132 - .../efe03731-6021-4dcf-b7fe-24cbf2d60fac.json | 132 - .../6ffed624-cc22-4b62-a447-3c02b0e43ded.json | 132 - .../ed867fa8-be8a-49b0-8c94-38085808b58b.json | 132 - .../c8b9a56b-0933-4085-8d5f-a1d8294699db.json | 132 - .../9b178661-ed9a-427d-b93c-b905b8089ad8.json | 132 - .../69588e07-7559-49c2-9423-19fd143e42f7.json | 132 - .../317589da-d673-4f90-93e9-59983f2ef54b.json | 132 - .../efab322e-ea15-4fe7-9bfc-15246003e59c.json | 132 - .../b1eac68e-b292-414b-9594-c921f8e10818.json | 132 - .../b7d08c65-8219-4067-9504-99e438a86038.json | 132 - .../e9c5b479-0dce-4de3-84d6-90c7515337f1.json | 132 - .../3c766465-29db-4b3d-b42f-a3222b38a096.json | 132 - .../e6c85677-61ed-475b-85a5-48b91ec76bcf.json | 132 - .../7b68fa5e-dbbf-4542-8767-6874aabf8f40.json | 132 - .../c103b7f4-a432-42d6-86ef-cb369e0c16ff.json | 132 - .../643dda41-37d0-4c1e-b856-58b774612886.json | 132 - .../ba2f284b-d7c6-4748-a8dc-4f80caa30c6c.json | 132 - .../16e30aa0-736a-4ef8-8ba6-78285b84546f.json | 132 - .../73eb729d-adfd-4dee-9bde-04a31f5528f6.json | 132 - .../0daad2ae-92d0-4522-a067-20332f72c96f.json | 132 - .../a3e3849f-a289-4132-b4a8-f67d67ad46a1.json | 132 - .../59a9ed26-a67a-4e76-8858-520400c90766.json | 132 - .../6c5c61b4-8037-4b28-8616-1aefa7963eb8.json | 132 - .../e9f9b836-fbdf-4996-9b35-2c8145a7f01b.json | 132 - .../5b3dae43-5d5c-4d19-bd47-5c0f68ecbb81.json | 132 - .../d5b31b1f-ace0-457f-bf8a-9041398b8344.json | 132 - .../b34702cf-ffb8-4e75-9c9b-f5c52623d4c8.json | 132 - .../c701f1fd-166d-416b-8f78-edf17f2fecd4.json | 132 - .../4217b403-e924-4f67-9b0e-ad1d4ed293a1.json | 132 - .../03816e41-5fb8-4815-ab9c-4108ab19a3bc.json | 132 - .../a763b10e-350a-4342-ade3-b782437ca3e2.json | 132 - .../9e806fd2-edbf-40e2-a008-834cee537bb6.json | 132 - .../fbcf861c-62db-4079-bba6-becd4e231216.json | 132 - .../22b591c0-3386-4bd5-860c-20c0c6001986.json | 132 - .../dfb9a9c4-114e-4188-9940-4d6df7e4815f.json | 132 - .../38fd5f4d-0f3c-4dc2-b250-a9ee7090aac2.json | 132 - 
.../e53cbc94-fc9f-4d53-ae28-26bc8c2caef8.json | 132 - .../2165e69a-c50c-419a-932e-909f53b73b71.json | 132 - .../46430a07-15c8-4727-9102-2f471d4f1d3c.json | 132 - .../3c7f540a-c850-4e20-ad93-60e021d17133.json | 132 - .../c3ab4f38-6f7b-4589-ae4f-21ace05b8c44.json | 132 - .../2708c0d6-03e7-4a17-b6b9-e16f3ddcf5bb.json | 132 - .../6427a5ef-8508-430d-970d-054fc485e754.json | 132 - .../08984ad9-1e9b-4916-b214-af26dadfcc0b.json | 132 - .../1dbb5d03-fdfa-4059-9d50-d037ada6b1ac.json | 132 - .../6bf42faa-c3e9-4069-bf93-ffd626062f0f.json | 132 - .../9feccbdc-18eb-4077-b50b-986db0047fc8.json | 132 - .../a074c33f-782a-409c-987b-7dd62c65ccc7.json | 132 - .../2f2c0dea-dcd4-4e54-9f40-9fda4b91bd40.json | 132 - .../84481fee-3727-427b-912a-30e2744df28a.json | 132 - .../aaa801dc-1a47-4009-9ad4-7129a8d4e651.json | 132 - .../3ac92cbf-c85b-4e00-9ef9-4322f961591a.json | 132 - .../162b511b-4684-4595-9261-a33f3a4117f9.json | 132 - .../20d5d59a-028d-4e34-9414-d9edaf2e59b8.json | 132 - .../a21b53fb-783b-440b-9f3d-d8ada3bd18ea.json | 132 - .../0d2ab1e8-a2d7-45cf-b123-67bcab2d9dff.json | 132 - .../6b4a37c8-c7e6-4156-9d6d-8cba51b74d82.json | 132 - .../78582fec-2f69-4b37-8497-12ceb097b44b.json | 132 - .../949bf65e-c2ae-4701-82f0-39d0c62a0e87.json | 132 - .../8812151c-4301-4131-a414-d64d025e476e.json | 132 - .../2db1542f-a8da-4fb8-91a5-6dd1a942b55e.json | 132 - .../9feeffb2-3763-4e43-933e-89100b76f7fa.json | 132 - .../721102b5-ed5e-4631-8600-a6adfff0c784.json | 132 - .../18c185f7-5ca4-46ff-81c2-6c538f096409.json | 132 - .../7ab5911c-e229-43e5-a798-095287d0a597.json | 132 - .../f800c4e5-e918-45bb-8a12-3ca2a64c6b23.json | 132 - .../5fcf41bc-30dc-46a7-9cf2-4ce2c7a5850c.json | 132 - .../d4b20ef4-734e-40a7-818e-f77e170d7437.json | 132 - .../e0996c96-c9e5-4d39-8e6d-1455ef1f9544.json | 132 - .../3ad2b31e-ce2a-4cb4-9b85-79cdebd5d364.json | 132 - .../9aff874c-1953-4b97-9bff-9e6120b0bfa7.json | 132 - .../45ae7f45-8c36-46c6-989d-bc672cdf8eff.json | 132 - .../7d36e44e-a329-4b96-a891-365ad900f718.json | 132 - .../a8c26325-1eec-43a6-a8ad-3bcb2e378924.json | 132 - .../bde1a879-6852-42ce-9217-f427af85a46a.json | 132 - .../dd7a0377-f4d6-4390-b9f2-bf50b05ec0f7.json | 132 - .../12cbf241-d6d4-4d25-ad3d-13a42d7adc74.json | 132 - .../1f66fd7c-40ee-4249-8963-5c7bb93a3eaf.json | 132 - .../7076406b-7e0a-49c7-8150-2e6a243aa23b.json | 132 - .../96c3fd80-a601-4629-a1ab-bf7f366a909a.json | 132 - .../1302c9a5-d35c-400c-b9f3-d990243e5d59.json | 132 - .../c7f48bbf-6583-4ddd-ae4d-671c43218dae.json | 132 - .../5f07e092-2eb0-44c2-b2ce-5f1b31a9ea99.json | 132 - .../15701682-97ce-46cf-8010-a6bdeaf8c7aa.json | 132 - .../c6eecf0b-fa16-484a-8eeb-d196203b3c3e.json | 132 - .../4337b1c1-cc00-4a15-8148-e8d0739561b9.json | 132 - .../1151ee14-8fe9-4f97-808d-8103b353c2ec.json | 132 - .../a2c18179-aca3-422c-b9f5-8345109cea13.json | 132 - .../07495d34-1505-45a9-bb48-887af0da8a0c.json | 132 - .../567baf6d-99f9-46a5-8c40-c6899986f1ff.json | 132 - .../a337df3a-28ff-46c9-adae-4bc029937101.json | 132 - .../b201a849-44e9-4598-918b-ffa27c894ee9.json | 132 - .../dd87ebf3-3088-43b1-851c-a97d12a68ea8.json | 132 - .../1b3ef805-8b0c-44bf-b048-773a0dd94d0d.json | 132 - .../220cb478-58c0-4028-b51a-ec5fe1050746.json | 132 - .../17cb8ab1-e7ba-4daf-95d4-2cdbd2777434.json | 132 - .../2b55023b-b8bc-42a2-aca8-dcaf39890232.json | 132 - .../31736569-5992-4b1d-9d66-27a6c1620506.json | 132 - .../630b37b5-351c-403c-ac76-ccb68ffc5d53.json | 132 - .../69cdef01-30dc-4f75-97fa-9daeebcec72f.json | 132 - .../9aa1acb0-c791-4dea-aa1e-c912cea69466.json | 132 - 
.../0c1d66f3-8fd7-47f2-8538-a1aa8985aebf.json | 132 - .../2872dcd9-421b-4346-812c-b27bb32c6e86.json | 132 - .../2f3e2fc0-f1e0-43cb-8a8c-6aadcc538646.json | 132 - .../d0a76497-84b0-45b9-b748-04ffe9bc13a3.json | 132 - .../185b6560-6790-417f-aeba-f7405fee808a.json | 132 - .../30a8074e-df03-4866-9b8d-a5a7eece3c71.json | 132 - .../ac8874ae-d6d6-45d3-aabc-06a3852f68d0.json | 132 - .../bc98b048-18d4-438e-80c4-0cd851798da5.json | 132 - .../c88c011f-0a24-4e78-a104-035d25af2430.json | 132 - .../f9e3c31c-02c0-4f5e-ad4f-3be0801a0f41.json | 132 - .../5484405a-2ec8-4515-af75-76a5dd348d3d.json | 132 - .../7dc117b9-c2a2-44c1-8471-f3bc8a116e3e.json | 132 - .../e2d314dd-b5b3-49b5-8e64-1e3464f4b963.json | 132 - .../7ecb453b-1ba7-44ec-abfd-1f8be4c817fd.json | 132 - .../d0a70e95-fc72-41c6-ac42-09b8f379b566.json | 132 - .../e2ef8ea6-b464-445e-81df-ef0779c1d0d4.json | 132 - .../f3d7cca2-141c-4b84-abc4-396ad2d59e3c.json | 132 - .../e3f48d7a-c8a3-4e75-99d6-7f2946696b12.json | 132 - .../3feb9449-49a2-427f-a317-c21e6d1ca66c.json | 132 - .../6359e37e-0405-436b-903c-8f0e740dd6c7.json | 132 - .../f5daed76-f6e5-4a7d-84d7-80537a046b83.json | 132 - .../03af2b1d-989f-4afc-ab13-8793093b9c50.json | 132 - .../5db7ec54-7feb-4c11-b2e0-042226ba1f94.json | 132 - .../f1f5615d-8a78-43c9-b5c6-edc180252381.json | 132 - .../9c89bf8f-4b8a-4c01-8685-fafc687c673e.json | 132 - .../58b69c0f-826d-414f-915e-dd0b78d9298c.json | 132 - .../101ea548-2ffe-4f47-b3b5-5fbe9a3854b4.json | 132 - .../259c4798-ff03-4f58-8fb4-59150710212b.json | 132 - .../f731caa1-f777-494a-8490-da0c815f0708.json | 132 - .../d4d25d38-b21a-490e-9ca9-556504ec00ea.json | 132 - .../75bb85a3-40bb-4630-95a0-50e40b008412.json | 132 - .../bb44f3ef-eefa-48ef-a257-2eb345c89a00.json | 132 - .../2dcf1771-3dbe-43ad-974c-54e2e2860bcc.json | 132 - .../caa0c8df-5488-4bf9-a5b8-0fff831e6732.json | 132 - .../c6f8e581-e849-4e28-b3a6-1838ee522770.json | 132 - .../f0c361a1-a3ac-4415-ab5d-069bdf27e7a3.json | 132 - .../44129be7-f73d-4580-8375-e8ef324e73a8.json | 132 - .../2925ecde-a9a5-4369-b391-d23a8605d35c.json | 132 - .../8409e464-fd16-4b41-b533-2f6cae4fe894.json | 132 - .../86f6c6eb-8b08-4e6c-a1bc-0d941a00f10b.json | 132 - .../aa2e6df7-a0b0-42f7-8057-e2763fc34834.json | 132 - .../2bf9a06e-f3bf-4b55-804b-e553a722e0de.json | 132 - .../b380a675-39ea-4950-ad0a-d9771f09ddde.json | 132 - .../482358eb-7d3b-4de0-b5d9-451308f104e2.json | 132 - .../ef04a83d-7b89-43ec-ba33-30e1006422dc.json | 132 - .../7b64cf2e-c7c6-4b48-8e51-ea2aa0914145.json | 132 - .../52c8e3f4-1063-4d9c-80d9-fdd0a72fc98e.json | 132 - .../1f4a827d-31cd-42e6-871d-7c0cad010f58.json | 132 - .../56d6d99c-fba1-42e7-aad4-631370b44da3.json | 132 - .../006a0ac7-d6c3-42c1-b0cc-6a0bfe74f884.json | 132 - .../33a82686-6202-4a4d-ba34-bd4537105e5f.json | 132 - .../38d45554-44bd-4b40-b7c9-c0b7ba44b862.json | 132 - .../37d7e3ab-db9c-4ad7-81d1-933c030a6250.json | 132 - .../9cc49b3c-4e51-4f67-92ea-4ac8a3cbed43.json | 132 - .../b6bd8515-4c95-40ce-b2d5-af8873d261ab.json | 132 - .../d102e75d-3e20-482b-a243-bae3ec44e2bb.json | 132 - .../68920da1-af71-4ccd-88b9-554e3c72c4dc.json | 132 - .../c0eb144f-c726-4a80-bce9-384fb7a641a7.json | 132 - .../0b26f82d-36f6-4fd0-a0fd-05e4a1368a6e.json | 132 - .../8fe4360a-0924-4386-b4cd-89069f7ff55f.json | 132 - .../eeeb082b-7112-4a08-a87a-b2c9ae37efff.json | 132 - .../b8f933e9-867f-4934-9648-371d1e632116.json | 132 - .../8d225023-4b7e-48cd-ae67-6d00b541f17d.json | 132 - .../ee3b45e7-a5d6-4fa8-8abd-f6a77d5a6d5b.json | 132 - .../177ef040-da5c-4a65-adac-efdc555bd110.json | 132 - 
.../e9dc8337-eb35-4eb9-bca7-30ec1cd44092.json | 132 - .../f4549a39-0b28-4e06-998a-774f5f02cfba.json | 132 - .../a79af78a-adab-406f-995a-adb3893e1510.json | 132 - .../4e8e457a-85eb-4afb-a9fe-8f8ce6eaf4d7.json | 132 - .../eeb3a10a-d584-414a-90de-e018c47615c2.json | 132 - .../e83dadb0-5092-48b8-b408-e6bb1ac8a0ba.json | 132 - .../cebc7767-fbc9-45a2-808b-51e1a4f0f35c.json | 132 - .../b64b6416-b18b-47cc-a516-c613cd670b37.json | 132 - .../64e96d56-72a9-413f-8903-45821b98f71e.json | 132 - .../a3f44cfd-d1fc-4a3c-aa5b-a0f37fc4a192.json | 132 - .../79314f48-d92b-4992-b3c6-d31278c0867a.json | 132 - .../5a007612-c8e7-4f6b-baa9-a21af7e908c6.json | 132 - .../fdefdd3e-2d83-4430-bd95-e16a1935dff1.json | 132 - .../ffdd45bf-3409-4b92-909a-25a32ba27f82.json | 132 - .../a78ab8ac-2c2e-405a-95ee-0d1d27cf533b.json | 132 - .../d9d49bf7-f6f0-4c25-9182-d815454940e3.json | 132 - .../deb48e93-0378-482f-8a5d-7ec350497e0b.json | 132 - .../302a9a47-8603-42d9-85fb-64c60e7c6f44.json | 132 - .../28d52801-3998-421f-a37a-2b7b677d0eaa.json | 132 - .../32b4e23b-9430-45a8-bfa2-eea2e89792c4.json | 132 - .../0336e168-e313-44cb-a030-42e6d20e92df.json | 132 - .../11bd8b5b-2ea4-4ec5-8fe6-654aedb40fc9.json | 132 - .../6d97749c-3bfa-4c32-b581-a5e2b73303f3.json | 132 - .../ec58907d-b67c-467e-a3dd-b9f9c10138f0.json | 132 - .../a7f09a3d-025c-48fa-9358-863b9ae382b1.json | 132 - .../bf2be2d5-58de-4550-b733-a5910bded48d.json | 132 - .../52b32c1f-6189-4850-b3f4-de442eb2ccb5.json | 132 - .../87b44160-c3dd-452d-8c15-c4f758f8db7b.json | 132 - .../3e6814d3-54ea-493f-a9fc-85ae9eed1b05.json | 132 - .../35b7ff42-3825-4240-97bf-f8af7e8c23ff.json | 132 - .../c108173e-1582-4c99-9291-46986d7ba1cf.json | 132 - .../6feb08b0-1c67-4fe2-a001-0b3b84529687.json | 132 - .../d4ab3df2-109a-4eec-9742-dc3bb79d5a58.json | 132 - .../53ec995e-bcfd-4a72-bd9a-45d14da3f219.json | 132 - .../299a0397-89c7-4329-9599-9fc29a52db87.json | 132 - .../41adbc32-6cdf-49ba-980c-6eb6f722b40b.json | 132 - .../4236ece5-f2b2-44e7-9503-9731bff20155.json | 132 - .../b33d672c-4a96-4093-bc13-25c42303b918.json | 132 - .../2b4f42fc-8b25-481c-98f7-911c52fdd242.json | 132 - .../634b7a64-2bd3-48b8-b2f4-a93189801850.json | 132 - .../72a4bcc3-9dfc-4268-be4e-cda5837a3da2.json | 132 - .../78fa85f6-baff-4d95-ad3a-a0663f51b0a0.json | 132 - .../359231a5-6eb9-4f73-a6f1-d7fd7f35c7ed.json | 132 - .../79b81e37-f75e-4b18-b145-73c42625ced5.json | 132 - .../2d99af7a-f67c-4e74-9ba2-f1401dfdf9fb.json | 132 - .../315fa815-fab0-47c9-8185-00bc597c0176.json | 132 - .../0c1686db-b396-4ecf-86f1-e4e092491acd.json | 132 - .../57455fbc-b5a9-4a3b-9a30-7da0593fd778.json | 132 - .../a8f9d0e6-5a1a-4d09-ac78-47fd586384df.json | 132 - .../9d0d4eee-0b87-485c-843f-e32d08aa601b.json | 132 - .../e47c83ff-9a16-488b-8ccf-4a2fad2b14fc.json | 132 - .../8c7e25df-884d-4940-8185-4c1b82fac8c5.json | 132 - .../83611d50-01d0-4642-a104-daf77f1a0fe8.json | 132 - .../5cbdafba-6071-4da1-8b19-3de612e9ff18.json | 132 - .../1c934cba-c94a-4aad-9645-84658e0b5588.json | 132 - .../7aad3f6b-89d9-4c9e-9339-cf4111fc37c6.json | 132 - .../38d4a8ca-4273-4e6a-8a39-3b5ff20ec461.json | 132 - .../3d65fbc2-bf91-479c-a687-e9ef702794fb.json | 132 - .../650cdbbb-e066-4581-8d61-77aa6a4c402c.json | 132 - .../05d566c5-1810-483c-8ce0-84635b9457dc.json | 132 - .../37e3456a-92ff-4122-a697-ffbdc1c79555.json | 132 - .../70c908d4-f1bf-4553-9bf7-95eb593b4853.json | 132 - .../2ccc9c20-5414-4286-abcd-ad2b20f8652d.json | 132 - .../50f4560a-e172-42b9-b552-437aff158a38.json | 132 - .../c6a3abac-8a34-4725-915b-c27c3d0bc484.json | 132 - 
.../a8ed68ea-6463-4ff9-9dcd-034080272dec.json | 132 - .../5799ce8b-c00d-49f6-96dc-f7dd057a268c.json | 132 - .../0d261023-3e35-4160-98ca-241bbaee927e.json | 132 - .../f0454d3b-18b4-488a-94dd-fb24729996c7.json | 132 - .../6bafa7a7-3a2a-4141-9564-a762d1cdb1d0.json | 132 - .../37f20f86-40ba-4f63-b29d-efff6cb0e09b.json | 132 - .../bf0e7ce4-09e9-4879-993a-eb50b2a421d7.json | 132 - .../bcbc29f7-ea03-4dbe-a83e-d4940b2c6bea.json | 132 - .../cbea8d66-0370-4998-8e3a-06fef0a60f0c.json | 132 - .../ca48b670-b82e-46cc-beb9-2fd0f11d3585.json | 132 - .../d37f99f7-f9c3-48b6-84d3-7da5d77f5030.json | 132 - .../503c8a24-4ced-4dca-b9df-5733ce89c2ca.json | 132 - .../5c5283a0-819f-4112-bb90-5277423d9c00.json | 132 - .../b636bc82-1625-49b1-beec-cadaf4e1b1a9.json | 132 - .../00f481c1-0ef0-40bd-bd95-81dc9443a62c.json | 132 - .../7ea22fef-2d79-49ae-bf72-9153a4e239c5.json | 132 - .../64f441df-1781-4d01-b73b-2156413ad403.json | 132 - .../4e3676eb-8607-416e-986a-7098bc192820.json | 132 - .../2101369c-5042-48f3-a8f2-f9f56e7b6ae7.json | 132 - .../c4b86264-3725-4742-91f0-3e01f8d965a4.json | 132 - .../0308147c-dabb-46bb-8add-d332fcd5a800.json | 132 - .../a9977a0d-e199-488a-a26e-6269806fdb2b.json | 132 - .../56b89ec8-90c5-4e1e-a458-1bb8b5b92be8.json | 132 - .../4185c376-91c6-435d-ae3b-47cd85151049.json | 132 - .../26e45f5d-1e3d-425f-ba4d-b444dcda7f74.json | 132 - .../09be48ce-61f8-4ba9-b082-b9c475fa714d.json | 132 - .../27417bcb-fb2f-41d2-9dfa-9865a36f38d5.json | 132 - .../7b6fc3c2-a67d-450e-858c-fa87be122376.json | 132 - .../76b86418-5450-48c6-ae56-58a19016d055.json | 132 - .../e06594e4-899a-4285-b130-f7b605e5a6b9.json | 132 - .../9efdc773-a5c7-4709-88c8-96a67d84a742.json | 132 - .../1fcc2f96-afc9-403f-b82e-8e1804506582.json | 132 - .../bee1e134-9a43-441a-b977-522c510dd1ce.json | 132 - .../b70e1089-d136-4b2f-a253-f361bcf8cdcc.json | 132 - .../8b7e9c34-a982-4f4d-b5dc-66a12578601f.json | 132 - .../0ccc36d0-f546-46d1-91d3-15a40c7bf6c1.json | 132 - .../066abe97-2c6c-4f3b-9e5e-e144f130258a.json | 132 - .../a3af8f77-d915-4482-a2b6-c99744aada4b.json | 132 - .../82cc8b37-e242-441e-ac74-1662bcc0a0e2.json | 132 - .../1527c8bc-c1ec-45f4-9663-4cffbb808f94.json | 132 - .../337b8ce8-d697-47f6-94ac-7a420dd7d91b.json | 132 - .../3d6ed2bb-5be7-4838-abb7-49754f9c3bfe.json | 132 - .../0a6c7056-1bce-479e-84b0-f4eeea0bd3cc.json | 132 - .../3e236ad8-3828-407f-9076-743b465b8d15.json | 132 - .../9e90dcdf-ce2a-4a7c-8b89-6af8b7c2bcfe.json | 132 - .../940d88e9-085b-4065-b8c8-92ebe685deb0.json | 132 - .../7fdcd616-2c72-4c44-9646-9c32344bfa0b.json | 132 - .../9d358f55-810c-4ac1-adc7-83f95bd74c11.json | 132 - .../9ba3fe31-772a-4cf7-aa13-3680b6ad51ba.json | 132 - .../651a32b1-77fb-4acf-89bf-2d45b684944d.json | 132 - .../192c4037-753a-4790-80d0-33c4d277102d.json | 132 - .../679d66bf-244e-4080-9a42-0a0c6cfdc965.json | 132 - .../73b0ca8a-fb16-43eb-a9af-a01219cf6196.json | 132 - .../7f00ecbc-fcc8-43ae-867b-cb160e63a80c.json | 132 - .../a8238bd4-3982-4e45-92e4-bab77e528e29.json | 132 - .../f87f9f08-e989-4e99-a254-a3650e7ab1b6.json | 132 - .../f40496a9-fb14-4b2d-8070-84f55e6417f6.json | 132 - .../cc52f59d-5669-44b0-b1af-e6fd0836e284.json | 132 - .../67525a37-f658-40e8-89a1-de8bf6275a00.json | 132 - .../3cb34886-7a93-42b9-a8fa-fab5f4bd8624.json | 132 - .../0dd1f9fc-cf54-47ff-8ccd-148b45f3c921.json | 132 - .../7a05616e-7335-419a-914d-00fb287fe663.json | 132 - .../070a21b5-4cd3-41b7-9653-0d2d2e4f273d.json | 132 - .../5afc044a-3138-443f-89cf-74f1272cc632.json | 132 - .../a6c1d914-647c-46b7-b0e1-712b8d506780.json | 132 - 
.../43f35eac-0946-42f9-a128-eb8011c29588.json | 132 - .../04c22be7-2cf4-4774-b479-863199c7c3a4.json | 132 - .../fc3d436b-ec61-4458-a3c6-1df41057ea70.json | 132 - .../e3ed157f-f306-40fb-b3a1-d3434236759e.json | 132 - .../8793b3e3-f409-499a-81f8-c250c8092841.json | 132 - .../33572f63-15ba-4fbc-b1cf-56b978384d02.json | 132 - .../44c636ba-8303-4d75-bcb5-46e3c07a991a.json | 132 - .../0a002444-3e5a-4fc8-acc6-72210a4181a9.json | 132 - .../bbf936a5-3594-4d0a-b5af-7a01740d0c81.json | 132 - .../1164abea-4cc2-46a7-a44b-f024a2ce40b4.json | 132 - .../bfd88bec-fcc2-4580-a5c7-4792a0300a5b.json | 132 - .../7f49e582-a01f-481f-8345-1c384fc8b567.json | 132 - .../10937ed1-56e2-4aad-b717-5125bc8ac72a.json | 132 - .../f4622539-c0ac-4e9f-86d4-00e3c826d03b.json | 132 - .../6b13b2b1-68cd-4aae-8f2b-2400f40760d7.json | 132 - .../5b02726c-ba3f-482b-9f10-87b8d69ffeb4.json | 132 - .../21d6f2dd-7bd6-42a9-b14e-c25777497890.json | 132 - .../d0bc11cb-56ff-4c77-9446-e76e550e0919.json | 132 - .../ff78dc97-e9cf-4215-a607-3e80892af82c.json | 132 - .../0ff1c6ff-5404-4d61-b6c6-f6ef7ae9ca8b.json | 132 - .../48837141-2556-4658-87e0-bb88cfcd562a.json | 132 - .../f2d6da5d-3685-43de-8ceb-5b798f88e24c.json | 132 - .../9ec02ccd-329a-4d62-9f04-87de6fda5011.json | 132 - .../781d0332-e332-4ff7-8585-9c2d8395a147.json | 132 - .../d6dd460e-c352-4d31-8941-183c6eabd0a7.json | 132 - .../66bf6442-04ea-437b-88c4-e61afc6f7139.json | 132 - .../0d1911f5-a2e7-4511-a8d8-098cbf9207df.json | 132 - .../abc18648-ef96-4695-94d5-fa14be277431.json | 132 - .../ff1e7aaa-3f29-4192-a0e0-80fcd11ba055.json | 132 - .../cc8ef5bd-957f-4308-9539-00a696182056.json | 132 - .../abc7652f-b88e-40ba-847c-c99dce9f2719.json | 132 - .../56e36294-e616-45a1-8dc9-2c14cf3ee8d0.json | 132 - .../4b81caad-92ed-4bd5-98bd-58582854b5d8.json | 132 - .../2cef0040-6d4c-4c38-be40-5477911f3063.json | 132 - .../4aeef94f-823e-4be5-b4f1-37463e052748.json | 132 - .../3d367147-373f-4543-be19-55a6429558a2.json | 132 - .../cb93091a-6c46-438a-b111-cbf7e2fac420.json | 132 - .../ea6048f1-8be4-4ec8-a5d5-35ff1523d74a.json | 132 - .../f4dc1659-800f-49d2-a290-48e9d4b15581.json | 132 - .../d4d8a784-5bd5-4437-8e0d-75dcb967ae33.json | 132 - .../91017e73-f33a-49f5-ac87-f6e6a178d885.json | 132 - .../b7a75bca-6afe-448a-8e5c-53ebd577c964.json | 132 - .../8cdced5c-23bc-4426-a0c9-b9bf82913683.json | 132 - .../368784c8-6fc2-4340-8277-a6a9a9800a99.json | 132 - .../f7ddf26b-4b4c-404b-b9d3-6ceaf78d39aa.json | 132 - .../f423b0d1-3536-4865-9615-f89b9d15b14c.json | 132 - .../c7e8333d-1d79-4cfa-9833-fa42f9fcbb4b.json | 132 - .../b6149d15-3e0f-43d2-ae90-eca290a94edb.json | 132 - .../e21f5d83-6b71-488d-ad55-d23268fbd611.json | 132 - .../68e1a42e-4318-4b5a-a45b-2607b7c2fe05.json | 132 - .../12a03ffb-d66b-4d00-a43b-fd5be80e1b07.json | 132 - .../adbad8dc-7d13-44cc-a5c6-e8da1de27c37.json | 132 - .../7fb595e5-abbc-43ff-8135-c4bb4a2ea593.json | 132 - .../1bb09da7-1675-4e57-b46a-9791c888ce6f.json | 132 - .../3ed7dd5a-e431-480a-91a7-5ccd915057e4.json | 132 - .../9cab35b6-d6a7-475e-b715-e4493d07cd92.json | 132 - .../ef7149ae-8d50-4890-89ae-fb561a86d130.json | 132 - .../3fa14e1f-82a5-4c04-9c76-2a3f6d56aa81.json | 132 - .../4418c7d1-72da-4ed3-9d5c-9d8520f6641c.json | 132 - .../8fe13380-a045-4d63-96f8-ec977540478c.json | 132 - .../6da42427-c7de-4830-b368-ca7757ee1d51.json | 132 - .../5faf24b3-38af-4f3f-8377-bba70d75f8df.json | 132 - .../9a26214c-2601-49be-b1b1-03796b704059.json | 132 - .../fa71ed09-45d4-4a5b-bfb1-a61a359a8f0c.json | 132 - .../25c5b304-46d3-4df3-9ac3-75ffa972849a.json | 132 - 
.../88ed0272-39f8-4676-970a-525aee058991.json | 132 - .../d8eff5d0-061b-4b83-b96a-04f9ba47ea6c.json | 132 - .../dcb90e75-8709-4729-8c00-e756e6a9a49d.json | 132 - .../81dcf3ca-f5c2-40a1-8871-b0188d5e9ceb.json | 132 - .../0a0a4d32-c7a9-49c9-bba4-dae6b464a5b6.json | 132 - .../82a3a8ef-7e5f-48d0-a48e-41ea2c5b6452.json | 132 - .../e635e798-fa85-4430-bf1e-9d5ad7fe9f22.json | 132 - .../7ccaa29a-4f73-4794-83a2-b925d755d91e.json | 132 - .../ba8de8f6-c118-4bc3-ae8d-851e964684ed.json | 132 - .../4011975a-e2a0-466a-9b34-923e1b4f8733.json | 132 - .../8a172205-39c6-4dd1-86b2-11b234b37e3c.json | 132 - .../495b2e8e-e2d8-4158-bc6e-7568604d44e9.json | 132 - .../e6a97d0d-9dc3-43a5-a69f-8132e19f9c77.json | 132 - .../4aecfd45-f47b-4f02-a0ed-288cbef46a6f.json | 132 - .../a6f7bc45-c2b5-47d8-a062-60f20c3d7ea4.json | 132 - .../c85c79d6-28e0-4deb-ad84-901b725aeca8.json | 132 - .../73271472-d06f-405b-af9d-2da7c17e1eb0.json | 132 - .../4e40bb43-c33d-4324-aa02-5bb7f88a5d1f.json | 132 - .../9b36e4c0-0d13-4988-8145-b9254da2e76e.json | 132 - .../6a464798-0111-4c71-b156-72a5aba1da63.json | 132 - .../78252135-f15b-427d-86de-c32cd3dbcd0f.json | 132 - .../c3b7bd57-9bc3-4d83-aad9-7d6315748c0a.json | 132 - .../bce17582-e807-4b91-b0e7-0a890bf5eb24.json | 132 - .../f8371e81-f6d4-4441-bc6c-5d4a18da7d08.json | 132 - .../78407b2e-1f44-46f0-bc21-76bdc68f8d9c.json | 132 - .../bdb9e2d2-8d09-4994-a320-2f968bcb4898.json | 132 - .../c57d15c8-9581-4bb5-89e4-2fea1e3c584e.json | 132 - .../550d5665-7a8a-437e-b318-000690dd250f.json | 132 - .../a1922f33-32f5-4f99-8df6-e2080808d292.json | 132 - .../6ccc376b-24a4-42cc-8ea0-823ef14336db.json | 132 - .../6547b6f3-63dd-4516-b294-62c4246c3dc7.json | 132 - .../a58bf2d3-d209-41b8-a795-ba7a16e4a28f.json | 132 - .../b15ad3b5-7ef2-439e-9acd-a85eab520d31.json | 132 - .../64da2654-9fdb-4a08-ad16-cf8793a30ed8.json | 132 - .../37080215-ee30-4e59-a407-b14695ac2a38.json | 132 - .../b83a0ce7-bf13-4a98-81f3-04e5a44105f7.json | 132 - .../bb7bea21-5bc6-460d-98ff-b3ed02d5b215.json | 132 - .../da9ddecc-43cf-4055-a19e-795b1ee98826.json | 132 - .../a93ccb3f-f2d9-415d-8397-0c7fb765fada.json | 132 - .../d0f86765-bdb4-4367-986b-28303bbe1844.json | 132 - .../693bb191-ae83-49dc-9df1-2f68b1b5fe4a.json | 132 - .../7b2c0b72-6421-4f33-8593-a4bbfd0c6d6b.json | 132 - .../c4ee822f-fc8b-4523-95b6-7c3f12a334b3.json | 132 - .../1810033a-185b-4c91-91d3-43b8f6c61443.json | 132 - .../beb721ae-a35c-4f6b-a80f-aac4835d5f8d.json | 132 - .../cf20e77a-340f-4d8d-b593-9645bdfc5877.json | 132 - .../eec73e49-ac2b-42ed-a115-76e45007cd5d.json | 132 - .../aa06d058-87f9-4fde-ad53-139b29a71448.json | 132 - .../3f1d571a-fc42-411b-88ab-4700d5861367.json | 132 - .../74a56080-aeb2-4cc6-a825-bbe4d9a5900a.json | 132 - .../2eb433ba-5c93-4355-99dd-edcb65721603.json | 132 - .../826fc3ab-6ff8-44fa-a745-a0b80bcb2db4.json | 132 - .../6da54964-e3b5-4567-8ce4-7e0f279af84f.json | 132 - .../a7dde688-a0ae-4731-909f-0bef0c6eeba9.json | 132 - .../eb2a8a60-2240-4b08-9dc3-be0215aa7bfc.json | 132 - .../9b05919f-d7c1-4e04-9dd8-9ae70e0005e6.json | 132 - .../6cd98538-74b6-4ac6-a3ac-9a311cfe47f6.json | 132 - .../b0ca2dec-387f-4b27-9adb-772af1899832.json | 132 - .../53c4b397-b78e-4699-a01e-3535aa072225.json | 132 - .../f5b251f0-741c-4ad5-ab04-19c5202854ea.json | 132 - .../7b2ba13a-e01d-4442-9abe-d16df1a1668a.json | 132 - .../bf79f87c-3f14-49e8-acba-725e709d5f11.json | 132 - .../3fbac7d4-cbbb-4b77-9db4-fd7e122cc90e.json | 132 - .../6efd0dbd-b8c1-4c66-bdf7-19055c16ca22.json | 132 - .../1388b8d4-c711-480c-8a06-a8b7bd8aa79c.json | 132 - 
.../03393ffd-1923-4767-ba14-d0e3e6751842.json | 132 - .../b7d049dc-127d-4075-8067-22adac9a58c3.json | 132 - .../89d79024-f4b8-4165-bd88-47f2b0010800.json | 132 - .../d2c0fb0d-6c0c-464a-b09f-6382a57b6afb.json | 132 - .../a891b28a-2dcc-4b8e-ad20-1f23d663b44b.json | 132 - .../55e274bb-1e2c-4402-b7ae-09ff7b1f9738.json | 132 - .../fe7a6940-fc4c-4345-84be-609c8155be57.json | 132 - .../77eb2b0f-e3e3-474c-bb02-dabde2998ef0.json | 132 - .../94d744be-5d28-490a-ba9a-8440cb97dce9.json | 132 - .../2765061e-7506-4eb6-b63f-312f6290665a.json | 132 - .../167c937c-66c7-45a8-bbd9-97d98531bf7d.json | 132 - .../9587c35c-1def-46e7-8642-7acb0340be5e.json | 132 - .../1c9594fe-03d6-4ec1-9da5-99960da0dcd4.json | 132 - .../8ed2c4eb-bc72-4dde-a559-1afd1698d37d.json | 132 - .../a2f9536a-9266-4aee-be90-d04f4dcbe53c.json | 132 - .../7f116aaa-3880-4e53-948a-4b06e0d26cff.json | 132 - .../7cbe4516-2be2-421b-95f4-c9500ad64ca5.json | 132 - .../07df565a-bc30-4a9d-b472-7a85f35938be.json | 132 - .../7545f7db-10bb-4d97-9b3f-4346f4f26bad.json | 132 - .../47384f10-ac6a-4629-92db-86f01a441f7f.json | 132 - .../3c9f022f-3e2b-48d6-acb9-07f066cfceb6.json | 132 - .../1d851cfb-8624-4516-8204-85569c60dc67.json | 132 - .../a7990990-7498-4b74-a0aa-9c266910698e.json | 132 - .../0b41d37e-0728-4575-9662-c150e2e29bd0.json | 132 - .../c565a7e9-bd1b-41a5-bff3-3a349553f4e8.json | 132 - .../680a4507-755e-4014-877b-6032f0220270.json | 132 - .../5ace8dc6-e348-4267-bb4a-f71a335d074e.json | 132 - .../07549821-db51-4b77-980a-056131b5dd29.json | 132 - .../ff12a0a1-a913-441b-955c-bcbd50056acf.json | 132 - .../947cfc2b-b73c-40eb-9e57-be5278776711.json | 132 - .../53639078-c50a-4147-bab0-16993f1790b6.json | 132 - .../b2cf96e0-382e-4200-a4a4-d66e8a188878.json | 132 - .../d4ed3eb6-f569-4d4b-8da5-50eaaf824128.json | 132 - .../210f7063-e0d9-424d-94f4-3645e4e1b401.json | 132 - .../4ecd26d8-8416-4dba-8d53-96f4013cfef0.json | 132 - .../15712b7d-e69f-4a4f-b13c-4e79ce859399.json | 132 - .../9148c375-7c08-4c1c-82ed-5f935b2a4f04.json | 132 - .../fb93274b-b7d8-483a-a95d-96340535febc.json | 132 - .../0818b755-ec49-457c-8635-73f01816f30b.json | 132 - .../77962326-0160-49bd-9ef1-59b403b2bfce.json | 132 - .../272abbe5-8b61-442f-9860-d7411e7fec99.json | 132 - .../14d617a8-18c6-40a7-a4ba-19cf5fc5f4e3.json | 132 - .../ef7b5e6d-b5b7-4c7b-9781-6f90eb1ff5dd.json | 132 - .../1970e257-7c93-4342-9ff4-a96af21acc67.json | 132 - .../15d71696-4b21-41ff-a4c6-0aea92fb844a.json | 132 - .../ccb85394-5252-48d4-8980-8b3a6c67ab1a.json | 132 - .../ea9837ff-f4c7-4bb0-b2af-7ae26371baf0.json | 132 - .../fe9012a7-d07f-48d4-b460-eca256078d8b.json | 132 - .../8e8d2071-8e7d-4dad-8536-4698b2d00316.json | 132 - .../dbcb41be-9ed6-4244-ada8-77f363c3487e.json | 132 - .../e48e2d7e-6c14-4bb1-bd12-74d93a145ca3.json | 132 - .../30c2d908-3eaf-408a-a2b5-301e0cd9e052.json | 132 - .../f7624d04-66d1-4c05-8c01-d015ecf8412c.json | 132 - .../511e4aad-1e5a-4515-9433-46989fc3945b.json | 132 - .../863e71ec-03a4-47ed-8bc9-b064d5571162.json | 132 - .../6a6dfcb4-192b-44ff-a34f-76b31bbf5ad3.json | 132 - .../e0dbec0b-a154-448a-be23-ef9b764469ea.json | 132 - .../ecd91300-b0cf-48ce-9e5c-253a7991f90e.json | 132 - .../e3df71f1-63e1-40f1-918d-07cb3ec939cf.json | 132 - .../52066a23-9847-490e-90e3-57eee3c63276.json | 132 - .../91f15ba3-a062-4b01-8a61-6e51fdf5f8d4.json | 132 - .../323630ee-fbe0-49a7-aa11-816fde38ba2d.json | 132 - .../e5c8f97d-1873-4c9d-8bed-50dc592543db.json | 132 - .../7ee2803c-b8f8-4156-8472-bab4baab8863.json | 132 - .../78573f63-3073-4be4-93a7-0ea00b1383fd.json | 132 - 
.../42da7295-d78d-49a4-9279-8406063240c4.json | 132 - .../b61c5735-53ca-4dda-a223-79921eee7f3e.json | 132 - .../310124ef-e33f-49de-83eb-e665a5143aaa.json | 132 - .../c9b056df-8bbe-4959-ab44-85813157c95c.json | 132 - .../7a60385f-48dd-4926-8b66-3d42a1631db3.json | 132 - .../da365c7b-74d0-4a9f-a8fd-cf4049ec4de6.json | 132 - .../e2930715-b616-49a4-83bc-53e92fc3580f.json | 132 - .../543f45e0-a158-4fdb-bbb1-8deb38f4515b.json | 132 - .../b96a20e0-d044-4a66-8909-437aeaef569c.json | 132 - .../408742ff-4b21-46dc-b4d6-4c78d652d228.json | 132 - .../496a9fbe-376c-4546-bd90-b42f583924ce.json | 132 - .../f32c07b4-21a8-4cd2-91f8-f0f26d0b1b38.json | 132 - .../cc36cc37-0f41-42aa-8051-54cc135820ef.json | 132 - .../20d3dac4-9f8c-431c-b20f-364dd860e37f.json | 132 - .../89022ea8-2a5b-4eba-8d7a-320ba13d30a4.json | 132 - .../97bfd152-79c6-4c96-8d3e-588275339e41.json | 132 - .../93061947-2bcf-482e-ab22-38ef8ee33bcf.json | 132 - .../8f65748b-1251-49f8-bfed-d1e4a937d5ba.json | 132 - .../4f278881-69d3-42b5-b72c-ff8627a6ef44.json | 132 - .../d88e85c5-73df-46cc-9234-f0556592ad5a.json | 132 - .../44d2a20d-e867-4fa5-af3d-087f9c1b4067.json | 132 - .../e83b3e7e-dc34-4b06-bcfe-95b3ba28aab4.json | 132 - .../44f2948c-4564-44cc-98d8-4f82a30e1f09.json | 132 - .../846cf1ff-62c3-44e7-b6dd-0135ec77451a.json | 132 - .../d2054469-b38b-4b1d-bd40-7324319f8eca.json | 132 - .../ce60608d-5b52-49d4-bbce-4b20e8272cef.json | 132 - .../f177bb70-fb7c-4b57-965d-acbcb4936bfa.json | 132 - .../a5b2ab3d-1f12-4a5a-a110-2514185568b6.json | 132 - .../63b887a1-a0b9-46db-a563-b9bd67a0805a.json | 132 - .../92d122f7-f29d-49e3-99da-bf20edf377a2.json | 132 - .../a0b71344-f3a8-4ad0-87c5-6393148488b1.json | 132 - .../821ff784-c48a-4623-9fb5-b77b7114b625.json | 132 - .../ed251513-4807-4e31-bc8e-3ab0217ae4f3.json | 132 - .../e7fa3baa-07b4-4f10-aa9c-8424d8fea303.json | 132 - .../11dfd131-00bf-4561-a913-f1c0cb15bf9c.json | 132 - .../3ba34f38-2340-407f-a7b5-82749f8a0ee6.json | 132 - .../91b9649b-bdf6-4b15-a038-47edc2e79ef6.json | 132 - .../24670e63-32e1-4c5d-82fe-0d0c45a4e165.json | 132 - .../198d1441-1d13-468a-a998-c8cf9f1e7a57.json | 132 - .../e9eb1499-835c-4a70-b531-4be5a9718c34.json | 132 - .../b1fd95ad-767d-4c13-a936-00b08c74ca3d.json | 132 - .../f87bd357-535e-4450-b01d-b41e1b7571e0.json | 132 - .../300fd27e-4dce-441f-91da-f38bd14ffe5e.json | 132 - .../1fd9a2e5-856f-4303-8ac1-611311f3e7b5.json | 132 - .../4c34d5c6-af1b-4519-8d08-67bd837e9b97.json | 132 - .../ddc27df7-1c4c-4563-92b2-5a39380423a8.json | 132 - .../3e606ef8-9caa-43d4-81d6-8eae9936ab4c.json | 132 - .../b9053559-3b90-4de0-981a-dbb49db38eb5.json | 132 - .../cea89bc6-b1a1-4b67-a136-45e097563a5b.json | 132 - .../5eb16113-7d0d-47a0-91d8-ec7dab35efdd.json | 132 - .../45aa6545-d20a-4dfb-a8a6-01f2fd34c9f5.json | 132 - .../c94079d1-d8b1-4198-8129-8c5a11c310ca.json | 132 - .../cb45306a-096c-4ed5-a028-6d720b26afe9.json | 132 - .../f301908e-474b-4ba2-a873-610ca1b6c2bd.json | 132 - .../06f5865d-a62a-48da-b33f-486fe29e3685.json | 132 - .../4f952c51-91dc-446e-bda1-43ed66e1ca3e.json | 132 - .../dcba3a6f-8f4f-49f6-af74-541de16be435.json | 132 - .../b5d39bcb-dab4-4880-9cb1-68dbd20a3ce5.json | 132 - .../1e597e9b-4e75-4981-842b-dad6f1c15ed7.json | 132 - .../18752dc4-76d1-40dc-9f43-62b8087b7a88.json | 132 - .../fa30c36e-20f1-41ee-a59d-0044f2b76dfb.json | 132 - .../5391ae8f-41b0-41cb-9365-b5cb7649c8b7.json | 132 - .../a95ab4cf-456f-4b3d-9bab-2b755649758d.json | 132 - .../9840baa9-2ddf-4dd9-b3b0-3ec3075089bc.json | 132 - .../26ff113c-95ca-4716-83f7-4792b46be246.json | 132 - 
.../285e1d08-15a0-4d8b-a844-e4cad923ea9b.json | 132 - .../0462269d-94a3-4991-9af5-e55592f344e5.json | 132 - .../c47c4cd6-90b6-42df-a3b9-4fc8f1b3c980.json | 132 - .../0fecafe4-f8f0-4f97-ab2d-589a3856e1af.json | 132 - .../4b5529b9-0800-4cd6-b720-a905ab5e6c9a.json | 132 - .../84783e4d-5eed-474d-9463-a01a0890850e.json | 132 - .../d9fe39c5-24a5-4240-bfc9-59860fcb3911.json | 132 - .../2ddf850e-36dc-41b2-92da-e2b45d1544c6.json | 132 - .../b10a9284-fa5e-4a4e-8240-edc98cea6d9c.json | 132 - .../2c51bd1d-ebe8-4de9-9749-5f42f7ba3d5a.json | 132 - .../425e6f1e-50dd-444f-b0da-5a0c47d5bf06.json | 132 - .../7e1fcf4e-9f64-4112-934c-4808f07d32b2.json | 132 - .../d3666566-09dc-4d53-9996-2301c6fb2721.json | 132 - .../36e5efb9-e3f0-4903-a9f1-3d51453bfdc4.json | 132 - .../a6dba337-81d2-40c6-89c2-aee6de82282e.json | 132 - .../e44b8d9a-f270-45c8-b126-6a8911c35436.json | 132 - .../44d5e1ac-45d5-42aa-b9fa-f18112cf6676.json | 132 - .../4246401d-9049-4c83-83d4-e2d9efa4dded.json | 132 - .../26c4785a-0caf-4b01-be5d-1e421bfeb698.json | 132 - .../cc9b9a25-18f9-4cc3-a756-3975a3a3be7d.json | 132 - .../b4edb7f5-a675-4627-af96-7ed0909da1e5.json | 132 - .../461b6f40-6f19-48b1-857e-f0fb37f929f9.json | 132 - .../e924270d-a655-4093-91b2-f73b7f12eefd.json | 132 - .../af8905e0-e969-45bd-8e09-e7316fff0914.json | 132 - .../e92a6d31-2277-4093-8fae-b3dfaa2d47dd.json | 132 - .../47472cd9-36d3-4074-83d4-af53b9c23758.json | 132 - .../b922f4e1-1fd9-4a32-94ce-4784430cef51.json | 132 - .../5bb2e77f-7709-4eb8-bd08-3c8da4a56310.json | 132 - .../35937213-bb16-4935-9d92-9fa8fd61aac3.json | 132 - .../04122d1b-929d-439c-bb8d-f08508f7a00e.json | 132 - .../03beb242-2628-4ea0-a2f3-c3ec43d379de.json | 132 - .../46d55b7b-1972-4cb0-97ca-e04d306282a7.json | 132 - .../32730d82-cfac-481f-9a22-9cbe40646218.json | 132 - .../a290a75f-753b-489d-87a2-ce0637c09f41.json | 132 - .../54032eb0-c4cd-4c76-be2e-f0c81bd26365.json | 132 - .../73b59506-cc1d-413c-a28b-d25e0e6bf413.json | 132 - .../bea2dcd6-4772-4aac-bcbc-4802cfb33495.json | 132 - .../66275215-28e6-42bc-bc22-5d152682ce53.json | 132 - .../9015365c-400b-4fa3-85f2-a1033b030cf7.json | 132 - .../55d52914-0904-4e6e-8b37-c22b06f5f2bf.json | 132 - .../3677260a-2fd5-41bf-9010-f1b31cedacbc.json | 132 - .../fc54f87a-2e4a-4f3f-b407-e268c4487d16.json | 132 - .../8d893736-1707-4c0b-860d-16c62ec26d78.json | 132 - .../d3d2728f-74bf-4196-a909-43797d8b628a.json | 132 - .../ed241e67-8718-48be-a6e8-19e295a2b5cd.json | 132 - .../05aafad3-e07a-453b-a70b-f18fbd4eb218.json | 132 - .../f79ac32e-ab83-40c3-9c18-35623f5ae1d4.json | 132 - .../cec76b15-1069-4d37-b8bc-74dde28101f6.json | 132 - .../e4ac0d0c-65ea-4b43-bb4b-7371c6cd5d61.json | 132 - .../f8d629bf-df0b-4c6a-8c18-17dda002b089.json | 132 - .../6739d8e3-f4bd-4fd5-98f3-887f5ed3f9c0.json | 132 - .../a51722f4-29f4-47a5-acba-4c8b5355551b.json | 132 - .../06d0a21f-f6e4-4ca9-a679-8c4502aaaad1.json | 132 - .../04a4dcc9-3784-4aea-9faf-9db49c2e4c43.json | 132 - .../e4668365-d3dd-4996-9bb1-5b4e6f510264.json | 132 - .../4d743678-e14d-4866-b1bf-0d660787847b.json | 132 - .../720b1476-876c-47d1-bf46-d037389b4b2f.json | 132 - .../4e4f3b2d-5b17-486a-a2ab-c2e89194c765.json | 132 - .../b738668e-3ac1-4a36-ad71-ad7d2a5256ae.json | 132 - .../623f1b73-1505-4527-b41c-dcb2b711226d.json | 132 - .../53f03454-9587-4208-bc01-21de62f59195.json | 132 - .../fb38d8b4-6320-4b8d-bf3d-e3d22bb0ed83.json | 132 - .../b127a923-3bf2-4cad-9225-d738efe800e3.json | 132 - .../a94ae52a-7936-4750-83f5-4740f23adf15.json | 132 - .../95e689c6-cd19-4114-b3b5-1672ab849214.json | 132 - 
.../890a8414-bccf-4a66-8013-6c270d017965.json | 132 - .../0f8ce410-cf3b-4f78-81b9-a0a1fe91b963.json | 132 - .../121096cf-356b-4069-a0a3-8cf6aad52b81.json | 132 - .../fb0bcadf-32a0-4320-909f-2c38ba7d9372.json | 132 - .../ab941c52-cf33-4b8e-87af-4a73930cf72a.json | 132 - .../08c242fd-0258-4817-970a-668584ed9385.json | 132 - .../2171af9a-be5e-4daf-8e67-a5239ccec7bd.json | 132 - .../706f75a1-2f6b-47dd-809e-a830e739b574.json | 132 - .../a9cd0399-4670-4f5c-8c64-c82dac97cd8c.json | 132 - .../67cfd12d-0551-406d-bd1d-8ced75c69478.json | 132 - .../0a31d2f0-196b-4508-861a-1ba7bd28ea23.json | 132 - .../57576999-2749-441a-91d6-5a976e83a658.json | 132 - .../e44792e6-0329-4784-832b-3043478e70a4.json | 132 - .../8b3789d6-51be-472a-95d3-2ae7c34ad140.json | 132 - .../3f4765f2-551b-485f-9020-0cf17a36a887.json | 132 - .../6375a845-5d86-4dcf-bfd2-e836daa4ca11.json | 132 - .../65a74446-6964-4f5f-8ea6-aeb1b09595ae.json | 132 - .../dcba5998-3b84-4753-a4fa-2558ffe3e69b.json | 132 - .../0af6b3c0-6638-4bd8-bdd9-349e2b9ca71c.json | 132 - .../4e332594-d0b9-4913-9950-208abe4faab7.json | 132 - .../5ad2ad73-47ed-465d-b4c0-b358e6b6435f.json | 132 - .../c9f716ef-0aa6-445f-8fc9-b102f3a0ea2a.json | 132 - .../a2e32a77-867c-4921-ada4-c7b169efbebe.json | 132 - .../f76f759f-d05d-4eb6-a2b9-3b1dfbe840f0.json | 132 - .../ece0bd6b-4eec-485c-942b-e23f3295c2f8.json | 132 - .../ada110bb-0988-4c19-9798-74577dde5ce9.json | 132 - .../ed4f994d-d196-40bd-8f8f-f6a7f07c3c90.json | 132 - .../57395f9a-0534-453e-80fc-96e9dc5cd9c3.json | 132 - .../f8f70702-9ab4-4e1a-a11d-090627d58f02.json | 132 - .../3cab8bda-bdf6-4345-b89e-18d34a8f6361.json | 132 - .../0955fc17-8878-401a-9ec3-149528ee51e1.json | 132 - .../c63bf49a-e7d4-4853-8684-9cc03eaa7840.json | 132 - .../65e6a3b6-4291-4591-bc0b-576930061c68.json | 132 - .../1ddf9e02-4066-440e-a777-fcd3f96bc4b3.json | 132 - .../f9f96bb2-edbc-4112-97aa-a7420dea32a1.json | 132 - .../3a24b30f-7698-4ecb-ac26-3537a0b38616.json | 132 - .../d4030df6-2be6-4f46-9c9b-ce3037b9a004.json | 132 - .../ec234403-f43d-46a0-84a4-ab47673226b3.json | 132 - .../805379f4-784f-4602-92e8-180df4da9fc3.json | 132 - .../9f3920aa-9400-46f1-bcfa-969f69b3335c.json | 132 - .../26cbf444-ab93-409a-b85d-e2bd267eae5e.json | 132 - .../7c2b17a8-1de2-4441-a281-fe3fd043f831.json | 132 - .../94c5756c-cbde-46e2-90d2-207678373061.json | 132 - .../e0048124-89bf-4327-88a8-00aa51ee29af.json | 132 - .../9d776307-43af-43bb-ab64-52fb7f331cfe.json | 132 - .../d8d41981-a7c8-48e9-a63c-86520a0f23d5.json | 132 - .../1355985c-fbcb-4eac-8435-417d6034f2f0.json | 132 - .../44486b02-7bdd-4f59-8d4e-5c8deeb1fd60.json | 132 - .../45ae3dc3-6dc0-4d10-99cb-a7f330110906.json | 132 - .../6b54763a-6329-47fb-bf50-296604251b47.json | 132 - .../96a26bf3-b4b2-465f-8ce6-a2ef943c001a.json | 132 - .../655b047f-c3a8-4c9c-b864-81d318b2f506.json | 132 - .../f62fed77-e166-422d-b5ce-c50b7bccbf4c.json | 132 - .../7ffdabf3-0a8e-4316-b6bd-85b10a81db53.json | 132 - .../2c93c987-b32d-4a02-8df4-949cc45b8eb2.json | 132 - .../02e7c1d6-9db1-4de8-b13e-afd752b3669a.json | 132 - .../580a3045-338a-47b2-8ed7-54c993d5aa90.json | 132 - .../e71d3be5-ea9d-4426-aa58-5806b7541aa6.json | 132 - .../1174683a-9488-4c6b-be6b-e5a96328a96f.json | 132 - .../3789b37f-daf0-4c21-82b8-309cbf00312e.json | 132 - .../8586cdc1-dd4e-4112-a59c-f6bc2766701b.json | 132 - .../946a7b16-dfa6-42ad-97c1-955bf8a40dae.json | 132 - .../d9a6cc31-57c4-4480-a019-25a34b31fcc8.json | 132 - .../279bd5fa-0ab1-411b-871b-bd9ff23853f6.json | 132 - .../c26fae10-e65a-49ac-a2da-2dbf024fd10d.json | 132 - 
.../6d37b2b4-630e-4471-b7a8-50f8a58902fe.json | 132 - .../de687865-4297-4130-bcfe-0c5116c9b0d1.json | 132 - .../ee1acad1-5dc4-4d8b-8aca-544af5dc2392.json | 132 - .../52e3f1b1-5a1c-4cca-a36f-9f60284e1883.json | 132 - .../2d54c67e-fad5-4a61-b3ae-0393f16dc1ba.json | 132 - .../5120e433-f5c7-45fa-be56-566101556271.json | 132 - .../7f4b4668-c3a0-4575-957d-ba321d55f420.json | 132 - .../9245b74d-4b9d-4158-a402-0c3742097eba.json | 132 - .../29a5fcd3-9c22-424c-ab17-70cfe187aea1.json | 132 - .../af71bfa0-1077-4c96-a4c1-0aa28dc789bf.json | 132 - .../258ebe6d-191d-4804-b5e1-5cd6ce93ba88.json | 132 - .../4765f197-82ed-44b3-9a7c-7cbabc6ecd8e.json | 132 - .../a5d66f97-1f4b-43da-a83a-4a262e297fd9.json | 132 - .../5d29cf73-65d6-4965-a504-4caf07108cc8.json | 132 - .../15ec04ae-30d3-4ffb-9b0c-54ba63410e3d.json | 132 - .../2ed96c70-390b-44de-aa08-9883a2f33ff3.json | 132 - .../67c95889-8a67-40fd-99e2-62e767c16416.json | 132 - .../a518f39d-e073-493d-9a4f-9af53fc71abf.json | 132 - .../24f0d9bc-d743-4f46-b5a6-e855e39a1daf.json | 132 - .../3d27f6d9-05a0-44bd-a225-6e6a0bf4a35b.json | 132 - .../ad28e7b8-69e6-4fb9-bec4-62c67fae6d58.json | 132 - .../0da639d4-181c-4ee1-808c-3de8003c2471.json | 132 - .../480bd62c-bc67-4379-bce0-b28a5d6bdf4f.json | 132 - .../dd94c18e-b2c3-4135-aa2d-5eb0248315d0.json | 132 - .../a2ae2953-e341-49be-8469-32bd41d780d7.json | 132 - .../23bdd694-f250-46dd-9b8b-526fda47bc9e.json | 132 - .../d600a69d-1952-4e30-abe8-1769ab63ac29.json | 132 - .../afc031d4-852e-4ead-9098-6ce30112b459.json | 132 - .../cb33e29f-e5e1-4bf5-9e20-86d9c3486d2d.json | 132 - .../a4b93124-1151-4f69-8a5e-6b916e8cf11f.json | 132 - .../efe11d8f-65e6-4ba6-8148-fdd43c9346be.json | 132 - .../923da7be-2ec8-46b2-8187-fe08eb86d5a0.json | 132 - .../1652b9fe-640a-48f9-b7a5-20ae28fb5985.json | 132 - .../572463ed-f6b9-460d-9c38-0e0ee5327511.json | 132 - .../5f6bbbfd-16a8-4ea8-b9d9-b436a882700a.json | 132 - .../32322361-f18d-480d-9475-cd11a45bc4bc.json | 132 - .../f62d1aee-2d9e-466e-85e2-002fae5d2504.json | 132 - .../af389bf1-da63-49a9-9e49-32613d8d05b8.json | 132 - .../ea13ae62-d050-4cc4-9cbe-99eedfc206e2.json | 132 - .../1e697620-36a7-459c-b88c-405febb57c3a.json | 132 - .../532723e8-a9b7-4f72-a015-c2bd9363b5d8.json | 132 - .../be096a57-7d81-4999-919a-ed8a243012b2.json | 132 - .../cadeb016-e158-4a49-921c-efe0e4eb0cb2.json | 132 - .../c606d7b9-3ea3-49d4-9ecc-9610ed4b4eac.json | 132 - .../04a5eed3-7eea-4d9f-acc6-5a96ec987e2b.json | 132 - .../a1c60d74-dabe-423d-9e40-3dd8112d7d8e.json | 132 - .../29c7bc9b-6833-497b-a553-2941026efea5.json | 132 - .../09a60955-978e-4136-bdde-d5459e37ad2c.json | 132 - .../501744a2-070a-4378-9232-f7ccd9b2a67e.json | 132 - .../369efdc6-6529-477c-b5f0-d229c8102491.json | 132 - .../906645f3-2041-4380-8118-ac26b92297ba.json | 132 - .../57fe8deb-02dc-43a8-8a92-14bdaf61dd67.json | 132 - .../95f2fa22-3da9-4876-ace3-50763f2b2453.json | 132 - .../b2f9e38f-c2a1-4e5f-a7ce-4e33a05b503b.json | 132 - .../b3173a2a-8309-498d-961b-0167d5d5dea6.json | 132 - .../0d59dd75-c999-4a7e-919a-fd084202fc9c.json | 132 - .../639e91d9-ebbf-4ba2-bce3-6953e7c91e32.json | 132 - .../56a5fb9b-a4b7-4290-9ec9-6864b3efaa82.json | 132 - .../d03fb481-be0b-4dfb-bb4d-54067e058e99.json | 132 - .../d8fc3475-83e9-4790-a472-72b442087562.json | 132 - .../57efd335-4873-4e01-bfc3-0d704b3d482a.json | 132 - .../25fdcc8a-0e7d-4148-8508-2631ea6deb05.json | 132 - .../f5f63d06-7e51-4b91-8814-ecbda604fe6b.json | 132 - .../5326c33b-6b8a-472a-9058-a9e9fe83b599.json | 132 - .../28674053-e1b6-4f0a-a90e-5dd5082ec164.json | 132 - 
.../fd27bfa7-11b3-46d3-915c-373ddf5a9865.json | 132 - .../91f190ba-39c8-47af-8351-73d1f382dd99.json | 132 - .../b637b55c-dd05-4060-bf33-e63e9de7fac9.json | 132 - .../bcacef79-d7c0-46e7-9194-43541c2f01fc.json | 132 - .../77a358c7-59fa-4b22-a190-dfca86c5166b.json | 132 - .../ad4c8922-7079-4383-8f42-d3de6326a1e1.json | 132 - .../7f89eded-e5fc-4b3b-9afd-dcd71b7b44d5.json | 132 - .../07cb94ab-0aea-4ce2-89b0-4378cb892c7e.json | 132 - .../5fb04756-c7bb-4772-b209-0d9a300bbf7d.json | 132 - .../0c02d1b6-2d31-4c54-b881-588cbfb0c686.json | 132 - .../a32e4d22-8096-4537-a68a-98ff9171ac8c.json | 132 - .../4e45b666-fa7e-4a38-8b6b-65846876c8d9.json | 132 - .../d9cb1d13-2af5-4385-aa78-5c053e00e6c6.json | 132 - .../6afaec07-ebb8-4f3f-af48-c679f38f4917.json | 132 - .../bf8370c9-baed-4034-ac38-c6f796baca15.json | 132 - .../d397c078-6fe3-44a8-859c-a0f7c551dc3a.json | 132 - .../ed61cd6a-bbf0-45f2-9536-a7a262d5d6fb.json | 132 - .../6be795f4-0784-44bf-8926-e3060ec37dcf.json | 132 - .../d4d808f5-3b79-43b5-8076-d3f785083789.json | 132 - .../370f5923-91d7-40d2-bd06-bf2b657b8ef2.json | 132 - .../5334e5e4-d243-4c20-912c-d0ded74d6ea5.json | 132 - .../7306f2cd-4fd2-4dd4-b06b-8c9aa558388b.json | 132 - .../68cc19eb-423b-4d6d-a3bf-eac6f666bc4b.json | 132 - .../59aa26a8-93b3-43fc-8c38-ef67cd8efd80.json | 132 - .../220cd306-0613-4c8f-9848-4af812a1d37f.json | 132 - .../39a6a40c-3fa0-41ba-9d13-da9381263d4a.json | 132 - .../4d037b71-5d03-41a1-bf23-c0aea0cdcbbb.json | 132 - .../16baf620-7dcc-49f3-a787-b431e11ad4f6.json | 132 - .../4745add2-7bcb-4c05-8b12-6bd30856890b.json | 132 - .../f68b122d-4dec-4d5c-ac22-198da3d3e96b.json | 132 - .../2e20f780-ceab-4d1d-a1ab-35f4f0ac44aa.json | 132 - .../f21bcd75-fc9f-4266-8976-3227b18b6b32.json | 132 - .../7c1a81ec-1cb7-4858-8f1f-23b3ee49b73f.json | 132 - .../1cbfd1ad-237d-4cd3-8b5d-3135c194fcc0.json | 132 - .../ef5c1813-a74d-4b3d-9911-c27a46c1c84e.json | 132 - .../df50857d-c90e-4ec8-a9b6-96a6d2f894b1.json | 132 - .../774d54fb-a445-4ed9-b79a-9c1346537e98.json | 132 - .../420b8be3-3560-48e8-8ab3-bb55338a9069.json | 132 - .../c118b75c-597f-48a7-a4eb-675af72c9930.json | 132 - .../e75534d3-b994-4e88-9274-7b62f61916cf.json | 132 - .../770a1ff1-057f-49a7-9402-c6dd881ac03d.json | 132 - .../6cc9790d-9b02-437e-8ac7-be4152f5b17d.json | 132 - .../264f5b42-a3ac-4af1-8145-c5763b8e7fa6.json | 132 - .../549db368-437a-4982-ba5b-5c4d7bf203ae.json | 132 - .../0d098a19-7e8f-4a52-8466-729be91388d8.json | 132 - .../83335f65-25a4-4bec-a901-587567ed0e99.json | 132 - .../02fb24c3-927f-4c21-bd47-b883521162a3.json | 132 - .../2a6507c7-44c1-4416-9ff1-36abd6af3b73.json | 132 - .../327a146a-8cfd-4480-8342-46afde530677.json | 132 - .../0700fb7a-e722-432f-a64d-c040bba4deee.json | 132 - .../131d3a7e-43dd-4189-8466-6562703b3bdd.json | 132 - .../8f6d7008-b8de-4a76-94aa-bbecc93ef3f7.json | 132 - .../aadb0ce5-a1aa-4b0d-bec4-8bb0e8e54a1d.json | 132 - .../a73250f1-399a-4afa-bf83-4036dce78ef3.json | 132 - .../f68bf680-9626-4952-b95e-12a18fd60820.json | 132 - .../d6a78a5c-4a2e-4370-88f2-d8627a94f1ea.json | 132 - .../7b5eab2e-fba3-47d5-9839-02249c2568c5.json | 132 - .../2acee2c3-4322-4152-8151-c1d571475b7c.json | 132 - .../67ffb2de-0410-44a2-aad7-4a32e2c49c7d.json | 132 - .../2923aeb3-982f-400d-9588-707583c75a1d.json | 132 - .../b6a622da-5ce8-4ea5-a82a-f3a2a299ddf2.json | 132 - .../7b06ac17-bfc6-43d5-99e6-d2b7a31290fb.json | 132 - .../fd481b93-55b2-4831-9be9-1b1b2886fda3.json | 132 - .../f159748f-234e-4962-b582-cd5805448f33.json | 132 - .../044d53dd-d134-4959-a70c-46f11cc0b300.json | 132 - 
.../f05501fd-7c06-46d5-bc20-a9d0cc5c2e0f.json | 132 - .../5c44a2f2-23e3-4c9f-9b7c-9012ca8b15e9.json | 132 - .../80e5134b-0733-41cc-8b4f-ef32fbe57066.json | 132 - .../61123e41-7b2a-40da-9f7f-b830c27d7f12.json | 132 - .../b93c31d7-54c3-47b9-a267-3f8fdb796805.json | 132 - .../b3eaa4c5-7abc-4e2d-9c11-c70ecb8a843b.json | 132 - .../3b06f75e-3d22-4428-8d4f-2e704b96961e.json | 132 - .../dfda4aab-f8d4-49ee-b141-78539b69007c.json | 132 - .../690f3c19-c148-458d-b4c5-87761d72b851.json | 132 - .../b6a18246-776d-463f-80d5-140df74e9704.json | 132 - .../9831abdc-ad08-48c0-8384-86240e7350b5.json | 132 - .../96a572e5-4751-46ce-9202-deb223ef4dfe.json | 132 - .../f4320b1e-ea4f-4aea-8dab-cdb221ce53e5.json | 132 - .../8376c0bf-f9c3-4529-b13c-c57106182d15.json | 132 - .../97a80145-e621-4603-8ff8-2cc4bd74190a.json | 132 - .../99a7881c-cca0-43d6-96f5-ce5292ed60a0.json | 132 - .../60ca8f7e-1c20-4adb-bb84-892bad3c0d63.json | 132 - .../4a0f8dc7-9446-4dda-bf49-8cca4851746c.json | 132 - .../6eb3a040-8234-4d31-8274-6987b0e4e3b4.json | 132 - .../16053077-38fd-4136-81a5-fea0d4cd927a.json | 132 - .../25abb99f-536e-4638-8611-a1db5dee931d.json | 132 - .../aaf0e5bd-b033-455e-bb23-b12b6f7c4520.json | 132 - .../b3a46478-c5f4-4c74-9bf0-d1ba616ae24c.json | 132 - .../169fb05f-5201-47b8-a06e-7d01e574c689.json | 132 - .../db076309-32e5-4d46-9786-ff14f8daf5d2.json | 132 - .../cde914dc-7d57-425f-9787-e4b8d36d61cf.json | 132 - .../5d793ce3-a7fd-4ee3-b32c-c9da63ec0566.json | 132 - .../8c645c9f-02f6-44a5-b295-d6364ed49464.json | 132 - .../97bb5519-e2d3-44d5-abf4-b5263c2b3245.json | 132 - .../bd3d78d3-3ff1-4a92-a316-e4e30787a331.json | 132 - .../d8951ed7-f4ef-49ce-891e-8d8509e9cf93.json | 132 - .../e1772d6c-fd26-43a7-82b3-7997d8a6809f.json | 132 - .../febaf893-6aaf-4c87-89fc-cc865ebf2859.json | 132 - .../0ad591f4-c846-4fd1-8536-a169e0a7e4ab.json | 132 - .../0a318ebd-7bbb-456b-a6e4-9b480a858b5e.json | 132 - .../e1cfdc32-3c5e-4f4b-a205-f416c96cf5e6.json | 132 - .../85426280-8138-46d0-a111-b59b0d7c86c8.json | 132 - .../32bbd26e-05e7-4a0f-a491-8f54cea9f3d3.json | 132 - .../86ed6833-ae85-4a8e-b840-b0c9540083ce.json | 132 - .../2f751ac3-5ca5-4d0d-9ad4-48155e51468a.json | 132 - .../9677e68d-afda-4917-825c-83318219ff59.json | 132 - .../23cd57c2-bf7f-440a-ab3e-edfdede5e8cd.json | 132 - .../bec23315-f98a-4211-81a0-c49f395e66c9.json | 132 - .../1ac5faef-7fa0-4b58-a6ba-0c444a2023a8.json | 132 - .../39327803-11e7-4b28-8750-81feb027e8f3.json | 132 - .../ce2b6874-0fc8-4364-a526-7b25b101e1e3.json | 132 - .../9f9ebc90-31f9-45c1-b9c2-07b727b12f3d.json | 132 - .../d189a2fc-71f5-4bc9-a0b1-7e744a19921f.json | 132 - .../1eb697fe-9dd4-4a41-aa47-33456df39e2d.json | 132 - .../5f10df7b-cd2c-44ca-b13a-2852483c71f8.json | 132 - .../3abbb4b6-8050-44fd-b066-0f061ce2f4d7.json | 132 - .../5f47e65d-293f-469e-a18f-5627ca1adf44.json | 132 - .../b753c1aa-8a0c-4600-99ec-8eb51ab50da7.json | 132 - .../15c21655-9af8-4bee-9884-b047683e9adf.json | 132 - .../f642de95-218a-4db0-807f-1bb97618b4f6.json | 132 - .../01443b06-9ad3-41f5-ae0d-bc84086e0a0d.json | 132 - .../1ee8c377-2236-4225-942f-ef8ce5770741.json | 132 - .../4ee9aa78-d9eb-4a1c-91c4-f29f093b95d3.json | 132 - .../419c6631-805f-43ba-9db8-5296f8d221ec.json | 132 - .../3fc1822f-4a43-4a3b-90d7-fc163491c90a.json | 132 - .../76b4037b-c5d0-435f-966a-bd88b1665dad.json | 132 - .../757b85e7-84c8-429f-aeb4-870852fa8959.json | 132 - .../acab4982-1205-4362-803e-306b1e2371bf.json | 132 - .../0e549b5d-c1d9-443d-9a80-8dd34dadd22e.json | 132 - .../d3d4eccc-8792-40e5-91cf-22885f4cbaf5.json | 132 - 
.../708aded5-6252-44e3-bf0d-08bf3e7f32e0.json | 132 - .../ce6d31f2-f38e-4af3-85a3-d2f6c80f71f1.json | 132 - .../5efcc291-ca9a-4ca9-b2ed-dab37dce5f5a.json | 132 - .../47320824-8064-40d4-a08c-810faafbba77.json | 132 - .../8baeef58-0ba6-4723-8f23-7a4c386f2cad.json | 132 - .../0387ca63-1e31-4eaa-ac7c-35d417548c54.json | 132 - .../733983fe-4b9c-47e6-963d-c57829b6f1af.json | 132 - .../80c4859d-8016-4650-939f-100ba2e6d808.json | 132 - .../21724d3a-cc6c-43eb-9d69-46d8d91c97f8.json | 132 - .../d781945e-e9df-4136-90cd-632f0bed6246.json | 132 - .../8f146bb5-dd4d-49ce-ac60-76f66321feb8.json | 132 - .../89bfba6d-c622-445e-b0b9-512aadcea7cf.json | 132 - .../9c27f2e6-ebbe-4fac-bc51-74455d3a6512.json | 132 - .../455ef1e0-bdf2-49bf-a53d-2c9e3d00d5f3.json | 132 - .../e04a76a6-ac22-43b2-bbf9-196a08de2949.json | 132 - .../2fcb74f0-add1-4d46-8a0f-8578a616dbed.json | 132 - .../51530638-ef76-43ce-9396-8a0d07988712.json | 132 - .../74d99e4d-0e6f-4804-aa52-0dc76d37fac3.json | 132 - .../80e8b9f0-b507-4927-9d24-1c793e3783cc.json | 132 - .../7b037520-a5e9-4b58-80f3-f0ecc5957c67.json | 132 - .../10b88d05-62d2-4603-9d04-b0854e39ed40.json | 132 - .../4b693f41-d811-4b64-892c-d840eee5ace4.json | 132 - .../90d86c8c-3aa6-42ba-a94f-75c961e65c41.json | 132 - .../8318ae52-6ae3-45ce-82db-73f8cb5ad7c7.json | 132 - .../b20a1d13-2f14-42e4-bdde-49f053cef325.json | 132 - .../51521dfb-d4b5-45df-ac2a-54190aed0b9f.json | 132 - .../997a1ceb-185a-4e6c-8383-eb5a6f976771.json | 132 - .../22101998-c3d3-414f-9ed1-99330cdbe3b2.json | 132 - .../a2408953-a7eb-449c-b80c-3620915d44d0.json | 132 - .../d65e5b08-7d3c-4c0d-85fa-496db65a235c.json | 132 - .../ce2c9614-46d2-481d-ac25-3cc71a93bd5e.json | 132 - .../e9ba998d-8147-4046-afae-9ee7d544e98d.json | 132 - .../c44f1012-1123-42c8-b110-5735dc756fd5.json | 132 - .../5088f6a6-2acf-4d10-8b78-0d5bd4126ab5.json | 132 - .../b4d96088-5cc0-4ebc-8b8b-8c7e9f90420b.json | 132 - .../529dba11-53af-4045-ae46-04e1b9838d4a.json | 132 - .../391f6d6c-418f-44be-910a-fb90b5712649.json | 132 - .../2ccccb4b-7260-4a1a-9426-117e359c7c5c.json | 132 - .../84afecec-453d-491c-9f5a-de31d8fba43e.json | 132 - .../dba3a3a4-cd23-44c9-823f-0bd88cf6465b.json | 132 - .../1179bcce-558e-40ad-8537-c74c59557975.json | 132 - .../fe0a5c17-6c8d-4f06-a58e-47648ef9ecec.json | 132 - .../81cf8cbd-33bc-44ab-930a-65242e1ae7b2.json | 132 - .../173bb053-e817-4551-b169-c3f71163650a.json | 132 - .../b7e6a86f-340c-48ed-a828-2e80a13aa515.json | 132 - .../bd221eee-7aa8-4d6f-a6be-89ee5568e729.json | 132 - .../8727a325-a515-4456-ba34-65c30f84644a.json | 132 - .../3e4011fa-d480-4c16-9371-2025bc834358.json | 132 - .../867499a7-589b-4564-b04d-a004b7c0abb4.json | 132 - .../52f1fb51-fc7e-4cc2-918a-7c7226ae2ce5.json | 132 - .../5f4a8fb6-b22d-4eb2-aaef-da05ca45fbeb.json | 132 - .../3278855d-7bd1-4e7e-b27b-b1393006e7e7.json | 132 - .../5193ab4d-1627-43b5-bfb7-89e08ea1f810.json | 132 - .../598faeda-48fb-43a8-aaa9-849d5dfcea79.json | 132 - .../d1afa2fb-1256-4dd3-b13b-802917bf481b.json | 132 - .../397c9bc3-0af5-453c-9b68-5360783dfbf7.json | 132 - .../9bb39652-c79a-42bf-b6d8-c4ed6174a4c7.json | 132 - .../7e793244-b746-4aa4-a401-dcf5884f61a4.json | 132 - .../26a8da03-debd-41e3-8ee1-2827d76b26ca.json | 132 - .../e214c326-dd84-4915-bba1-faaafbb026b2.json | 132 - .../98a5ea0a-6e45-48f8-8219-32099b9fa9d0.json | 132 - .../40d7d17d-2d41-4d23-83c1-ab5f3320e36e.json | 132 - .../d881a83a-9ba8-4919-8b89-45f5a7220621.json | 132 - .../d6c966a1-7927-424a-9886-b98688d27e6f.json | 132 - .../c09fe163-a7f7-4b6b-b407-ee8d698b2ee8.json | 132 - 
.../b3979c7f-0596-4a24-b264-73a17ba19821.json | 132 - .../f6156893-92e7-4c4f-bff4-8b6d774ecbd8.json | 132 - .../8b1c19e0-8b47-46ae-8bf3-f84c7d3a9c0e.json | 132 - .../6221102e-4e8c-46dd-8c03-fa9e92b7e4ea.json | 132 - .../329e5e91-10ba-4795-ae86-dda95e698b4f.json | 132 - .../3fe89b13-135d-4790-871d-74e7a28ea2e9.json | 132 - .../4b807741-f1b9-4964-9bc9-bb93f9b34217.json | 132 - .../c52a8a4d-be91-4a0d-8cd5-8473a42f0978.json | 132 - .../f6e157c4-0ce9-41c9-b885-9222d894ff0c.json | 132 - .../fe52a94a-5324-4b59-accc-dfd1f9d4aead.json | 132 - .../1241f5e3-54eb-429e-b109-a5e163e39eda.json | 132 - .../8ccc7c8c-1d14-45bb-9a6b-f8f69e506139.json | 132 - .../5531b59e-24c0-41af-ab6b-d6a5e38b0a98.json | 132 - .../63e82cb3-2f6f-4617-abb7-ae093bc27830.json | 132 - .../0feb74e6-40d4-472d-9233-27faa2d3f802.json | 132 - .../e74dd005-c9b5-45c9-b7f5-455c3110e09b.json | 132 - .../d094bf6f-9952-45c7-995e-d7eda07f4668.json | 132 - .../0e5f3393-8a6a-4f2f-948a-a37ae4d8fdeb.json | 132 - .../f91982ac-0cab-415a-8503-e090d195bd05.json | 132 - .../fb1af66e-7828-495b-8277-5cff77c3070e.json | 132 - .../ac84c157-4d11-43c1-8731-b1e5cfa91668.json | 132 - .../bbc812dd-9a9c-4f99-b813-50361025eea3.json | 132 - .../fc818799-49d5-4fca-b131-ebe8d5d831f1.json | 132 - .../33349989-8573-4d71-ae0f-99691fdaffc3.json | 132 - .../91551de5-d8ac-4c0d-b9b4-3627db947f0e.json | 132 - .../c2d2c1f4-aaab-45f1-b3f6-5b4ea56b696e.json | 132 - .../36821a8b-af18-4631-b4b0-7e4b37bb194b.json | 132 - .../e402d129-f4f1-4b95-b079-4f30936119aa.json | 132 - .../814e1ea7-a639-4b05-9208-0bf537ea5479.json | 132 - .../35a50d36-31d0-454b-a13c-80ca26945f94.json | 132 - .../87347017-4ff1-4bd3-a1d7-8f3999061209.json | 132 - .../976184ed-c4ed-4898-83c7-521a8a8309ac.json | 132 - .../fa52f072-7725-4a4e-b728-042e5897a1bd.json | 132 - .../6374dcee-301c-4f28-9316-82ed8e693089.json | 132 - .../b7c95cb4-f32f-466e-a28c-32afd9ec5578.json | 132 - .../bddd742b-f7c9-44aa-ad2f-83f51a4625be.json | 132 - .../099af0ee-c06b-4435-8f97-27681f3eddff.json | 132 - .../fa826f3a-8688-4518-8d44-68189abb47ba.json | 132 - .../10d29dc0-3486-40df-9933-1ce8f0fabaa2.json | 132 - .../741ff375-3392-461e-a9b0-e0dab4e6e9f8.json | 132 - .../c3d709de-118d-40c2-ab89-040efedd7fdb.json | 132 - .../9be3dd27-93fa-49e9-a628-5a77a8a3bb9a.json | 132 - .../be850d1b-bf75-4c34-830f-8881792ac842.json | 132 - .../6b644b97-4fc3-4826-9ea9-68be1dc8e947.json | 132 - .../861d41f1-6d33-4e07-96ea-2c39a36c4b63.json | 132 - .../7501b038-4847-45bc-8b92-6800d7a58c1e.json | 132 - .../db48206d-700b-45f3-b597-8752110113b5.json | 132 - .../b52b76e4-9dec-4336-88b1-d98b95b95d2a.json | 132 - .../ba9ec2ea-2bce-4999-9e48-e1d0795b31d0.json | 132 - .../724221ce-d7b2-43cb-8e16-72ac529a7b60.json | 132 - .../552f3814-d071-4d00-a895-b739dffdcb2d.json | 132 - .../d3819133-bae8-493d-9a86-aee67da5d115.json | 132 - .../5c3a022f-7221-4b4f-ab67-d5b69c558434.json | 132 - .../c161b868-746f-4d88-9f41-eb8283a7b87a.json | 132 - .../f79a76fc-09ff-48c8-b0e7-5f18e0750e6d.json | 132 - .../39f4d1ab-fd42-4746-b949-9666ce32f9d1.json | 132 - .../8348f316-9109-4229-9fee-edc02431befa.json | 132 - .../6b2346c6-5fbf-4195-b3bb-66bbd446ca53.json | 132 - .../8645ffc1-6487-4205-b8b0-e980e094ac6c.json | 132 - .../2c6d1e57-7673-4a86-808e-6ff6a7146a11.json | 132 - .../64ab8b1a-62be-4561-8f0c-e42f1fe37178.json | 132 - .../3eb22885-eb7c-4c85-b79f-cd47ffacd551.json | 132 - .../8956d608-c627-469b-943d-bfad6c7382af.json | 132 - .../9ff060c8-d4fa-4880-a0cd-9581f5c2f574.json | 132 - .../e3d6b3d7-a231-40c1-bac9-0b7fcb478bca.json | 132 - 
.../20acb302-3a74-4425-af4c-a1d719b90a88.json | 132 - .../a8613588-687d-4291-ae5a-57688501cffd.json | 132 - .../83dd67cb-5508-4aa5-9435-d5585b7f3d52.json | 132 - .../26d981bb-f2e5-4195-8d6f-594bb0b26f4a.json | 132 - .../df06c977-b54c-4668-837f-eb583ef24d29.json | 132 - .../31a8ac03-f58b-46e3-9f17-53311b1fd506.json | 132 - .../3e4a7141-7a82-421a-a107-bbac3cbafc9b.json | 132 - .../9a3069f2-81ed-484a-b6e6-a45a259e9a43.json | 132 - .../c0a3d0c3-c541-4606-a925-4100b062284f.json | 132 - .../20685a4b-686f-4cd4-b49d-3067a005256d.json | 132 - .../85a91293-cd51-4f79-8b98-2f4bc67d78c1.json | 132 - .../d2e3a6c2-4e67-4150-b9a8-fec979fb1658.json | 132 - .../c4d686f2-2af1-4271-9556-09380f07ba5f.json | 132 - .../93167303-b38e-43f0-a552-72c26ccb4339.json | 132 - .../b52a176f-f369-4791-a7e3-88a72709c868.json | 132 - .../b6310012-17f1-4ee0-abd0-0079a9299350.json | 132 - .../f581e832-0f77-496e-bcd3-6cfec51ef594.json | 132 - .../47b47c89-b13b-4099-98b2-854feae05f63.json | 132 - .../8d51ae58-7b20-4fa4-b234-2abb9cdeaad4.json | 132 - .../4d4d5679-8ec6-49b8-a5d7-2a76497b44b7.json | 132 - .../0bdb6574-69e2-4858-b7aa-a90a5fadf741.json | 132 - .../fa1a92bb-ad25-4be2-a35f-7fdebbeeeba8.json | 132 - .../d62ea0a1-cc9d-41b7-8d60-479b8e2262b5.json | 132 - .../912446e3-efdf-4ed0-80bd-261c6c87a3d0.json | 132 - .../5e86dc31-ae3e-4ef7-858e-41e29b3a8031.json | 132 - .../80680e5e-ab83-4a59-aeec-9d4166509c47.json | 132 - .../c5bc9c92-8469-4174-aafd-67bb61aaccf2.json | 132 - .../1d67b792-178b-4baa-a108-2362f658bd4e.json | 132 - .../eb0c87b0-4795-4029-82c1-57ce37ba8259.json | 132 - .../dc9b2300-7ab0-4e92-9d23-15fe9ca52994.json | 132 - .../e005624d-c822-4be1-9477-873642aae228.json | 132 - .../e9756d91-b9e2-4dd0-bf08-c6154c7d1f2e.json | 132 - .../704598c3-c5d6-4ce0-bab3-0fa98118e16a.json | 132 - .../fafc9463-d725-4827-8bc1-5cd9e83814b6.json | 132 - .../109820e0-ee00-449c-9ae5-58a7bf1da5f8.json | 132 - .../37f29d5b-d803-4195-9ce0-75e45e32c160.json | 132 - .../43546f48-8c46-4481-b1e5-f4b1ad2535be.json | 132 - .../ec81e0ff-9cb4-4d43-9f78-1d5f4edc9103.json | 132 - .../9290c86f-40b0-4520-b8aa-3460de62c396.json | 132 - .../a4bf576e-9556-4956-8dcb-4d8906d45db0.json | 132 - .../320a5c00-3307-4bc3-9f47-9befb88e461c.json | 132 - .../844d1556-6bc6-467e-a145-f92646770727.json | 132 - .../78923f4b-c2e7-4472-8398-10a0a8453ec5.json | 132 - .../17abe1bf-2e97-409e-88e3-4f661861a195.json | 132 - .../756978e5-1dfe-433e-ba88-339004a50ea7.json | 132 - .../a889ae3a-5d86-4454-bfb9-332c4b61b836.json | 132 - .../2c5e1086-03b7-4cdd-801e-03fb26183076.json | 132 - .../d9578847-b732-4c75-b246-9cdf03674fe0.json | 132 - .../4c6f83fe-7896-4cf3-9434-b5f8d499f5ba.json | 132 - .../619037af-d528-4579-b7e3-58628468d8fb.json | 132 - .../5113b737-8d9f-4321-9a67-91f1aabb40a1.json | 132 - .../641ac372-2e5a-4b44-b22e-a17600a6a868.json | 132 - .../7cbb0b08-871d-48fc-bf3e-86267f5ef19d.json | 132 - .../c82e887c-c8ab-4221-aa0b-e8b7a86e7c46.json | 132 - .../50c65a83-9d08-4155-ad2c-5a2f8ffc8743.json | 132 - .../99d97aef-bb6b-471b-8ed7-f6f92f75842c.json | 132 - .../b98504a0-f1d6-4872-b748-2ca8199c5328.json | 132 - .../5a159667-7460-4a97-884e-6a96df59873b.json | 132 - .../16a2eceb-073d-4dc3-87a7-a15c641c5ebb.json | 132 - .../e8e2d04b-21db-43dc-8b8f-7fa3bba87abc.json | 132 - .../acbb93b3-f8fc-479d-9610-392efd7d4ecc.json | 132 - .../6d0589bd-1f05-44ee-afa5-3657b960d7c9.json | 132 - .../134663d8-05a8-4336-90e2-68e7cba5f1df.json | 132 - .../3bfced28-b06e-46ab-a6aa-171b0c424337.json | 132 - .../b6a83b82-6b05-4437-a076-e2a3982f6169.json | 132 - 
.../f621201b-f571-4487-9f1e-b767675c659d.json | 132 - .../710fdb79-fba4-42da-8e26-45b4caf75207.json | 132 - .../35fa7a5e-8866-4ce3-9899-8737e908f34f.json | 132 - .../2b24b69b-15dc-4666-83f3-c77db545bdbd.json | 132 - .../0d00d849-2147-4fc1-9e5f-d42a95be6ca5.json | 132 - .../f45135b0-3c26-44b5-9922-a6c0817a172d.json | 132 - .../67eb0d6c-9086-4c80-8506-c3e1489f2673.json | 132 - .../79d3dc85-08f6-475c-ac2c-1ff32f5a089f.json | 132 - .../4e9b3fa2-d3d2-4e4c-a1fa-c812f481f64a.json | 132 - .../6e62a8a0-0bdf-4b6c-93de-593423dadd3a.json | 132 - .../871131c1-295d-40a0-a396-09d24b880064.json | 132 - .../44eefbb2-22d4-4dff-889d-a87fc40b2eea.json | 132 - .../cd1de470-a174-4c08-9efe-a06d493dc4b2.json | 132 - .../fdb55a14-0697-4775-8358-fed202498b4f.json | 132 - .../c069a224-638a-4cad-a9ad-e4f8579e8c15.json | 132 - .../10e5c103-f25f-45bb-bfe6-a22876cffe87.json | 132 - .../a9ecca9a-c5d4-45b2-a403-e74a98a46322.json | 132 - .../630d8a60-03b7-4550-82f4-e879b2e01c6c.json | 132 - .../206b5a96-ae07-41fd-822f-436d49c57dcb.json | 132 - .../702d2120-5301-4e03-bb0f-1f8ab19e522a.json | 132 - .../61e39700-c237-49fc-baef-3fa573b3b0c6.json | 132 - .../8892ab84-750d-494f-9f87-ad28e73cf364.json | 132 - .../538a2eb7-34e4-4e78-a382-60a13710096e.json | 132 - .../a041629e-8ed8-4a6c-95ee-98e759501e19.json | 132 - .../09f05984-5815-4b3d-bc73-83ea1e5ecc27.json | 132 - .../6535524e-f8cf-4f2f-9d89-9ba70aedac91.json | 132 - .../08ea4f9d-0e3c-4a8b-85e6-075290d30ba4.json | 132 - .../631f0a1f-a6f5-46f6-9aa0-31ac9764c086.json | 132 - .../b771f6db-7516-4423-9010-3467db0e26e3.json | 132 - .../cf580dfb-2924-4c4b-9352-394275b959bd.json | 132 - .../ba549fe6-7718-4abf-a610-7e0f48611483.json | 132 - .../b92440b1-78a9-4288-a432-f057f2b04a2f.json | 132 - .../838f3932-edf2-4f72-9238-981d1aadc771.json | 132 - .../61e933b2-5cd1-4f08-8a9e-5b06ef54b6d5.json | 132 - .../0b307c78-94c7-418f-bc47-5106b81c30de.json | 132 - .../18783694-3e7b-4d06-9378-5a3fa4a7a0a2.json | 132 - .../dab922e5-1b46-4a90-b75c-1b26cd6cc6d3.json | 132 - .../8cfa1f00-3b26-4d75-9b0a-0dea65e2e352.json | 132 - .../f74d26e6-9dfb-4e81-8522-8309b27760cf.json | 132 - .../2022bcf3-a057-4b0a-aa33-6cf074ffc714.json | 132 - .../a6e79d12-42f6-47ad-95fa-ba03fa4d3a06.json | 132 - .../24d850fe-1817-4041-8767-085f4bd2bac3.json | 132 - .../610a3be1-1032-4079-ba37-d6c2c5f9fd55.json | 132 - .../857bb10e-1b43-4714-a758-0cef5816ba02.json | 132 - .../cdabdd54-6101-471c-9bd8-446953be986b.json | 132 - .../8029cb75-8d3b-411d-b0eb-74539b8ecb2f.json | 132 - .../65d10996-2c5b-4e11-9a07-319c2446a237.json | 132 - .../ef21d739-b122-4ab8-a8ff-a7cfecad5c8e.json | 132 - .../45f3b963-497b-4d89-ac66-9ff0ba8dadf8.json | 132 - .../4173435b-d907-4ac5-a8bd-dfa2759f3fb6.json | 132 - .../b4a79f30-3a04-4f78-861e-1571316a0642.json | 132 - .../53426038-df38-45ba-b621-34231c9cad7f.json | 132 - .../fa758fe5-21ec-45cc-941f-5cb5ca0612b1.json | 132 - .../d2a92a62-3bd0-4cb2-897b-742ea0d5203f.json | 132 - .../8b752519-63d4-4638-b56e-1c45c7f4694e.json | 132 - .../8da71b7c-7b73-453f-998b-84e70b54e471.json | 132 - .../2b7b1216-3ea7-48f1-89f6-e5d84fef2b32.json | 132 - .../37e19712-3197-42da-a8f2-ae1f36c2b06c.json | 132 - .../c6ae6691-64ec-443d-8d76-af614c8cc7f9.json | 132 - .../80567722-8c6b-41b9-8103-3bdaedfdb8ee.json | 132 - .../20192dc4-ea3a-4413-8457-18a592fa0c64.json | 132 - .../8c878c05-86f7-4d61-81d7-9bb286516581.json | 132 - .../fa753be0-4a98-4ec3-9cc9-3bf7b380ad17.json | 132 - .../0516b46b-a957-413f-aadc-58f4339dc60a.json | 132 - .../97200dd7-7ed0-4a7b-ace9-31c173f017f1.json | 132 - 
.../758f8332-ffa8-4059-ac6f-400f9367bb23.json | 132 - .../b1103662-055c-471e-ace8-dd75f607491d.json | 132 - .../27b0d675-498f-4351-b92f-7c0d1a3c83bd.json | 132 - .../3f1f88d4-2908-4f28-b8d3-4f9ded18ba0e.json | 132 - .../3883b0d3-e442-42d3-adc6-ed959c902dd3.json | 132 - .../da172cdb-1388-42f5-97b1-ae8e15291631.json | 132 - .../7c94dbfa-4b3a-43fd-9f2c-b3d63d8ef700.json | 132 - .../7cdd1de0-767d-4527-a024-c67166bb8b20.json | 132 - .../d4702278-54c4-42e8-a901-dfe5c7f2004a.json | 132 - .../149f8ee5-4376-4fcc-8f87-7412a3083570.json | 132 - .../de82b746-c5d7-450a-bc2b-1b2859d91d6b.json | 132 - .../d2a916a6-288a-4761-a3fd-ca674edb67c1.json | 132 - .../cda497f9-c7f9-48d6-944b-0167476e5e5c.json | 132 - .../b56c6c01-a226-4090-9332-330535d79e24.json | 132 - .../0ddc8e10-9cc5-48eb-b5b0-a2c2f071862b.json | 132 - .../2917c469-7e22-497e-8d62-9b9972266658.json | 132 - .../2424d85c-e092-4e7c-bf4f-ae014d08a159.json | 132 - .../90278363-1d8f-47ca-a7dc-c51c6b511dc9.json | 132 - .../3c3197ee-675d-4bb7-874d-28104d2a3cae.json | 132 - .../eb5a8679-bfdd-40f2-9a32-55c04a65ae7e.json | 132 - .../d770f88d-b110-4f27-85e9-e52217c11798.json | 132 - .../364328ce-5de7-401f-ad84-0c76e3c1dc91.json | 132 - .../f7dcfdbb-ff12-4692-9702-712de3d0b7ba.json | 132 - .../d641aa88-9981-4a25-90d5-fcc4564ede52.json | 132 - .../8915e742-df2e-41bc-b83f-3e111edfd257.json | 132 - .../e29a5e35-8677-4e53-83fd-85e919b4366a.json | 132 - .../e5c55d38-dc04-42b4-9aca-ae7be436ebe0.json | 132 - .../504baceb-6684-430d-a532-b7b5b0b061fe.json | 132 - .../31fcd34a-af1e-4eab-bd9a-5ec17eb572d2.json | 132 - .../01ab0a3e-393a-497a-9b32-8af790b7581a.json | 132 - .../541967a6-b856-4dc9-958a-9335197fba99.json | 132 - .../ee31c801-67cb-46a3-9e39-02e842c0473f.json | 132 - .../65fabe8b-05af-461e-b804-fcff3492da34.json | 132 - .../7e1a7121-2c9f-4196-bbdd-48aea257f384.json | 132 - .../dd32609c-316e-4511-8791-fcae33a1a506.json | 132 - .../d95d7058-49eb-47d7-b790-3a253291d22b.json | 132 - .../37cbc3d6-1198-4e23-b86c-1fd979eacd9a.json | 132 - .../76d0d338-e502-4638-adad-c4c4df00c26f.json | 132 - .../f47375bd-547a-4d0b-8c96-bbe2bc1ac445.json | 132 - .../6b1ed68c-3099-4bd7-892b-cdc36c90ccfe.json | 132 - .../0e59c8ca-cde0-4482-ab03-3309bcb8737c.json | 132 - .../d7e900e2-0574-44cd-a68a-0dd2715cf48c.json | 132 - .../fd626c3f-566d-4193-9a85-e7c9a89e671c.json | 132 - .../196b04ae-fd53-400f-9f08-19edd4959f6e.json | 132 - .../57177299-076a-4506-89a7-ce54af08df4f.json | 132 - .../d3bdf36f-7f89-4b5a-b6cb-847b49200b5b.json | 132 - .../92619b9e-dacf-4d0a-9f8b-6e131af74fa4.json | 132 - .../cbb408ea-ced6-4f47-9066-d4ff6d604b1e.json | 132 - .../6999bb02-29fd-4c59-886f-184362afa06e.json | 132 - .../913d1d8e-0b02-4ce5-9b7c-403143a8c880.json | 132 - .../82c87bc0-29cf-4150-92f5-c80fb0028ea6.json | 132 - .../a18834ad-6143-4ce2-9842-471817a60a39.json | 132 - .../be900bcf-8ec9-484f-81db-0e83975c1ecd.json | 132 - .../d226ccf6-674b-44c6-8b11-d782b59a961a.json | 132 - .../d8839a1a-8d07-4e0b-bd44-2668c84f750c.json | 132 - .../e90b04db-2eb3-483a-ab0e-ea8aef821d84.json | 132 - .../900921ae-fbb2-4488-ab19-18987c1d008d.json | 132 - .../0da0a7cd-c075-4bc0-8e88-8acc7212e5c3.json | 132 - .../b50a49cd-2909-4dbe-9c9f-c150abb99845.json | 132 - .../13831d81-a9dd-43c7-bce1-240aad42fbc6.json | 132 - .../56ea7cb3-3a1e-477a-bac8-26a0fde6297a.json | 132 - .../8ce19b33-4f2b-4b8d-80bd-1ed399a5e9dd.json | 132 - .../18ab167d-b72e-4fa9-94a8-09edc641c73f.json | 132 - .../7df237ea-29c0-4d0a-9092-c41df4c13aca.json | 132 - .../e5dc8caa-2d86-4ff0-af8d-22d85c8faeb0.json | 132 - 
.../01591bb6-9daf-40fb-b802-0a007f4cc388.json | 132 - .../f6c32abf-bbae-4827-9ce2-29ce20c9463e.json | 132 - .../74a6605d-3557-4458-bef5-cc9420434e68.json | 132 - .../dbe6e126-d35c-4634-a544-adf374ed5d00.json | 132 - .../d68681c1-01e4-4af0-9a81-e0aaed0ae865.json | 132 - .../de9620b8-7112-436f-8941-fae2c5e7f9e0.json | 132 - .../cafee7ac-deb6-4c4b-af8f-81548648cb14.json | 132 - .../3e3cb617-6f19-4731-b31a-b1f4d88237d5.json | 132 - .../3c2c2c14-d065-4d6c-8c98-44ba8f2ca461.json | 132 - .../8909f916-401b-4457-ab8f-2691696049c6.json | 132 - .../ae191508-7dad-4cac-ad4a-af95d7a15b5d.json | 132 - .../507f5047-fac3-415f-b9fa-aae4311fa837.json | 132 - .../0ee8716c-74f0-41b4-94a2-efc715150293.json | 132 - .../fcf491f4-cf57-4c95-9de1-4702ab5d54c7.json | 132 - .../4fd20259-c7c7-4da5-9013-ae2feb2175b1.json | 132 - .../a7c8c345-cade-48fd-93c0-0f344044d2b5.json | 132 - .../7a8e3986-7688-4a26-a74c-a9bb47cd3e8d.json | 132 - .../7a2ffb4d-1135-42a1-b28b-3b4e4d014979.json | 132 - .../25468720-93d7-4f10-a534-30c4976657e8.json | 132 - .../5ba1d617-9d9a-4c3b-b9cc-3224ace129b3.json | 132 - .../27b2b46f-1323-4ddd-9f65-d8fcd9cd6508.json | 132 - .../65917125-bb7c-4d64-ba5f-b5e4f67ec332.json | 132 - .../30bf22d8-b93a-4775-8073-30e14e15e35d.json | 132 - .../ff510365-a13d-4e44-9709-59a56e864991.json | 132 - .../6d1eebc4-228b-43f3-b31c-3d5b1591ae2d.json | 132 - .../f1e8cdbb-14b7-4959-a053-fb1b37629aff.json | 132 - .../4145d1a0-8d6a-4d64-8a45-a89cf343ac46.json | 132 - .../d6966190-e254-4902-8472-cac59bfbdbe0.json | 132 - .../5fdb5437-f413-451d-9800-42036cda7686.json | 132 - .../347577a4-2768-4472-ba48-9b174ad89724.json | 132 - .../33af440e-837d-4454-9340-af0d3ee74f77.json | 132 - .../1a1f4709-8d05-4905-8105-0c3606d5ef5b.json | 132 - .../28421948-089b-4487-bb71-a06e5ce74402.json | 132 - .../3fa0c783-9226-4fc8-b3a0-6e960684f43d.json | 132 - .../743b7fe2-f998-408c-98b1-af02d9c1ee2a.json | 132 - .../0039c88b-a881-4ce0-9a0a-a10f1a8cbc70.json | 132 - .../87c7fbd9-7648-4d0d-ac9e-8ba85860e335.json | 132 - .../6ca3ab87-c05e-46b5-879d-4fc8bf75417b.json | 132 - .../525f1b9f-88a2-459d-bb4a-7c01a0107968.json | 132 - .../503f79be-7f05-4464-ac9f-0f284f1e7965.json | 132 - .../86ec7d95-6f6d-4ca6-97d5-7a910f42a06d.json | 132 - .../d472ba79-6592-4f8a-a99c-ec3f71468d3e.json | 132 - .../6ddc052c-6bda-4d8e-ad97-20d881c8cfb7.json | 132 - .../76d1aed8-80fe-4b4f-bd81-ea0d6bf085c4.json | 132 - .../d2845d6e-65dd-4448-901d-d554b3e741f3.json | 132 - .../f7dd203f-24d8-4875-878a-12ed99e20cd3.json | 132 - .../287ae246-bee5-4fae-b78f-203491aa8df2.json | 132 - .../9ee493f7-e031-4593-beae-65be17678e00.json | 132 - .../86b10c6f-41c6-4d0a-ae59-f90e204e466c.json | 132 - .../043e3533-7d5c-4d45-bcd8-0dbcc8ca4819.json | 132 - .../1b3269fb-4b16-42b6-80c0-3d54bc2b4fed.json | 132 - .../ee625c29-62c4-49da-9790-e7e67233157d.json | 132 - .../02b16bf2-62bb-401e-9726-2135d8d610be.json | 132 - .../db10c6f9-2962-46cc-aa4e-4c99c4b494d1.json | 132 - .../aa37bda0-2e0a-4361-a5b4-468154d8ac72.json | 132 - .../d9a6565c-5a0b-4893-b6e0-1fc52ec55bf5.json | 132 - .../becf9805-83a9-4137-a938-81a61a10e4f0.json | 132 - .../6e848120-bc31-4628-af05-30707a6dcc41.json | 132 - .../864af855-71b0-4b11-ae3f-56294a7d0db9.json | 132 - .../285bd390-1dd9-4db2-af45-68dea557da3c.json | 132 - .../459e2375-1a15-4129-bee0-dc8852d531e2.json | 132 - .../7b4c7d92-f581-4057-bec9-e3a8c6a5386e.json | 132 - .../7ceab841-f9a3-455b-9314-243d8fc3cd11.json | 132 - .../c1e2fb45-22d8-4eb4-8971-ce89c3048b9e.json | 132 - .../68cb2ca1-1648-41a2-92b7-969bccdca4ee.json | 132 - 
.../5f285d61-5e4b-4c5c-8960-c10313d76ae3.json | 132 - .../3af19898-8590-4aec-b324-46c7fbf596d3.json | 132 - .../e8472266-6d03-439f-bd6b-e3ac5ef2cf09.json | 132 - .../3f578b45-48f9-4022-991c-32a71706aba3.json | 132 - .../ef8c22a7-3898-422e-88e2-1a8c14ab5bf2.json | 132 - .../81630ea2-d496-4872-92b7-e476badaf50d.json | 132 - .../9436d04a-9c81-47ad-a7b8-496e14058627.json | 132 - .../f1e6e54e-cb97-4980-8957-2190ee5c4c34.json | 132 - .../30914dd3-c857-4aaf-b6b9-d1c7e4917e89.json | 132 - .../1c389a32-68b3-47c0-a6b8-2c2291293002.json | 132 - .../e759a217-6571-446d-9bf9-d1512793f307.json | 132 - .../753f3b21-7365-4117-b2a0-a91f03ec3d39.json | 132 - .../297ef102-67c1-4e9c-b418-fed026bb1f8a.json | 132 - .../9fbf73d7-7d67-4d6c-a5b9-efc627cd1b2b.json | 132 - .../b1446577-f13f-434a-a0b4-916091395d4a.json | 132 - .../fc8946aa-8b04-482c-8c05-d026d2af07be.json | 132 - .../fabe3784-948c-4618-9cf0-c76a3ddd3820.json | 132 - .../736dcf09-6a19-4e88-a790-7a7ee74d8717.json | 132 - .../75b4c750-1570-4825-a04a-965c06861fd4.json | 132 - .../b7f8b678-2aea-4d41-ba21-2083fc472574.json | 132 - .../a8010630-58de-448c-af08-70b8ffec431b.json | 132 - .../4a0c2ce5-a4b4-4d35-b65d-bbc6e36a649b.json | 132 - .../1132251a-59c7-402e-9957-f9288864508f.json | 132 - .../e2fac049-8f9f-4b71-bcd3-5746b7d90150.json | 132 - .../d891a1e1-ad65-498f-9ee8-59523c1bfd19.json | 132 - .../9dd3103f-6c4f-4077-ac27-3a9b0f4a5882.json | 132 - .../ca031f70-5785-46d1-8a58-b279d8340776.json | 132 - .../18457711-92b8-4c27-a89a-928fecdf5724.json | 132 - .../3398aeb8-08a8-4be9-a24c-efeabcaa2139.json | 132 - .../707bc006-4318-41bc-b91b-aa43ca7cba6f.json | 132 - .../7bfda919-13be-4b68-8655-99fe6a4605a2.json | 132 - .../f844e739-5f0d-4db4-ba66-bd33b1290571.json | 132 - .../0cde6639-6a89-4682-bb3e-a2a24a1bc8ab.json | 132 - .../87652005-4404-4c45-bd4f-5f63c44adf63.json | 132 - .../a7e0bc2d-784d-4719-ac08-d8fa0c29d178.json | 132 - .../e8ba93e6-6f90-4169-8403-381b7f9e26ab.json | 132 - .../ea86b542-3d06-4e71-b49d-17cdd362b465.json | 132 - .../15615d2c-46a1-47c7-a273-697e97bdf9f2.json | 132 - .../a2b8da3f-c99e-4dba-b4a2-23739281eaf2.json | 132 - .../76f3fa3a-1629-4cdd-b457-3a108784b427.json | 132 - .../c9e979e1-4433-4a38-8fd4-c14895e74f44.json | 132 - .../3f2effba-1ab8-476d-b228-ed9491e83adf.json | 132 - .../a5f0fb1b-27a7-495f-a010-3307afdb8949.json | 132 - .../22f2aa1d-fff1-430a-9c20-3b32859d9665.json | 132 - .../daff0e6f-d29f-4861-855f-902a0cd9a469.json | 132 - .../0f5cb926-b691-4d57-87f5-290235fd250a.json | 132 - .../d9e813da-2966-4901-99f9-c7627c64fc52.json | 132 - .../4cb98a5b-3eb7-4fa8-adfd-17add38d3332.json | 132 - .../f7494fd4-d248-46a6-a46d-f9d8db560aae.json | 132 - .../4b8533d1-7770-435f-ba76-a5c658aabd8f.json | 132 - .../309c7906-0010-4f17-848f-185062d96a26.json | 132 - .../f18ab2ab-098b-4e46-8f8d-433b52cdb81b.json | 132 - .../b4a70c71-dfac-4888-937e-d5220b491b0e.json | 132 - .../b879a534-6b24-4873-a0e4-e18453540121.json | 132 - .../c67ae8f2-596b-4dab-8c4f-768b2f0608b4.json | 132 - .../7766c638-b4dc-4b2d-8c14-becdb1b709ef.json | 132 - .../dd211bef-3940-4d78-8f7b-a67da81d605b.json | 132 - .../87e20b7a-85c8-4845-94b0-ace1e18814cb.json | 132 - .../9ab01db6-3154-4c5b-b6a2-35479538d332.json | 132 - .../9d35316a-011d-4e45-ae57-317b53de621f.json | 132 - .../c9e7fec0-b244-4ca1-a117-a52fdd4671a5.json | 132 - .../0659cb01-0d52-42cb-9e3a-2d8cac01692e.json | 132 - .../98490bb1-70f0-4e7a-8fd6-698ec9fcbd5a.json | 132 - .../6e0f7e7e-8927-436e-95a7-5a7c626ca241.json | 132 - .../9c5b3f4d-6e0b-482b-b142-dd7b387cae22.json | 132 - 
.../04840708-a4cc-407c-8b2a-876b382920a1.json | 132 - .../83b0844c-70fe-4b63-8ed2-4147390518ee.json | 132 - .../9cf10c60-bee1-4f4f-9e03-c3c10287bded.json | 132 - .../8e92dd9e-a68c-46ef-9b03-955c06a21437.json | 132 - .../dd1139d8-2b44-4516-b24a-1219826f5482.json | 132 - .../e37e86f7-b67b-4f0a-b1bd-92f30842b303.json | 132 - .../bc3b55d5-35ca-48b5-832e-8544e145b1b1.json | 132 - .../5757cd3d-c64e-4743-8200-5e610e24bf95.json | 132 - .../ae8cd3ad-ce7b-41f4-8e4a-f11002af2e58.json | 132 - .../bee54048-ebb2-4051-a18f-aa85b0f2ce27.json | 132 - .../2f98c85b-5a2e-467e-9626-b1bdefe7bdd7.json | 132 - .../2c530a3b-888e-4a61-b97b-ea875b30ec9c.json | 132 - .../4c9fb322-735e-4644-8121-088d00f78c5f.json | 132 - .../e7e7733f-682b-4e68-8f07-85f3ba7a7ae1.json | 132 - .../e9a4e1e2-bd55-4c3d-99eb-8fafd8f6ec44.json | 132 - .../42ed92b3-63bc-4fa1-bc16-c19bfb73368f.json | 132 - .../915ae579-786a-4eb2-a1bb-107a12c9c40d.json | 132 - .../3489ffea-a607-4f3d-a0c2-bd17147f244f.json | 132 - .../7b5ba8a8-16c3-4169-b97d-13dd5d4f8395.json | 132 - .../6411c44a-b2b3-4fe3-8ba4-9422a0a0b31e.json | 132 - .../fe344f84-7428-45af-940f-736275bc4d50.json | 132 - .../60956ea2-8b0b-4e4b-801a-d0689f9d46f4.json | 132 - .../1ad54bdc-419a-4dd9-9fbb-d7b7ee7038d1.json | 132 - .../2ab375f0-2477-48a5-a5d9-0b5d0d7d0a84.json | 132 - .../e0525a52-d38c-4b2f-b59b-048b4bf71cb2.json | 132 - .../01bc964f-552b-4cda-9ed0-cf720f0c8de4.json | 132 - .../c9e95c55-978e-485b-8a77-ab2e668e3254.json | 132 - .../c71c606b-ccb7-48e9-a6c8-b72205ec6c06.json | 132 - .../ae1801cb-d112-4d1a-895d-c6743779846a.json | 132 - .../008e3601-dfc4-4bc1-bf8b-f5cef43ae098.json | 132 - .../379b315d-96fb-4edb-b2d6-3dc113a10c17.json | 132 - .../8cd36aa1-6f87-4d4d-a1bf-adc87e0a26c6.json | 132 - .../f76ce244-29f7-44f0-9850-7291f8e4cbf1.json | 132 - .../506871f1-0c87-4e8c-a270-eed7b5da2599.json | 132 - .../c20264fd-b1f9-4e0f-9f6e-1d58f1c18cda.json | 132 - .../59f14dca-923a-41f1-b443-cc3551063f45.json | 132 - .../a1ba054f-b0a1-4827-b7ea-3988aa4cf1f1.json | 132 - .../51d8f53f-ad7e-4dae-9e2a-0895729ff790.json | 132 - .../421119ea-0da8-4b26-a335-f2e720618c44.json | 132 - .../b0e6bfb2-a8d4-4b1d-859a-aa821f646e57.json | 132 - .../7c4c2ccf-7d7b-4d24-802e-20c182290d07.json | 132 - .../95212a55-f382-4869-9e11-cfa201ba865b.json | 132 - .../a7da2118-063c-489f-bb31-40f1b7beeefe.json | 132 - .../9a75ae18-8f9a-40a5-8a7b-0c38df34e9dd.json | 132 - .../a85d4a1f-fbd9-4d21-9700-9e55e30c1391.json | 132 - .../2fd1c45e-209c-43da-ae85-d60887513a96.json | 132 - .../91e0e6aa-b933-4a02-a28d-8d69e698c60a.json | 132 - .../6f3f3d06-2937-4c55-9b95-a62ae5253571.json | 132 - .../9b3ffdd3-ac18-4084-9e83-1bfc61db0ec2.json | 132 - .../60077cbd-87af-4a00-a359-9235acb011ed.json | 132 - .../577936a8-b450-4233-b633-064565b3d1a4.json | 132 - .../470b9413-2cc8-4bf4-9e7c-0b8e99929568.json | 132 - .../3cbf9c73-0dc8-402e-bc94-c6d52b9f1af7.json | 132 - .../3fccb1d0-5ae1-427a-adae-37004ecbacaa.json | 132 - .../6463183f-4043-4b96-b4d1-0bd41b4d6876.json | 132 - .../0b102423-1a06-4e5b-a287-710695658b63.json | 132 - .../b7e4ffd8-2a5a-4364-844a-a308dd7c899c.json | 132 - .../3fa2e3ef-a375-4ca5-9f85-7cb986313d53.json | 132 - .../abd48d9d-0443-40be-a23a-68922771e14f.json | 132 - .../436ff0a4-9907-4e56-a5f2-c97f1b13f81a.json | 132 - .../7a654100-b206-4011-828e-fb386df27d0c.json | 132 - .../2f0e262c-a099-41f4-89f1-8b251708a960.json | 132 - .../7bf3e9ca-7d6f-4d43-b8fe-aceb8d60c7c6.json | 132 - .../8703dbdd-12ef-457b-8cda-f570c8f5c890.json | 132 - .../d77f3e8f-1eea-478e-babd-ba873d2d427c.json | 132 - 
.../783a4385-c802-4bb3-9a21-90629d16efc7.json | 132 - .../bb4ff51e-ce3a-42f5-871e-3e5e8977bc42.json | 132 - .../e80d25b5-3f4b-45a7-9472-09f98db03bf0.json | 132 - .../7fed0b1d-0d79-4784-8fd6-42f8611b1751.json | 132 - .../be534cd3-8245-4370-ba6c-9687b431ee8d.json | 132 - .../e98967b7-3aff-4baa-92eb-eff86bf09797.json | 132 - .../8736a22a-f980-4a01-953d-217f27050129.json | 132 - .../75a2b5c9-7c73-4bb4-8e99-af4a3a27589d.json | 132 - .../0e0ebdc7-a5bd-4314-9bd7-fc8a11541a4e.json | 132 - .../f8579305-003b-4727-b904-bad4f363a616.json | 132 - .../3103f36a-4a88-4a39-8261-0b597f8d6db4.json | 132 - .../eda9de3b-ae53-4102-b203-eddadbc50464.json | 132 - .../b7de4fa8-d97d-400f-bc3f-ecb1963a03ed.json | 132 - .../fa6ecaf9-457e-4135-ad25-4790ebc27737.json | 132 - .../ebaa99c4-ff66-421d-8ba7-dae2c5fa274c.json | 132 - .../e388c707-8b35-49a4-94eb-f32e983fe33e.json | 132 - .../f6273192-31cf-4ee1-af45-c2f62de05330.json | 132 - .../105650e6-d9cf-4106-9d55-6f3c08f2f1cf.json | 132 - .../a1d23749-40c0-4ccb-a104-bf0de63bc2bd.json | 132 - .../4e4b4cf9-48d5-4ff6-92c0-1e9d7b874b6b.json | 132 - .../3c4713a3-3973-4a04-9c4a-a6782251734e.json | 132 - .../de70c700-a007-4e87-a3db-941ee285eb1f.json | 132 - .../a1324a7f-1911-4fa9-8d83-be891f752a61.json | 132 - .../9c4af0df-f538-4755-8cd0-eec6b2b26524.json | 132 - .../fde650a6-a5d1-4edc-bd64-8be806663263.json | 132 - .../96dd1a08-b166-4d8e-ac31-5e948adf931b.json | 132 - .../3b90b9db-a68e-4ee9-bd4d-a18cec357753.json | 132 - .../444a6ace-77d4-4d93-b80b-ff5c7e2f6888.json | 132 - .../7e11a778-fccf-4a91-81cf-c06f1a5c77c4.json | 132 - .../e5d126d7-e0bf-43dc-95c0-184ea1d586ea.json | 132 - .../d05b129c-6b9e-4e6b-80fc-af65db620c5d.json | 132 - .../d9792fac-29c1-45b2-b649-cdebb6830e2f.json | 132 - .../fcc2f06a-e6c8-4c28-bf22-4ee582392912.json | 132 - .../c6e13327-90b3-440d-9367-dbcec54dd6cc.json | 132 - .../30b02429-350c-4d86-aded-ba8597bec4d5.json | 132 - .../7d1ee802-106e-4313-ba1d-72d5a0676c88.json | 132 - .../1b3af020-f65e-44b8-a9a2-ad60fa686427.json | 132 - .../6e40871d-bc23-4f1c-a005-f5b8eb096f84.json | 132 - .../1ab33ed2-ea3b-4c6f-a2ac-2465ddd844f4.json | 132 - .../ec601f5d-bf19-4407-ac41-6b9272d94735.json | 132 - .../87e53761-e8b7-4032-ae7a-c3a91704d115.json | 132 - .../59492d86-4b85-4865-84e9-84ab4ace630c.json | 132 - .../cc082df2-259c-44c1-abe4-ef349056a2a9.json | 132 - .../3f069053-b24e-4242-9302-d46b82e511aa.json | 132 - .../62cd9bcb-a74c-40b9-be84-a0077235ae3c.json | 132 - .../b4cd25f1-87d5-4173-a4d3-928444f6cb37.json | 132 - .../ddd4716e-d8ae-46a1-8fb4-c27e2da40e6e.json | 132 - .../1e5b62a3-018b-429a-b2b4-325545ee99dc.json | 132 - .../958d410e-ce43-44c0-8a56-685c0a618408.json | 132 - .../57c53f20-aa32-49fd-926a-f26c9d0759d4.json | 132 - .../76def522-6fe1-458f-bfbf-99b50ece3367.json | 132 - .../c467bc88-6769-48ac-abd4-867ee38bbe57.json | 132 - .../801681eb-66f4-46e0-bb2b-7ba4b46679af.json | 132 - .../cdd0ea1c-b17a-4816-953c-1d7164c64114.json | 132 - .../b2060893-1f7d-4e7a-a458-3623147ac118.json | 132 - .../cf8aac35-679a-4ebb-bca8-6e0f2d42e71b.json | 132 - .../34bfe887-5a3a-4626-997e-c35d3a0ec341.json | 132 - .../b81acc47-6fd5-4f89-8c70-f8f14b677e04.json | 132 - .../30b977a8-7882-49be-8621-9ee3fce270ec.json | 132 - .../3367fd79-713c-4691-80cd-4abb6b2818ef.json | 132 - .../add899b8-f3e6-4d87-8846-8254f4dfbd5f.json | 132 - .../53829ec0-f233-4b61-a672-6a467823caaa.json | 132 - .../e2b41200-bff2-4835-a0ea-27ff56937570.json | 132 - .../3d33f26d-72be-451e-bcf0-501e0bc2f1db.json | 132 - .../3b4c05fc-2ccf-46db-8d64-045508f6614b.json | 132 - 
.../af83a91c-3b07-48c6-9726-5bd77347f810.json | 132 - .../48759b07-9aea-42bd-8d73-9c4208d2789f.json | 132 - .../68820679-55f4-494d-91a0-0db1bccb8983.json | 132 - .../029774ac-a63d-4acc-a37c-4194e4afdecc.json | 132 - .../146df856-e2c8-41eb-b860-ceb78c126e55.json | 132 - .../74c6bea7-ad16-4f08-a2b7-9c894b9ce207.json | 132 - .../b5e97b2d-d8a2-485a-8b0a-71590e4a376e.json | 132 - .../e79d0a8c-caec-4dec-b119-3229ffa69a73.json | 132 - .../2c760893-b52a-40a9-9420-fb193a62a5c3.json | 132 - .../ef9b84e0-68b0-4caa-9980-96ea5e7f440b.json | 132 - .../fb48aff8-3f6b-4934-9fb8-d72bf8614d6f.json | 132 - .../9450acd9-16b6-49a2-9b73-cf1161b96df3.json | 132 - .../0d50ec2d-5dd4-487e-80cb-9533246a9876.json | 132 - .../f6e6827d-fbf8-49cd-bdad-e8c7ea87550a.json | 132 - .../c5e48fd8-0eea-46a9-8790-1745923561d3.json | 132 - .../870c7739-8886-47df-8e20-09bfae03b9c5.json | 132 - .../d8eb5fd1-f1d4-481d-85af-88a11d7b6f6f.json | 132 - .../6625b2e0-1f65-4dc5-9913-ceb0e82e6439.json | 132 - .../24e7df20-e046-48f7-909e-502d0c70216a.json | 132 - .../7920f562-9e7f-4a64-85f4-584b13af44de.json | 132 - .../c6620817-69fe-40e2-bb0a-1e9c739ab65d.json | 132 - .../520e2d66-4143-493b-8533-64f86c6d676e.json | 132 - .../993bdfd2-3a88-4de3-9ed9-9b7b63c0f4f5.json | 132 - .../4e1be694-cc4d-4943-a8e4-74913cfb2ebe.json | 132 - .../42c174d1-6211-4438-bb9a-24f3cf386a6d.json | 132 - .../625bf39b-a118-4ec6-82d0-5405cf70ba53.json | 132 - .../e09cb198-d259-42ea-a356-6efe61b1e12b.json | 132 - .../5838b130-c2e6-400c-80b7-6822efb5db2c.json | 132 - .../52b51638-64cd-4b19-8fc7-c223d50bc549.json | 132 - .../28b3178b-c963-4267-9649-3f7fc10fba3c.json | 132 - .../748298a2-5042-4636-ac7e-051c28916f3a.json | 132 - .../03bcd4e6-1620-424a-9200-c0cf4b73bbd2.json | 132 - .../c7fba530-63cc-4ece-a171-4a2919aa8057.json | 132 - .../c25c1046-a8d5-4f4b-9a72-c4591cfb4023.json | 132 - .../c3800a5c-310b-41cb-9b07-cfc1f1b13256.json | 132 - .../e8e2b99f-cf83-4776-9117-aa2b5d9c8068.json | 132 - .../2da19e45-117f-446b-b956-b35a20bb7411.json | 132 - .../9e982a33-19cb-4381-8560-884bc8946a2b.json | 132 - .../9130a862-cfd7-47ce-a92a-f60438739491.json | 132 - .../858d3717-fcb2-45d9-8eaa-1b00ae0ca918.json | 132 - .../5f1f137b-cb2f-4ee6-8bc9-5e0b94939f35.json | 132 - .../6feca911-7a6e-43a2-b59d-7cb48070fe8e.json | 132 - .../d3ad9813-273e-47de-be16-312cc67ac64f.json | 132 - .../317205ee-2cc6-4523-9662-be6508314b08.json | 132 - .../3b5fe65a-50a1-4036-b81a-86117356cab9.json | 132 - .../812ac262-97f4-485e-93de-f8d420b8658e.json | 132 - .../39cd7eb0-781e-47b6-8eaa-c72e702f778f.json | 132 - .../9411a8a4-306e-43da-96d7-c93eb3aac398.json | 132 - .../c93feb32-0526-44ac-b3ed-95f08c37cc9f.json | 132 - .../1a3b0f7a-afb6-4002-9321-23a86f000c5c.json | 132 - .../8d29363d-3096-4c54-a40e-acf4a7318a04.json | 132 - .../8cea452d-63b8-4e82-9511-64c94f8e140d.json | 132 - .../5e5b5424-1d48-4a5e-8775-52c75609c338.json | 132 - .../73787033-ed1d-4d2e-b7b2-e886ef6f1036.json | 132 - .../54c9403f-2525-45c0-a585-9ff598f95f6b.json | 132 - .../77d0d88d-7ca8-4f3e-8b79-295f53140635.json | 132 - .../727f27e3-2a3f-4572-8db5-87e498c4b6ca.json | 132 - .../b6e0cc97-27cf-4082-a908-95d5c39014b8.json | 132 - .../3b77ec51-fd47-4bc7-9e96-ed46202fef7c.json | 132 - .../b24cdd3f-3e44-4ebe-b2b4-209ee0bbfbd3.json | 132 - .../e47a3cab-dfef-47f6-9377-9ee32489bab6.json | 132 - .../1e4481fe-458b-4c23-8a6c-55439fb8b4fd.json | 132 - .../6421e9dc-e7ca-4e1c-9f4f-1d1ac409c4d1.json | 132 - .../55f43b53-6ed9-4c16-bf75-c968999a6f36.json | 132 - .../6ce93e70-04b1-46b8-b3e3-7eb0df35e1c1.json | 132 - 
.../95096a89-2baf-4b14-bc6e-1f30e920c086.json | 132 - .../f1651632-2787-47cf-b471-89d1b89a6b01.json | 132 - .../e1fb2ac9-8f60-4dc1-9e0d-99fcb91a53a9.json | 132 - .../d3accbc1-d698-4357-ab08-0b98fb49b4ed.json | 132 - .../5388a25a-5780-4ae1-999f-172b558a7b52.json | 132 - .../9e4143ff-d461-4fdb-8bc7-86f959f69e68.json | 132 - .../5d843bd7-b34b-41d4-92ff-c25a709b4930.json | 132 - .../87975b2f-298b-4297-8f4d-e5bb1bf5d113.json | 132 - .../41bb8174-f3d6-4862-b892-dbc9f6e2e696.json | 132 - .../683ad2cd-5e39-4088-b98b-94d89dda7b88.json | 132 - .../08ffd7ab-ccca-4258-be6d-cbc151cc43aa.json | 132 - .../4b6efad4-c697-4f0a-8d24-75dc49d8ec06.json | 132 - .../4986c30a-85b0-4263-9be4-d69c9b067e0c.json | 132 - .../47b5a878-1a4a-425f-ae6f-ac286f681cca.json | 132 - .../992a6862-46b9-415e-858f-2eff8709ca81.json | 132 - .../c6391381-c973-4068-b72c-af08762d9e5c.json | 132 - .../0f6e18e6-1b0f-43f4-a9af-6632f6ce63cc.json | 132 - .../56d9ee92-6774-4c9b-9861-c5f0a9945e7c.json | 132 - .../d3e753cc-37fc-4d77-8b2d-da90a7843d60.json | 132 - .../eb08ef6f-6631-47c4-8f52-bf9454ad34b6.json | 132 - .../2207b154-c5d4-4e5a-ade0-271e62d6345f.json | 132 - .../f4161154-7777-4261-9275-a3002a1305d8.json | 132 - .../8523812d-1db6-4a9d-b06b-ac904191789d.json | 132 - .../6cd9ea81-618d-444e-a892-d4f9819daa67.json | 132 - .../2217326d-377a-4503-8180-206c12c87436.json | 132 - .../3bbb10fc-e3b9-4c6a-ac35-ee5de9ecd330.json | 132 - .../01124f11-b739-422b-97f7-062074b8d0fb.json | 132 - .../7cc4c93b-7c43-4bed-84a3-fa1cd9130abb.json | 132 - .../bf3aa551-f9c6-4203-b2d4-55cf9e6e2872.json | 132 - .../2eae8905-5338-4a78-86e7-d354d06efa23.json | 132 - .../9dcc4121-e046-49c7-969e-7255b0c32d3d.json | 132 - .../dd7d4acd-549a-467b-b461-0eba5b019122.json | 132 - .../159969cc-32c5-4f6f-b586-8e6d44180b44.json | 132 - .../b80e559d-e519-4678-8abc-ee5591b81fac.json | 132 - .../90c137c9-939d-4e77-9fcc-9e33551a6121.json | 132 - .../f25d6fef-d337-4cf7-ba05-ca6ff5eccd52.json | 132 - .../c6f92306-dcdc-4549-bfc2-feb62a3a6ef6.json | 132 - .../96c64d23-d23d-486c-83a4-4c0ab4f09d60.json | 132 - .../243abf0b-0f88-4b4f-ab51-6c8aebaf19be.json | 132 - .../438fb728-d6ad-4c28-a43c-ff82d522cd50.json | 132 - .../94b45b8d-b754-4fb4-843d-b7ffeafc4f1b.json | 132 - .../5618fc82-d455-4261-8e34-1190d70fd3f3.json | 132 - .../395f6339-3fca-4f4d-befc-2d231008efdd.json | 132 - .../b22696ac-7074-44f2-b72f-c59ca0a41ce6.json | 132 - .../6856f8b6-a719-4f69-be71-4df582015f28.json | 132 - .../f2c0ea2b-76ae-4469-832e-84c0b79fa283.json | 132 - .../5619e3cb-eb3e-4420-a156-6f7b2a5d372d.json | 132 - .../9d5e329f-491a-4608-bcac-1ee63046b34a.json | 132 - .../80953f08-6530-4bab-a375-cc542081aabb.json | 132 - .../0b8691a8-f394-4da3-a67b-faa1af9b42c9.json | 132 - .../fb541a2b-d9bd-4aa2-8b83-da62a3b77731.json | 132 - .../c20d1c62-d3e0-4e30-b0d3-4c62a6585d23.json | 132 - .../8a10eeb6-7178-4c78-8940-68fad78e389b.json | 132 - .../f0bb774c-a842-4261-b817-b169ce65a493.json | 132 - .../59afe234-3a7f-49bb-873c-df6cf793e5e5.json | 132 - .../4074081a-66a6-42e4-994f-72541f90888b.json | 132 - .../6a618ec8-c029-49ec-9ea5-da52b5231280.json | 132 - .../edc8f510-c961-4c1f-9757-e80c4247f275.json | 132 - .../aaa5d1e6-5aca-4471-87ea-7195610a6c1d.json | 132 - .../89b45e8b-9979-4c7f-8aa6-c6ab7009cab0.json | 132 - .../41000c74-8b29-4369-996f-cf3a2fd09f63.json | 132 - .../a1765846-74e1-440a-8851-12a571444059.json | 132 - .../9c6b594f-387a-42a3-9e40-3b26363e6071.json | 132 - .../2b910401-457a-45dd-920a-559f4595897b.json | 132 - .../90b7be49-53a0-4d7f-8995-cbc52fe3a70f.json | 132 - 
 5320 files changed, 1099844 deletions(-)
 delete mode 100644 data/global-mmlu-lite/alibaba/qwen3-235b-a22b-instruct-2507/c8ab4e94-d8e8-417f-be18-fececf3c815c.json
 delete mode 100644 data/global-mmlu-lite/anthropic/claude-3-5-haiku-20241022/402c8833-1827-46fc-a497-46b40a6794ff.json
 delete mode 100644 data/global-mmlu-lite/anthropic/claude-3-7-sonnet-20250219/acd2082a-ce0c-418f-9383-f3c9f11735a2.json
 delete mode 100644 data/global-mmlu-lite/anthropic/claude-opus-4-1-20250805/c65ed336-b283-46c2-8284-c4695cad588d.json
 delete mode 100644 data/global-mmlu-lite/anthropic/claude-sonnet-4-20250514/5ebb009d-b548-4f2b-b075-feb76ca295d2.json
 delete mode 100644 data/global-mmlu-lite/cohere/command-a-03-2025/c7df2916-bde4-4987-9139-fcfd18a14ac1.json
 delete mode 100644 data/global-mmlu-lite/deepseek/deepseek-r1-0528/56ec8ab0-d76d-4c03-953b-a2a4a43af5f4.json
 delete mode 100644 data/global-mmlu-lite/deepseek/deepseek-v3.1/ad3211a9-4390-4247-b64d-600191a88a75.json
 delete mode 100644 data/global-mmlu-lite/google/gemini-2.5-flash-preview-05-20/1a34326a-f75e-434c-a027-9f8cf7fe8fb9.json
 delete mode 100644 data/global-mmlu-lite/google/gemini-2.5-flash/129c8b21-f97e-4284-9574-33d5932332f7.json
 delete mode 100644 data/global-mmlu-lite/google/gemini-2.5-pro/3644fd67-0f46-4de3-b542-edf219d0e0cd.json
 delete mode 100644 data/global-mmlu-lite/google/gemini-3-pro-preview/c0692e14-6484-4d02-8dac-55ce4373fb15.json
 delete mode 100644 data/global-mmlu-lite/google/gemma-3-27b-it/ab4940d1-118c-479a-bd37-1ea2da6f02a3.json
 delete mode 100644 data/global-mmlu-lite/google/gemma-3-4b-it/85552093-435f-4d85-897d-4e74c3655533.json
 delete mode 100644 data/global-mmlu-lite/mistralai/mistral-medium-3/4ddc0062-6577-4ab9-85f1-791fd2822776.json
 delete mode 100644 data/global-mmlu-lite/mistralai/mistral-small-2503/50fc4840-933b-43ec-847e-1834b30f9f14.json
 delete mode 100644 data/global-mmlu-lite/openai/gpt-4.1-2025-04-14/6cdc5384-2be5-47e0-a9b2-9cd6719c1760.json
 delete mode 100644 data/global-mmlu-lite/openai/gpt-5-2025-08-07/a668c931-34e4-4702-a84c-97d8c6f59ef4.json
 delete mode 100644 data/global-mmlu-lite/openai/o3-mini-2025-01-31/3a7e2aa6-4e57-446f-a127-4a7e022fe3e1.json
 delete mode 100644 data/global-mmlu-lite/unknown/aya-expanse-32b/938a35f1-195d-49c8-9a16-90fab96692bd.json
 delete mode 100644 data/global-mmlu-lite/unknown/granite-4.0-h-small/ce756801-f75e-4250-9721-1d627a37f055.json
 delete mode 100644 data/global-mmlu-lite/unknown/o4-mini-2025-04-16/b83b41d4-6c95-4c7d-a290-65d89bf776c2.json
 delete mode 100644 data/global-mmlu-lite/xai/grok-3-mini/31c3fe1b-be4b-42ef-8ec0-9da323b2ebb6.json
 delete mode 100644 data/global-mmlu-lite/xai/grok-4-0709/a8e0fc0e-b3a4-4a0b-938f-aa11f1c64358.json
 delete mode 100644 data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json
 delete mode 100644 data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json
 delete mode 100644 data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json
 delete mode 100644 data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json
 delete mode 100644 data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json
 delete mode 100644 data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json
 delete mode 100644 data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json
 delete mode 100644 data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json
 delete mode 100644 data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json
 delete mode 100644 data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json
 delete mode 100644 data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json
 delete mode 100644 data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json
 delete mode 100644 data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json
 delete mode 100644 data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json
 delete mode 100644 data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json
 delete mode 100644 data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json
 delete mode 100644 data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json
 delete mode 100644 data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json
 delete mode 100644 data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json
 delete mode 100644 data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json
 delete mode 100644 data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json
 delete mode 100644 data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json
 delete mode 100644 data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json
 delete mode 100644 data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json
 delete mode 100644 data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json
 delete mode 100644 data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json
 delete mode 100644 data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json
 delete mode 100644 data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json
 delete mode 100644 data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json
 delete mode 100644 data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json
 delete mode 100644 data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json
 delete mode 100644 data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json
 delete mode 100644 data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json
 delete mode 100644 data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json
 delete mode 100644 data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json
 delete mode 100644 data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json
 delete mode 100644 data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json
 delete mode 100644 data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json
 delete mode 100644 data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json
 delete mode 100644 data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json
 delete mode 100644 data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json
 delete mode 100644 data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json
 delete mode 100644 data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json
 delete mode 100644 data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json
 delete mode 100644 data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json
 delete mode 100644 data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json
 delete mode 100644 data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json
 delete mode 100644 data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json
 delete mode 100644 data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json
 delete mode 100644 data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json
 delete mode 100644 data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json
 delete mode 100644 data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json
 delete mode 100644 data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json
 delete mode 100644 data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json
 delete mode 100644 data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json
 delete mode 100644 data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json
 delete mode 100644 data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json
 delete mode 100644 data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json
 delete mode 100644 data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json
 delete mode 100644 data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json
 delete mode 100644 data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json
 delete mode 100644 data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json
 delete mode 100644 data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json
 delete mode 100644 data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json
 delete mode 100644 data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json
 delete mode 100644 data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json
 delete mode 100644 data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json
 delete mode 100644 data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json
 delete mode 100644 data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json
 delete mode 100644 data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json
 delete mode 100644 data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json
 delete mode 100644 data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json
 delete mode 100644 data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json
 delete mode 100644 data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json
 delete mode 100644 data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json
 delete mode 100644 data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json
 delete mode 100644 data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json
 delete mode 100644 data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json
 delete mode 100644 data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json
 delete mode 100644 data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json
 delete mode 100644 data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json
 delete mode 100644 data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json
 delete mode 100644 data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json
 delete mode 100644 data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json
 delete mode 100644 data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json
 delete mode 100644 data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json
 delete mode 100644 data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json
 delete mode 100644 data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json
 delete mode 100644 data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json
 delete mode 100644 data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json
 delete mode 100644 data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json
 delete mode 100644 data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json
 delete mode 100644 data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json
 delete mode 100644 data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json
 delete mode 100644 data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json
 delete mode 100644 data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json
 delete mode 100644 data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json
 delete mode 100644 data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json
 delete mode 100644 data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json
 delete mode 100644 data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json
 delete mode 100644 data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json
 delete mode 100644 data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json
 delete mode 100644 data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json
 delete mode 100644 data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json
 delete mode 100644 data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json
 delete mode 100644 data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json
 delete mode 100644 data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json
 delete mode 100644 data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json
 delete mode 100644 data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json
 delete mode 100644 data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json
 delete mode 100644 data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json
 delete mode 100644
data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json delete mode 100644 data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json delete mode 100644 data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json delete mode 100644 data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json delete mode 100644 data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json delete mode 100644 data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json delete mode 100644 data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json delete mode 100644 data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json delete mode 100644 data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json delete mode 100644 data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json delete mode 100644 data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json delete mode 100644 data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json delete mode 100644 data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json delete mode 100644 data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json delete mode 100644 data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json delete mode 100644 data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json delete mode 100644 data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json delete mode 100644 data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json delete mode 100644 data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json delete mode 100644 data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json delete mode 100644 data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json delete mode 100644 data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json delete mode 100644 data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json delete mode 100644 data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json delete mode 100644 data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json delete mode 100644 data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json delete mode 100644 data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json delete mode 100644 data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json delete mode 100644 data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json delete mode 100644 data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json delete mode 100644 data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json delete mode 100644 data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json delete mode 100644 data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json delete mode 100644 data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json delete mode 100644 data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json delete mode 100644 
data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json delete mode 100644 data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json delete mode 100644 data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json delete mode 100644 data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json delete mode 100644 data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json delete mode 100644 data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json delete mode 100644 data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json delete mode 100644 data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json delete mode 100644 data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json delete mode 100644 data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json delete mode 100644 data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json delete mode 100644 data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json delete mode 100644 data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json delete mode 100644 data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json delete mode 100644 data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json delete mode 100644 data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json delete mode 100644 data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json delete mode 100644 data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json delete mode 100644 data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json delete mode 100644 data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json delete mode 100644 data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json delete mode 100644 data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json delete mode 100644 data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json delete mode 100644 data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json delete mode 100644 data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json delete mode 100644 data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json delete mode 100644 data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json delete mode 100644 data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json delete mode 100644 data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json delete mode 100644 data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json delete mode 100644 data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json delete mode 100644 data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json delete mode 100644 data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json delete mode 100644 data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json delete mode 100644 data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json delete mode 100644 
data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json delete mode 100644 data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json delete mode 100644 data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json delete mode 100644 data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json delete mode 100644 data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json delete mode 100644 data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json delete mode 100644 data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json delete mode 100644 data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json delete mode 100644 data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json delete mode 100644 data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json delete mode 100644 data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json delete mode 100644 data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json delete mode 100644 data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json delete mode 100644 data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json delete mode 100644 data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json delete mode 100644 data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json delete mode 100644 data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json delete mode 100644 data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json delete mode 100644 data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json delete mode 100644 data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json delete mode 100644 data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json delete mode 100644 data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json delete mode 100644 data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json delete mode 100644 data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json delete mode 100644 data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json delete mode 100644 data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json delete mode 100644 data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json delete mode 100644 data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json delete mode 100644 data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json delete mode 100644 data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json delete mode 100644 data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json delete mode 100644 data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json delete mode 100644 data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json delete mode 100644 data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json delete mode 100644 
data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json delete mode 100644 data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json delete mode 100644 data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json delete mode 100644 data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json delete mode 100644 data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json delete mode 100644 data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json delete mode 100644 data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json delete mode 100644 data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json delete mode 100644 data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json delete mode 100644 data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json delete mode 100644 data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json delete mode 100644 data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json delete mode 100644 data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json delete mode 100644 data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json delete mode 100644 data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json delete mode 100644 data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json delete mode 100644 data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json delete mode 100644 data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json delete mode 100644 data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json delete mode 100644 data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json delete mode 100644 data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json delete mode 100644 data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json delete mode 100644 data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json delete mode 100644 data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json delete mode 100644 data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json delete mode 100644 data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json delete mode 100644 data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json delete mode 100644 data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json delete mode 100644 data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json delete mode 100644 data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json delete mode 100644 data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json delete mode 100644 data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json delete mode 100644 data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json delete mode 100644 data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json delete mode 100644 data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json delete mode 100644 
data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json delete mode 100644 data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json delete mode 100644 data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json delete mode 100644 data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json delete mode 100644 data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json delete mode 100644 data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json delete mode 100644 data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json delete mode 100644 data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json delete mode 100644 data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json delete mode 100644 data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json delete mode 100644 data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json delete mode 100644 data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json delete mode 100644 data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json delete mode 100644 data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json delete mode 100644 data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json delete mode 100644 data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json delete mode 100644 data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json delete mode 100644 data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json delete mode 100644 data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json delete mode 100644 data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json delete mode 100644 data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json delete mode 100644 data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json delete mode 100644 data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json delete mode 100644 data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json delete mode 100644 data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json delete mode 100644 data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json delete mode 100644 data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json delete mode 100644 data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json delete mode 100644 data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json delete mode 100644 data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json delete mode 100644 data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json delete mode 100644 data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json delete mode 100644 data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json delete mode 100644 data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json delete mode 100644 
data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json delete mode 100644 data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json delete mode 100644 data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json delete mode 100644 data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json delete mode 100644 data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json delete mode 100644 data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json delete mode 100644 data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json delete mode 100644 data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json delete mode 100644 data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json delete mode 100644 data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json delete mode 100644 data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json delete mode 100644 data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json delete mode 100644 data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json delete mode 100644 data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json delete mode 100644 data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json delete mode 100644 data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json delete mode 100644 data/hfopenllm_v2/0-hero/Matter-0.2-7B-DPO/0d7928c3-c769-474e-8249-7a5c70c4c559.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-34B-32K/f63536ed-752b-4538-9b92-2514a617a4bf.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat-16K/8ff13de2-ea43-4392-992f-ba70b6023e96.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat/02bac8a7-bd09-4e73-979a-7dbaa7a8ed75.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-34B/74e4406d-b2b6-4c3f-b059-f52cccf1fff4.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-6B-Chat/ec8a6d6c-b8ea-48a3-9af6-d357e0057ec1.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-6B/05307b41-d832-4533-99bd-c8608bf8e64c.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-9B-32K/c09bd9b0-6f85-4120-94a9-b628c68bccb7.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat-16K/9f971385-1146-4436-91a6-0e52d4db1f07.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat/80ed14ca-b4cd-4ceb-8fdb-24705e47bd0e.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-9B/db88e3f5-58a9-4783-9093-a6df96483342.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-34B-200K/8cd90f8a-d8dc-469b-95b9-260fcef804d2.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-34B-Chat/b2c82703-2b5c-407d-b84f-a8f8261ac894.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-34B/55462e67-5eca-4e9d-9095-51fcf12de5fa.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-6B-200K/25a119f0-5eaa-4fa9-8cd4-e0f437ada456.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-6B-Chat/efc036b6-d8de-4393-87a1-d4f86fb44d91.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-6B/a5144406-eb85-43b2-a49d-be6b06d6b04a.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-9B-200K/900184ad-656d-416b-956f-5f6e3a991d1b.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-9B/7a58954a-5d7d-4640-99fd-773249640237.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-Coder-9B-Chat/4ea3146c-b912-424a-b0a9-7c37348348c8.json delete mode 
100644 data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct/b0276278-6d86-49c0-a246-cd9110ac1deb.json delete mode 100644 data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi/04216f67-1385-43bf-b7de-5bae7a60f379.json delete mode 100644 data/hfopenllm_v2/1024m/PHI-4-Hindi/fbf7b76b-7ced-4217-8e14-1d02184e271c.json delete mode 100644 data/hfopenllm_v2/1024m/QWEN-14B-B100/74ac8aba-6dfb-464c-81b5-d02a9192b9cc.json delete mode 100644 data/hfopenllm_v2/152334H/miqu-1-70b-sf/295938e1-ade2-4d36-beca-3cbe506b5b90.json delete mode 100644 data/hfopenllm_v2/1TuanPham/T-VisStar-7B-v0.1/f331782f-ea09-41bd-8c6a-e964c88d7e09.json delete mode 100644 data/hfopenllm_v2/1TuanPham/T-VisStar-v0.1/e4e3d79a-1de9-43be-a029-0be4f60e472b.json delete mode 100644 data/hfopenllm_v2/3rd-Degree-Burn/L-3.1-Science-Writer-8B/6914ac28-b543-4f36-81f1-f7491c018e3b.json delete mode 100644 data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot-v1/b7378f41-46ab-41af-94cc-e7fb10738658.json delete mode 100644 data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot/acedae59-6192-4ac4-a354-d520ecd6ba36.json delete mode 100644 data/hfopenllm_v2/3rd-Degree-Burn/Llama-Squared-8B/ff105961-761d-4261-8a44-20acf2e7f440.json delete mode 100644 data/hfopenllm_v2/4season/final_model_test_v2/fa0901f6-514e-44ae-84dc-0b793f26169e.json delete mode 100644 data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-Instruct-preview/d2dff5df-343b-40f3-85de-14eb72dab050.json delete mode 100644 data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-SFT-preview/8fa3010f-b7a1-4fc1-9156-ba70453add86.json delete mode 100644 data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K-100steps/58034f99-3b01-46d6-aea9-90c75d073bb0.json delete mode 100644 data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K/e6c08c9c-6d01-45c7-8a24-219b756b8632.json delete mode 100644 data/hfopenllm_v2/AELLM/gemma-2-aeria-infinity-9b/cd97ad01-1d20-4cbd-a9bb-2acf3d9fdcc7.json delete mode 100644 data/hfopenllm_v2/AELLM/gemma-2-lyco-infinity-9b/95f44ef8-e5ba-4bdc-97a7-2c5a678b07be.json delete mode 100644 data/hfopenllm_v2/AGI-0/Art-v0-3B/082f25f0-994c-438a-8086-b1e439aca466.json delete mode 100644 data/hfopenllm_v2/AGI-0/Artificium-llama3.1-8B-001/31423cbd-08cd-4079-b1c5-ba412acf1b51.json delete mode 100644 data/hfopenllm_v2/AGI-0/smartllama3.1-8B-001/2669bd86-da65-4d87-8464-bfa8c741ce0b.json delete mode 100644 data/hfopenllm_v2/AI-MO/NuminaMath-7B-CoT/ab2c19ff-5671-446f-b09e-731e2ae515ca.json delete mode 100644 data/hfopenllm_v2/AI-MO/NuminaMath-7B-TIR/36250dc3-cb51-43be-8ab0-6788eb5bda7c.json delete mode 100644 data/hfopenllm_v2/AI-Sweden-Models/Llama-3-8B-instruct/cd616d6a-151f-4aaa-93b5-9c4a758f95b5.json delete mode 100644 data/hfopenllm_v2/AI-Sweden-Models/gpt-sw3-40b/9cb09cae-9b1b-43b1-afbf-f44b0a44053c.json delete mode 100644 data/hfopenllm_v2/AI4free/Dhanishtha/038c32da-add5-4299-ac17-df6ef3fdea58.json delete mode 100644 data/hfopenllm_v2/AI4free/t2/25eb4bdf-beb4-4ad2-a5e9-3a2f31c46cb5.json delete mode 100644 data/hfopenllm_v2/AIDC-AI/Marco-o1/77655d60-872f-468a-acc6-d584ef5bf46a.json delete mode 100644 data/hfopenllm_v2/Aashraf995/Creative-7B-nerd/4de378c8-ccf6-4f0b-8287-3d138a8645b9.json delete mode 100644 data/hfopenllm_v2/Aashraf995/Gemma-Evo-10B/8039cadf-6644-44e7-8452-90e9c8069e28.json delete mode 100644 data/hfopenllm_v2/Aashraf995/Qwen-Evo-7B/8914d89d-c873-4704-998e-dc807e96030b.json delete mode 100644 data/hfopenllm_v2/Aashraf995/QwenStock-14B/c2e9fc29-db07-4b49-a98a-084158831ac4.json delete mode 100644 data/hfopenllm_v2/AbacusResearch/Jallabi-34B/58724539-6fc5-40d9-ba43-87410959894d.json delete 
mode 100644 data/hfopenllm_v2/Ahdoot/StructuredThinker-v0.3-MoreStructure/b13324cf-f6f5-4bf1-9cf3-c196120c4bcf.json delete mode 100644 data/hfopenllm_v2/Ahdoot/Test_StealthThinker/782b2df0-d1b3-414c-a4bd-59052a4441a9.json delete mode 100644 data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder/b508e41e-0f1c-49ce-8b80-5e7ec82b8f15.json delete mode 100644 data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0/2824e8d4-2749-4b18-a3a1-b987ed215ac6.json delete mode 100644 data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1.1/53176984-ba93-4a64-b81e-21f6e0f65bcd.json delete mode 100644 data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1/53252698-7d17-4f2a-9106-3b744ae7a985.json delete mode 100644 data/hfopenllm_v2/Alepach/notHumpback-M0/6dd0f3a2-27ee-48f1-9d97-ef6954d298c8.json delete mode 100644 data/hfopenllm_v2/Alepach/notHumpback-M1-v2/35f11d5e-88c4-4a95-8d06-a40bee648b00.json delete mode 100644 data/hfopenllm_v2/Alepach/notHumpback-M1/ba1193c0-42b8-487d-b9fd-ddbc1fd15359.json delete mode 100644 data/hfopenllm_v2/Alibaba-NLP/gte-Qwen2-7B-instruct/95733620-e1e7-4442-b9c3-a699165df5e7.json delete mode 100644 data/hfopenllm_v2/Alsebay/Qwen2.5-7B-test-novelist/cacfce0d-f5f1-4101-8065-f5f02eaab1fb.json delete mode 100644 data/hfopenllm_v2/Amaorynho/BBAI2006/72be5537-198a-43e9-9840-a803083158d3.json delete mode 100644 data/hfopenllm_v2/Amaorynho/BBAI270V4/2e9a3443-970d-4f37-a356-277a11c81754.json delete mode 100644 data/hfopenllm_v2/Amaorynho/BBAIIFEV1/1188402f-aa1c-4306-b031-c92ff0a5dd64.json delete mode 100644 data/hfopenllm_v2/Amaorynho/BBAI_375/ee2f567a-6403-46d5-9a6b-bd029f81d660.json delete mode 100644 data/hfopenllm_v2/Amu/t1-1.5B/d809fdff-f5ff-44f5-afc7-7e8af9ce2f93.json delete mode 100644 data/hfopenllm_v2/Amu/t1-3B/87d66efc-173f-4c14-b76c-d8b7e00d575d.json delete mode 100644 data/hfopenllm_v2/ArliAI/ArliAI-RPMax-12B-v1.1/47f62378-c3cc-408f-a0d1-71eb3f522f57.json delete mode 100644 data/hfopenllm_v2/ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1/dba8c12c-388d-4f8b-8ce8-83acfc4920c7.json delete mode 100644 data/hfopenllm_v2/Arthur-LAGACHERIE/Precis-1B-Instruct/e4087285-1d1a-465e-ac88-91310e939710.json delete mode 100644 data/hfopenllm_v2/Artples/L-MChat-7b/09f189d9-74fd-47bb-b5fb-7994cba56ae2.json delete mode 100644 data/hfopenllm_v2/Artples/L-MChat-Small/5754c262-6ddf-4f54-9722-22ff20a8d76f.json delete mode 100644 data/hfopenllm_v2/Aryanne/QwentileSwap/cc1bd811-ec88-4514-8b47-4140ded4f03d.json delete mode 100644 data/hfopenllm_v2/Aryanne/SHBA/3f08155d-8551-4472-86fe-7988cd6df78b.json delete mode 100644 data/hfopenllm_v2/Aryanne/SuperHeart/339e12fb-b4a4-4a4b-bb40-899b4ad833f9.json delete mode 100644 data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/4fd60e9c-5c90-492a-b24d-7ca6d1e91eae.json delete mode 100644 data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/7f8d935e-3782-4769-8bd0-ee8a0ce91cd6.json delete mode 100644 data/hfopenllm_v2/Ateron/Glowing-Forest-12B/6fa07e60-9f82-4abc-aa45-4dfc0bcf9b8d.json delete mode 100644 data/hfopenllm_v2/Ateron/Lotus-Magpic/99a0022b-3fe7-4612-9cbb-cf082c1f6b70.json delete mode 100644 data/hfopenllm_v2/Ateron/Way_of_MagPicaro/b1153714-d6fe-4ff9-ab8c-85b677d57f8f.json delete mode 100644 data/hfopenllm_v2/AuraIndustries/Aura-4B/c3d39b6c-02af-410d-8a5c-224495b04572.json delete mode 100644 data/hfopenllm_v2/AuraIndustries/Aura-8B/0426fcba-3db4-492d-b622-e34ab8d3fc8f.json delete mode 100644 data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B-v2/aa099cfe-ac9a-42dd-8357-f4d8115133ca.json delete mode 100644 
data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B/ccbc8a5e-9a97-452a-b023-cc996ffe31f1.json delete mode 100644 data/hfopenllm_v2/Aurel9/testmerge-7b/b359a7a3-cf2c-4952-b308-333672dadcec.json delete mode 100644 data/hfopenllm_v2/Ayush-Singh/Llama1B-sft-2/0864d5cf-d6fe-42bc-9059-9f2e5ff06b60.json delete mode 100644 data/hfopenllm_v2/Azure99/Blossom-V6-14B/e6ef2559-8a63-43e3-a60b-0d2b7256ad3d.json delete mode 100644 data/hfopenllm_v2/Azure99/Blossom-V6-7B/45d019ab-b23c-4fc3-baf5-d57576e9945c.json delete mode 100644 data/hfopenllm_v2/Azure99/blossom-v5-32b/e3cd7c32-e5a1-4cd6-a9dc-95364a8abe75.json delete mode 100644 data/hfopenllm_v2/Azure99/blossom-v5-llama3-8b/9be442e8-4b77-43e0-a981-887338e59b78.json delete mode 100644 data/hfopenllm_v2/Azure99/blossom-v5.1-34b/a07b6326-f393-490e-b696-d8b45f593d4b.json delete mode 100644 data/hfopenllm_v2/Azure99/blossom-v5.1-9b/b66ed91a-98d5-407c-9896-9c2e2a31e9da.json delete mode 100644 data/hfopenllm_v2/BAAI/Gemma2-9B-IT-Simpo-Infinity-Preference/9c70921d-956b-4727-9201-1addbd01bb8b.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Llama3-70B/4ba6d51e-314a-4db4-9552-568a4093e01a.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Mistral-7B/835f5056-56bf-4a6c-886f-fbe6f263ac07.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-70B/c2a63afa-9d25-41dc-b25f-848f5a640501.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-8B/f64f9d24-e448-4bb6-89c3-edb66499bac9.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Mistral-7B/2de14bfb-844a-4711-815e-8f63487a78fd.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Qwen2-7B/f953e0e2-ddca-42a2-a0f6-752a137bc6b5.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Yi-1.5-9B/98187b98-0cc8-4756-9cb7-c53deb998f90.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-Llama3_1-8B/8c79c60d-ebf4-4409-be4f-928a54cedd1d.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-mistral-7B/5d5cebeb-faf0-4fdf-8749-6307080e82f2.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-70B/e926ce8f-45bb-4f3d-b579-ecadb3df6468.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-8B/070609d6-5f41-4712-9ad7-e215b1a6bb81.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-mistral-7B/8d2909c7-37f2-4198-a1e2-4bf2ebc1444d.json delete mode 100644 data/hfopenllm_v2/BAAI/OPI-Llama-3.1-8B-Instruct/53587959-25f9-43aa-a34b-f274d8bc93af.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/Meta-Llama-3-8Bee/2a7f80ed-d404-4c81-b000-b65c83069121.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/smol_llama-101M-GQA/f0983645-4adb-4ddb-bf2f-33480cb7f421.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA-fineweb_edu/161dadfe-4983-4f56-8a7d-9b97f1c5a3c7.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA/694a02f9-4729-4d0b-97ce-80adaef29be2.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-openhermes/0521f51d-22c1-4821-8f04-23c533411668.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024/8fdea71b-5e68-4a78-aefc-8a00650464c4.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan/e2ba5674-9251-4a4e-9eb8-046c834da400.json delete mode 100644 
data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-instruct_2e/4caafdb2-3065-40d4-b5a7-9deb41e1d8a7.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-instruct-orpo/886e0b8b-b2dc-434f-a299-50f668006241.json delete mode 100644 data/hfopenllm_v2/BSC-LT/salamandra-7b-instruct/7a6a9443-f331-4dfa-acf9-6aa30049bade.json delete mode 100644 data/hfopenllm_v2/BSC-LT/salamandra-7b/6d523da4-ec4a-405b-a25d-afc7b1b5aefd.json delete mode 100644 data/hfopenllm_v2/Ba2han/Llama-Phi-3_DoRA/cfecfce3-090d-4c2e-826c-03c0c5337e98.json delete mode 100644 data/hfopenllm_v2/Baptiste-HUVELLE-10/LeTriomphant2.2_ECE_iLAB/5aa124dc-4abd-4c5f-b40a-a8d81af922eb.json delete mode 100644 data/hfopenllm_v2/BenevolenceMessiah/Qwen2.5-72B-2x-Instruct-TIES-v1.0/ec91b122-c8f5-4dfb-94fd-336ef78c3e14.json delete mode 100644 data/hfopenllm_v2/BenevolenceMessiah/Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0/114f246a-6049-40bf-ad86-9a822d13cf74.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/Bloslain-8B-v0.2/82d28a3a-44f2-463f-a1b8-7e9079ec47b7.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-OpenO1-SFT-v0.1/ed3c1349-a154-4866-890f-2b115ffaf127.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-pythonic-passthrough-merge/47942c55-5ddb-4fda-9c5b-34676ae2046a.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/Neos-Gemma-2-9b/d860210b-4c8a-4d15-ad3a-4e39905f91ed.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-8B/d137f429-2b65-4ee9-9d66-3f619b270fad.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-base/1da10dfe-b0a3-4cb8-aaa3-e16d48f3aab4.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/Neos-Phi-3-14B-v0.1/6156a0d2-4c32-40b2-9624-ef0c7a6a95bb.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/llama-3-luminous-merged/676342d2-f37a-4b6a-967d-3ac750243470.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/llama-3.1-8B-Galore-openassistant-guanaco/950b7108-0192-4875-b4e9-c3e43ab71e08.json delete mode 100644 data/hfopenllm_v2/Bllossom/llama-3.2-Korean-Bllossom-AICA-5B/85672df5-2f35-43be-8648-9937c66872dc.json delete mode 100644 data/hfopenllm_v2/BoltMonkey/DreadMix/051c5642-3b23-4879-9d10-639d1b3127d7.json delete mode 100644 data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/2acf0d12-7e0c-46dc-a079-ebc48a8818d3.json delete mode 100644 data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/8ce42090-006e-4e08-8d3f-5b1eb0b8da0b.json delete mode 100644 data/hfopenllm_v2/BoltMonkey/SuperNeuralDreadDevil-8b/703df6c3-dae4-437f-9379-f8c264797adc.json delete mode 100644 data/hfopenllm_v2/BrainWave-ML/llama3.2-3B-maths-orpo/1e349ad3-d29b-4a4b-97e7-b82055e41b07.json delete mode 100644 data/hfopenllm_v2/BramVanroy/GEITje-7B-ultra/8f677a76-932c-4c35-9708-4b723226aa19.json delete mode 100644 data/hfopenllm_v2/BramVanroy/fietje-2-chat/ebfe625f-ff1f-45f9-826c-9351ea4134e1.json delete mode 100644 data/hfopenllm_v2/BramVanroy/fietje-2-instruct/66e6a757-ac22-47f3-82ce-81af45e1d3cf.json delete mode 100644 data/hfopenllm_v2/BramVanroy/fietje-2/1cd840c7-d432-495c-a3df-af1fa6264259.json delete mode 100644 data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-base/066f520f-9a64-4564-abfc-6435732c3585.json delete mode 100644 data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-chat/aced5181-040a-48c0-bc5f-78d0de3afae8.json delete mode 100644 data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-base/a4889a38-84d2-4ae1-b8a9-297b4400602d.json delete mode 100644 
data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-chat/d540505a-c67b-4b72-a53a-c03aa6f8d3e7.json delete mode 100644 data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-base/9859afee-02ca-4c48-acc8-acfd20c37e4e.json delete mode 100644 data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-chat/e222d12b-c796-4890-a584-cd689bae7ea6.json delete mode 100644 data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct-2412/c16850f8-0b80-4455-8f38-8ec453cd1d41.json delete mode 100644 data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct/0d400b0f-cc82-4c86-b600-93a31b133f9d.json delete mode 100644 data/hfopenllm_v2/Casual-Autopsy/L3-Umbral-Mind-RP-v2.0-8B/90f6f8f1-02fc-425a-8499-e9b43ae8ac59.json delete mode 100644 data/hfopenllm_v2/CausalLM/14B/6704d6bc-6d38-4c59-87a4-81d3eacde3b1.json delete mode 100644 data/hfopenllm_v2/CausalLM/34b-beta/e8ad6ce4-7efc-499e-a2c9-9e0df898fbb9.json delete mode 100644 data/hfopenllm_v2/CausalLM/preview-1-hf/5e9c1273-536d-4280-8fff-9931f46dc968.json delete mode 100644 data/hfopenllm_v2/Changgil/K2S3-14b-v0.2/460ca160-ac34-4091-ba2d-986b53532b55.json delete mode 100644 data/hfopenllm_v2/Changgil/K2S3-v0.1/ef9d2fab-07a2-44e2-aae2-ede5a2ff31d9.json delete mode 100644 data/hfopenllm_v2/ClaudioItaly/Albacus/a29a69d3-d64e-4463-aa52-0a9d6d012c98.json delete mode 100644 data/hfopenllm_v2/ClaudioItaly/Book-Gut12B/4539c16e-1ac6-47f4-88eb-a09842497330.json delete mode 100644 data/hfopenllm_v2/ClaudioItaly/Evolutionstory-7B-v2.2/2ff33c55-1236-4c57-8809-2d3076e43cc7.json delete mode 100644 data/hfopenllm_v2/ClaudioItaly/intelligence-cod-rag-7b-v3/281ba822-49a2-4746-bc04-8de046439508.json delete mode 100644 data/hfopenllm_v2/CohereForAI/aya-23-35B/0606d916-95ea-4318-af0c-3942329071c6.json delete mode 100644 data/hfopenllm_v2/CohereForAI/aya-23-8B/005159f0-da68-480d-972c-c160d145a682.json delete mode 100644 data/hfopenllm_v2/CohereForAI/aya-expanse-32b/2f6abb5d-52b3-44b0-b960-115793485fb1.json delete mode 100644 data/hfopenllm_v2/CohereForAI/aya-expanse-8b/6ffacad9-1a4d-472e-bbbf-0d64d068dd0d.json delete mode 100644 data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus-08-2024/26eadaf8-bfb8-4aad-a8a4-90699b6f0fcd.json delete mode 100644 data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus/d4536913-5708-45e4-a024-45ae37fdae13.json delete mode 100644 data/hfopenllm_v2/CohereForAI/c4ai-command-r-v01/848860aa-7de3-4fae-afca-ac11224b96c5.json delete mode 100644 data/hfopenllm_v2/CohereForAI/c4ai-command-r7b-12-2024/0241a8e3-d6e5-4ba5-afb9-862bde2ba851.json delete mode 100644 data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/20b69120-d476-4e34-b3c6-8cef11d6ee78.json delete mode 100644 data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/696bbbfc-49dd-444e-a90b-76821845a726.json delete mode 100644 data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-odpo-v1.0/e6d974d3-467e-4fe7-bd84-79fc7c72cde2.json delete mode 100644 data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-sft-v1.0/b26ba2b7-1365-4b1c-a1be-35d588e02d36.json delete mode 100644 data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-dpo-v1.0/64bd755d-ba4b-4559-ad8e-f56c697b1ae6.json delete mode 100644 data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-odpo-v1.0/c4e572cb-1d12-4baf-a4d8-a55422692207.json delete mode 100644 data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-sft-v1.0/c6123e10-b1f9-49dc-888b-083881e6ef09.json delete mode 100644 data/hfopenllm_v2/CombinHorizon/Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES/e1647f10-fec5-463d-b8e5-6b2b880bd687.json delete mode 100644 
data/hfopenllm_v2/CombinHorizon/Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES/6d5fa235-8d69-456e-9f23-0f702760baf4.json delete mode 100644 data/hfopenllm_v2/CombinHorizon/YiSM-blossom5.1-34B-SLERP/e8709a6a-a2b8-4b09-9342-d1aeae89de1f.json delete mode 100644 data/hfopenllm_v2/CombinHorizon/huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES/603e95c9-7e7f-4892-93f7-92f92b256865.json delete mode 100644 data/hfopenllm_v2/CombinHorizon/huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES/3e2fd38a-186e-49aa-915c-7eb3cde50562.json delete mode 100644 data/hfopenllm_v2/CombinHorizon/zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES/16d55e66-9015-4d72-81e4-3f14c42b0368.json delete mode 100644 data/hfopenllm_v2/ContactDoctor/Bio-Medical-3B-CoT-012025/696644b9-bd40-4047-bb85-0cb19510a96c.json delete mode 100644 data/hfopenllm_v2/ContactDoctor/Bio-Medical-Llama-3-8B/cbae8c39-0aec-4859-98bc-3b2d065833ad.json delete mode 100644 data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge2/15fb3cc7-1ba5-4ba5-ba02-8e8a9d2029d0.json delete mode 100644 data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge3/357f6051-b880-48bb-8e68-e4b0a7a0cbcc.json delete mode 100644 data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme/a50a542b-668e-47b1-a37e-805a58eea3d1.json delete mode 100644 data/hfopenllm_v2/Corianas/Neural-Mistral-7B/00f7bd51-0b31-446d-be8c-1e0dc0d82e54.json delete mode 100644 data/hfopenllm_v2/Corianas/Quokka_2.7b/26782941-b918-44c5-a7f6-5f770e47c3d6.json delete mode 100644 data/hfopenllm_v2/Corianas/llama-3-reactor/5547ddaf-8fbb-4259-8b88-e946fc3d2404.json delete mode 100644 data/hfopenllm_v2/CortexLM/btlm-7b-base-v0.2/bee5ea59-b97a-4783-b763-b6bd432d4558.json delete mode 100644 data/hfopenllm_v2/Cran-May/SCE-2-24B/8150333f-8e79-4230-af8b-7ddb1d5eeb21.json delete mode 100644 data/hfopenllm_v2/Cran-May/SCE-3-24B/be8510a9-ecd4-4ac7-9930-3200cacb7b50.json delete mode 100644 data/hfopenllm_v2/Cran-May/T.E-8.1/887e4574-f876-4e75-afb8-e543bcb30020.json delete mode 100644 data/hfopenllm_v2/Cran-May/merge_model_20250308_2/fd21d8bd-28cf-4b91-8075-c38a61f5f32a.json delete mode 100644 data/hfopenllm_v2/Cran-May/merge_model_20250308_3/c0f05e38-6592-478a-9c46-26567f24ff85.json delete mode 100644 data/hfopenllm_v2/Cran-May/merge_model_20250308_4/06cc2913-8e05-44bf-a128-9a7c4aeff536.json delete mode 100644 data/hfopenllm_v2/Cran-May/tempmotacilla-cinerea-0308/86368d5b-0509-4b52-b988-58bcf7e1043e.json delete mode 100644 data/hfopenllm_v2/CreitinGameplays/Llama-3.1-8B-R1-v0.1/77b89fe6-464b-4017-a77f-8750e2668a82.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Broca/d2e47d86-23dd-4c95-a7fb-99518615d09f.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-BrocaV9/0a09891e-ac97-4c3a-8364-7106a851f1a8.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav3/eb41fe62-ac46-4630-bb2d-6b907f271737.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav6/d540a6c8-e9ec-4413-b9d2-dee68533c377.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav7/5b1f413a-05c4-43be-bdbc-9de5728e8d0a.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emerged/6701738c-27e4-4bbd-b614-fbc297c3164f.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emergedv3/7f4563b4-0b25-49e7-ac1c-afaa28b0eda2.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-FinalMerge/32b6e4af-69ba-49b7-9367-dfafe3e390e8.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyper/e16deaf7-da55-40ba-ac18-860fa3f14d34.json delete mode 100644 
data/hfopenllm_v2/CultriX/Qwen2.5-14B-HyperMarck-dl/8a7a5886-0618-4615-9cdf-46f5d19a29fe.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv3/66d18e5b-9ebc-4ab6-94fb-6d5c23c58672.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv4/a36aaaf6-2478-4b98-ad0c-2b06ddb8c308.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv5/4a6237a7-019c-4310-971e-84b08d1b5067.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-MegaMerge-pt2/996e781e-5939-41ac-b347-95c99037c34a.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-MergeStock/e880fa0e-ae49-4398-91bd-eadf8695425f.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-ReasoningMerge/da04ff51-fbeb-41a8-ae5e-8ddf5925b792.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Ultimav2/6d709396-1ae1-4e5c-a03c-13c1e9425202.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Unity/5b616df9-e15a-4f84-98b4-c2cb532c1b95.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SFT/0f6552d9-3cbe-447e-909b-068e5ceed4c9.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SLERP/2861aae0-d2ec-48f5-bd20-9e7bcaf8dabd.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke/51a64f37-256c-4fe7-b28c-6117520f04ec.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernickev3/03ce9c1d-38e8-4a6c-b293-57428a9d7c0e.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-partialmergept1/3b0f5dea-db9b-4657-9807-6b3e56d38823.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwenfinity-2.5-14B/2d19e9ff-e331-4171-ae90-47e44f3f8885.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwestion-14B/6bfb8b24-1abd-405b-b01d-7d7111705dbb.json delete mode 100644 data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMerge/c83e6b6c-c8be-4d97-9c65-2d883f88f37f.json delete mode 100644 data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMergev1/72569796-1b11-48cc-ada7-e8c09522dd54.json delete mode 100644 data/hfopenllm_v2/CultriX/SeQwence-14B-v5/58403e30-bd2b-4f4c-ad41-daa890c77d40.json delete mode 100644 data/hfopenllm_v2/CultriX/SeQwence-14B/eb8e1f1d-c6b3-407c-b172-d240553d2f89.json delete mode 100644 data/hfopenllm_v2/CultriX/SeQwence-14Bv1/356d75a0-6520-46c1-afa9-7dbb2596a5c1.json delete mode 100644 data/hfopenllm_v2/CultriX/SeQwence-14Bv2/78681e0c-5fe2-4920-af7b-99345cea3efe.json delete mode 100644 data/hfopenllm_v2/CultriX/SeQwence-14Bv3/ba0ee5b4-070a-461d-a3d2-cd4036387cc9.json delete mode 100644 data/hfopenllm_v2/DRXD1000/Atlas-7B/17d0d377-bca4-411c-be11-6c5cfce07798.json delete mode 100644 data/hfopenllm_v2/DRXD1000/Phoenix-7B/d01a56a1-1eb9-4ccf-8c09-348b6ba5480b.json delete mode 100644 data/hfopenllm_v2/DUAL-GPO/zephyr-7b-ipo-0k-15k-i1/389821ff-d8e2-4d1d-8fb2-57a689867ac5.json delete mode 100644 data/hfopenllm_v2/DZgas/GIGABATEMAN-7B/7913f782-29b0-48bd-bc62-37da9a5ac7d9.json delete mode 100644 data/hfopenllm_v2/Daemontatox/AetherDrake-SFT/b0930974-999e-4372-9d21-b9790e0bad4c.json delete mode 100644 data/hfopenllm_v2/Daemontatox/AetherSett/8265f577-f504-4a56-9cf0-42c34766559a.json delete mode 100644 data/hfopenllm_v2/Daemontatox/AetherTOT/82044cd2-1a46-406e-bc68-397ce41b29ea.json delete mode 100644 data/hfopenllm_v2/Daemontatox/AetherTOT/de09e323-8cf1-4aa9-9537-e8ad30a8c297.json delete mode 100644 data/hfopenllm_v2/Daemontatox/AetherUncensored/bfe543b4-ec38-488e-ae04-125cd358b61f.json delete mode 100644 data/hfopenllm_v2/Daemontatox/Cogito-MIS/be36d8ae-b81c-4b4e-aa2f-5999c7582237.json delete mode 100644 
data/hfopenllm_v2/Daemontatox/CogitoDistil/342b435f-89e9-48ad-ab0f-2c1f52f4571a.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/CogitoZ/b0c8737d-d838-4da1-909b-b218e22119dc.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/CogitoZ14/4cd40f28-842f-44d5-9eb2-86238077fc55.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/DocumentCogito/0758051c-2d75-402e-af0e-769096cbb17c.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/DocumentCogito/c93f610b-fb97-4ad1-b8af-fc41c6d8da33.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Llama3.3-70B-CogniLink/b8467118-d895-41fa-81c7-89892e1844d5.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Llama_cot/30d867bb-63c6-48d1-8d43-6c24f4cf44ba.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/MawaredT1/89b92cda-c5b6-45ed-a534-361c9d34794a.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Mini_QwQ/48cdf76a-886d-41ec-8580-00ed4232b601.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/NemoR/116272d4-d25d-49cb-80cb-ff26a0fb3cf4.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/PathFinderAI2.0/bb103828-70fe-4767-9302-6750d839129e.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/PathFinderAi3.0/7b58ab54-239b-4e49-93f1-c3940df61474.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/PathfinderAI/559067a2-816c-4091-893e-b1c7860171ec.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/PathfinderAI/ec502619-880b-4b7c-acfe-c43cf6514e3f.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Phi-4-COT/6941a5dd-2a70-4846-a5f6-b16ef2d56a03.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/PixelParse_AI/636e2f93-3242-491c-9df5-003aa1dacecf.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/RA2.0/1f4efa23-816d-49be-8659-feb003f4b3ef.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/RA_Reasoner/d05be1e4-bcac-4b4a-bbde-8b17a5a71243.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/RA_Reasoner2.0/9ab53055-86f5-4a88-976f-015dd9c9e832.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/ReasonTest/ba34083a-9b13-46d9-8f36-aa3ddd586711.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Research_PathfinderAI/6a39d734-ad73-4c4a-9583-3563e336d4b3.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/SphinX/2af71e88-4931-4359-b92a-c64fa33df802.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Sphinx2.0/bf9336a7-a7c4-420a-9dd0-68d8e0c815c4.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/TinySphinx/2de872b2-10c7-44dd-91c3-f20205207da6.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/TinySphinx2.0/5cabed09-d8ea-46c2-bb78-012dac954d6b.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Zirel-7B-Math/8236db6a-ff8a-4237-af5a-03bb258f8e59.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Zirel_1.5/1a7b078e-bc1f-400f-a0cd-f7b535548f23.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/mini-Cogito-R1/fdaf561c-567c-416d-a74a-ac3c07c5be5b.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/mini_Pathfinder/58900b3b-303b-49c8-b807-7b8d06601568.json
 delete mode 100644 data/hfopenllm_v2/Dampfinchen/Llama-3.1-8B-Ultra-Instruct/7ac5a45a-7b41-4f63-8556-8737638a00ea.json
 delete mode 100644 data/hfopenllm_v2/Danielbrdz/Barcenas-10b/3cb55475-30c8-43c8-8d7d-394450fdc117.json
 delete mode 100644 data/hfopenllm_v2/Danielbrdz/Barcenas-14b-Phi-3-medium-ORPO/f5e140ff-0c0e-4769-8116-63cf50255773.json
 delete mode 100644 data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4-v2/df85ec6e-1325-40ce-8087-d960a1d767dd.json
 delete mode 100644 data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4/a7bd3fff-f01e-46ca-af85-5b4ac6ae7320.json
 delete mode 100644 data/hfopenllm_v2/Danielbrdz/Barcenas-3b-GRPO/11842dd9-0572-41ef-aaa0-8d19f3420efc.json
 delete mode 100644 data/hfopenllm_v2/Danielbrdz/Barcenas-Llama3-8b-ORPO/01abccec-1cea-4060-89be-289987d0a2ce.json
 delete mode 100644 data/hfopenllm_v2/Danielbrdz/Barcenas-R1-Qwen-1.5b/dce8226c-57bd-4255-b813-8a70494f0a1a.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-2/7f80e69c-eec6-49ac-a088-6248ee25f736.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-3/e0267a2c-dfc5-456e-864d-b5b0ad1fa508.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-CoreCurriculum-12b-ChatML/e6ad37be-28f4-43b4-9df1-b7b47d31232e.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.0/5514368a-1f7d-4cd0-b7f7-d116b753f975.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.1/c0e29cf8-897f-4e07-abb4-71c801d34301.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.2.0/68310379-65b2-482d-892b-f76547bce2b0.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML/a034c4ec-d4cd-439b-8dbd-e67685ea7616.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/Mistral-7b-v0.3-Test-E0.7/e4b761d3-bb84-4433-b9fb-4c92ecae6279.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/mistral-7b-test-merged/38d78d30-be6d-476c-a3aa-d9a40f570a56.json
 delete mode 100644 data/hfopenllm_v2/Darkknight535/OpenCrystal-12B-L3/36e60f6c-60f7-4b17-88fe-82810e195fc7.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm/a6c647e8-ed24-4150-8563-dd9b20e21498.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B/b5a366ac-d736-4447-a2f1-98d0b84ba3bd.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B/5d098dc6-8124-4d26-86ec-d54e6e09c3a6.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B/1137cbc4-d80b-4e21-bfeb-feab41dc80b2.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B/097bbfbc-0ccd-4fd4-9e0c-9c192cba9e8b.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm/db8c6169-bfc1-48bb-be53-fa93c673f051.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B/41437fc9-6d48-4317-a8de-ab4e63b2cf46.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B/e075f4fe-95e0-48f4-94c4-f6ebd3f4edaa.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Gemma-The-Writer-9B/3349d66c-e12b-49c1-a406-e0e77b697458.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Gemma-The-Writer-DEADLINE-10B/7aa0ff6b-11a9-4554-a27f-e477a0ff77c7.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Gemma-The-Writer-J.GutenBerg-10B/ac749485-df6d-485e-8fa7-63bdfd744167.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Gemma-The-Writer-Mighty-Sword-9B/54363a4b-312b-4035-a1c3-b5321311cec4.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Gemma-The-Writer-N-Restless-Quill-10B-Uncensored/aa9e2b9e-cd25-4492-9801-eba7d40b4365.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3-DARKEST-PLANET-16.5B/c6b484b8-f6f3-4516-aff5-c2f6438c9047.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3-Dark-Planet-8B/c6c760c9-a345-4e25-b333-b403bf6db389.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3-Jamet-12.2B-MK.V-Blackroot-Instruct/65b2aa58-2c04-48f2-9ea3-c8fd97cb9dde.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3-Lumimaid-12.2B-v0.1-OAS-Instruct/92903344-0dde-4f5a-a7d2-749a1ffe9cd3.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3-SMB-Instruct-12.2B-F32/59ddd478-c1cd-4bd8-80c3-fdebe762414a.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3-Stheno-Maid-Blackroot-Grand-HORROR-16B/02f63fc6-9376-4fb5-b067-63493238cc27.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3-Stheno-v3.2-12.2B-Instruct/dd7597fd-27f5-4e77-a44f-b01d0db82719.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3.1-Dark-Planet-SpinFire-Uncensored-8B/20cd0d60-eb0d-41bd-b37f-910a03dd7f82.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B/c4e9d045-3769-4828-a2ca-7fa508873089.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B/0a0501ec-4ecd-47c1-914b-d473f795cef2.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B/beca755f-203f-4bc8-b5cf-f9a9e3f8bd8f.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32/79e1e1c6-cbe0-43a9-a593-8e2119baaf77.json
 delete mode 100644 data/hfopenllm_v2/Davidsv/SUONG-1/def80b44-3d9a-46ba-bf5f-ffc81e50af2e.json
 delete mode 100644 data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/5e1aa809-ef20-445e-a05b-eccd585d5991.json
 delete mode 100644 data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/7c2be651-ca56-4285-afc7-1bfe1c8ce11e.json
 delete mode 100644 data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter1/cfe4ea72-ddb9-49b5-9599-99f215e112e5.json
 delete mode 100644 data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter2/81d63d8e-88dd-4b16-b9b8-d07604878f8f.json
 delete mode 100644 data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/81f8208b-f7e7-4685-bb84-321d9e097470.json
 delete mode 100644 data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/a0c9a434-9b8c-47c5-b511-9daac7901686.json
 delete mode 100644 data/hfopenllm_v2/DavieLion/Lllma-3.2-1B/28b60eae-1b38-4404-8db1-3fb2997583f4.json
 delete mode 100644 data/hfopenllm_v2/DebateLabKIT/Llama-3.1-Argunaut-1-8B-SFT/746862a2-a90c-4612-91d0-f989b9eed1a5.json
 delete mode 100644 data/hfopenllm_v2/Deci/DeciLM-7B-instruct/715ee057-9c9a-4e04-991c-7040b1eef65b.json
 delete mode 100644 data/hfopenllm_v2/Deci/DeciLM-7B/4dc1d103-3458-4b8c-9e63-b98effd69667.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.1-8B-Inst/070ff2a5-9a5d-48cf-8517-1ad9b6642d59.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst/8406a5b8-a87d-489b-b75b-00e9f675f09f.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v0/11e8f9b6-32ab-4b83-a601-e5644c0b2c39.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1.1/6b542f5a-ea62-45ce-8e98-436a4d058877.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1/9b280640-bfee-4730-acc3-386a54b2434c.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/causal_gpt2/eff5171b-6119-4013-8aa8-8a4f0215b045.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/d2nwg_Llama-3.1-8B-Instruct-v0.0/471c5fed-f155-4521-9d9c-b5370ca91bec.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2/690be099-3ace-484f-b01f-2fe6b324d12a.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2_v1/71fbd15f-5eec-40d9-84e8-07323f3ffac6.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Inst/eb93dd3e-3d13-4234-bb66-f6177648aa2b.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.0/f7ec1ed7-cc30-4879-8ab1-4909011553d5.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.1/3e100704-dbd3-4d05-b325-5bb4bc90e51c.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B/12f003ef-1098-4d3f-aed7-7343034157bc.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B_v2/9de2e564-3a30-4f1c-80da-6432a245a64f.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Lexora-Medium-7B/dd5aaa3f-b24b-4a5b-852b-b80f4a6bf366.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Llama-3-8b-Ita/8d8b9fd2-43f6-4edc-8340-44d20824a7e7.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/7fe45c20-a2c0-4acf-9425-651a1ec3b0d0.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/baf93ef6-56f3-4809-93f6-32dcf4730388.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Llama-3.1-Distilled/f6df14bd-207c-4fea-b789-c9f9aef749b3.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita/97766a7f-cf5b-46ae-b51e-5c5702ae000b.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v2/d5cd2a1b-3def-4b33-a8fe-4b02e090db27.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v3/275d4bf0-566c-4b50-86b9-38c7f45df143.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v5/aa504db9-81f3-424f-b7d9-683ebe31f5d8.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v6/2cc209b7-ef10-435d-a840-b904ab741491.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Qwen2.5-7B-Instruct-MathCoder/9b9390ac-fd65-4a58-9834-5352aa340cdc.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/mergekit-ties-okvgjfz/4efe5cd4-6b8a-4951-a63a-4c7dc390bbec.json
 delete mode 100644 data/hfopenllm_v2/Delta-Vector/Baldur-8B/4bc5a0db-1c88-4c61-9343-1d340305ecc5.json
 delete mode 100644 data/hfopenllm_v2/Delta-Vector/Control-8B-V1.1/74527f51-dcec-4b82-8ba8-075c933404f5.json
 delete mode 100644 data/hfopenllm_v2/Delta-Vector/Control-8B/ac31bc90-3854-4d38-925d-ef8dc7e75d24.json
 delete mode 100644 data/hfopenllm_v2/Delta-Vector/Darkens-8B/88583cff-1adc-4b1b-8e68-07f0074d0ae2.json
 delete mode 100644 data/hfopenllm_v2/Delta-Vector/Henbane-7b-attempt2/fadbac9e-7224-41d1-abfa-7039cbcba9f6.json
 delete mode 100644 data/hfopenllm_v2/Delta-Vector/Odin-9B/1fb90540-0fa0-44ca-ad67-1e3503f6b729.json
 delete mode 100644 data/hfopenllm_v2/Delta-Vector/Tor-8B/047784e2-c1ee-40d9-a60d-e43504825801.json
 delete mode 100644 data/hfopenllm_v2/DevQuasar/DevQuasar-R1-Uncensored-Llama-8B/ee60453d-2d51-46f7-8a18-c651d590f0e7.json
 delete mode 100644 data/hfopenllm_v2/Dongwei/DeepSeek-R1-Distill-Qwen-7B-GRPO/b0ac4b11-f7b4-4753-baae-310a92f08259.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore-V1.5-test/324db8b3-38c7-4a2c-82e8-7bebfa38e760.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore/54dd9033-61b9-4f26-9cde-e04c7136524b.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/L3-8B-WolfCore/d0973d6c-373c-41cd-9e62-52470c044dac.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame-test/da15da67-b316-4c2e-86a5-c1f88eece9cb.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame2-test/b0c34174-bfd0-4556-a3bf-92ec0ddf5ec4.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame3-test/bce7b15d-1670-46db-bdff-24fb38bc3fd9.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Kakigori/15e5e02f-27b9-4063-b601-42c2b17180f9.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-2/51b0c546-0dde-4668-a8b8-3b9753a31aa0.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-3/45842b1c-cf68-44a7-928f-2da454cdd13f.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-4/c15cdefd-dbe3-432e-aab0-3c43540cd320.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/1f489afa-a01d-40f3-836a-9e386c502d1d.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/94bcc87e-eb06-4321-9b72-2f99168cf92a.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-GreenSnake/c0bc9811-4d7c-412f-a12b-3e6eab2e5a6f.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Nocturne/b5a8b278-69e9-41ba-89ee-8fd6b2d90a1c.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v2-Experiment/a3ad7f0f-64bd-42a1-bc7d-d7d4cbbd80fd.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v3-Experiment/f07c3a4a-2a8e-45c4-a726-be95726df2db.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v4-Experiment/f36d56b8-cd77-4d69-a51d-39025bcfcdfd.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi/65acabdc-ea5f-426c-820b-2b79f2b20b44.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-1/96b00cfa-1383-4b36-a043-17eb39678ffc.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-2/3b8a796e-6bde-4506-8335-bd3cc72482e1.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-3/a93e99e2-ca13-4cdc-9904-7ae5cc82c623.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-4/65d9e237-2757-459e-94e7-e382213e4eeb.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake/c3f44524-4c75-4cd0-9f5d-79c8b08f6f77.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Unleashed-Twilight/2e7d3674-d0b0-4b87-8bd8-8202114b7665.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-WolFrame/30d21295-beb1-4179-8c6f-7bac79b29474.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-test/e2fc95de-b9d9-4043-b55c-aa2819d4f52f.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1b-test/7fbd7f97-baf9-4acd-ba0c-90ffbf0c47a5.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1c-test/336effcd-d8fc-4477-846f-70fc40bdc111.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1d-test/28f87820-d587-498e-b713-7c0af0cdc324.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B/f1b671ab-ebb3-43ec-86fa-832982d04cc1.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Again-8B-Model_Stock/327cde83-d107-4455-bc03-7e03026c52e6.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Alita99-8B-LINEAR/7497b8fb-9a7d-46dc-868e-1a2bbcdc7860.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/AnotherTest/92c8afbe-7735-40c8-af0e-29da687c2070.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire-8B-model_stock/bca052ac-6556-49d8-94e3-f4bda560a5d3.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_1.3-8B_model-stock/5f74fe6e-8575-4cea-959b-e6ba03c7e273.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_V2-8B-Model_Stock/b0f696f5-ed70-4293-999d-a9121192c137.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_V2.1-8B-Model_Stock/18751a6f-062c-4915-bbe0-ae222cf9ae0b.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT-8B-Model_Stock/398ebe04-638f-4a11-b99d-6778ff3ff97b.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT_ROW-8B-Model_Stock/b4f197f2-3456-4221-b222-10dfbbb50f56.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_V3-8B-Model_Stock/0a2fa86a-f9b3-4a49-b215-4cd3ee9b4c22.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_V4-8B-Model_Stock/1561ec50-1cb9-47ce-9db1-09efe9c3fc61.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_V4_ALT-8B-Model_Stock/496525ff-394a-4b7b-9d93-f5b38d2a1ee3.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Asymmetric_Linearity-8B-Model_Stock/37071760-d24c-43cc-9965-d8c7873c0ee8.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LINEAR/91a71a49-5dd4-43b1-9e1c-fd9492236712.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED/d1d48abb-6dcf-4905-958f-c3a3e75feac6.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED_ALT/68282f29-f56f-420b-bd1e-9cc54783c1a5.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Autumn_Dawn-8B-LINEAR/cd1c84dc-6c6e-4789-add7-0e3ca783b0ea.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/BaeZel-8B-LINEAR/22a9d3b8-ac45-4433-8926-5d28681af922.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/BaeZel-8B-Model_Stock/57c4b9eb-dffd-4623-a2d5-b2374d3c9109.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/BaeZel_V2-8B-Model_Stock/24adbd8c-df3a-4b58-94e6-61a3dfa6828e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/BaeZel_V2_ALT-8B-Model_Stock/6ed62f64-c2be-4bca-b17d-bd0184a3d498.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/BaeZel_V3-8B-Model_Stock/db9e4d03-03a8-4a10-8739-16bbcfbb06d4.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Blunt_Edge-8B-SLERP/7b0fc4fe-51c8-4f01-b07b-5bca05b40859.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/BulkUp/6f286418-d8e3-4c11-8941-cfe5a18b1037.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Cadence-8B-LINEAR/b0a83b1f-3af2-45e8-9d88-d7302a529112.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Caelid-8B-Model_Stock/0462fce1-51b4-48d8-8278-a90048ffd637.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Casuar-9B-Model_Stock/e02f597c-c368-4223-ac90-c99d82c90634.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Condensed_Milk-8B-Model_Stock/32e63ffc-c64e-4562-ba99-14873f5bac2e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/CoolerCoder-8B-LINEAR/6af4faad-05c2-488b-9685-e11ae4e1cbf0.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Damasteel-8B-LINEAR/8aa7701b-7019-44a0-851f-cfc9108fdfbd.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Dearly_Beloved-8B-TIES/a2f95fad-5ab5-47d0-b9aa-33358c673caf.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Decayed-8B-LINEAR/aef73a77-9df7-4d4f-89ef-50905d326198.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Derivative-8B-Model_Stock/e9ffdfb6-6f91-4bac-89d2-40b1eb43f3ee.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Derivative_V2-8B-Model_Stock/8ff39438-907c-465f-ac7a-5a25cfd8d824.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Derivative_V2_ALT-8B-Model_Stock/83d831c5-a74f-4699-9961-664a7a51b7b8.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Derivative_V3-8B-Model_Stock/83fb88ec-f640-4c1e-b71c-53a123fc4c2e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Elusive_Dragon_Heart-8B-LINEAR/3811cc34-45cb-4932-b862-39bf042331e0.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Emu_Eggs-9B-Model_Stock/5b2a16a1-7a2a-40b7-add6-b99378b6af00.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Eunoia_Vespera-8B-LINEAR/1dc2a5bb-40b6-401e-8f1c-6110cb4c0f0d.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Fu_sion_HA-8B-SLERP/742e0a1c-7496-4076-bdbf-ada0a8e528c2.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/HOT_STINKING_GARBAGE/f0664035-3256-444c-b848-ef603e0d46b5.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/H_the_eighth-8B-LINEAR/9159aaa6-8663-491f-901a-74da4c343d20.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Happy_New_Year-8B-Model_Stock/5179b145-9fdb-4ab5-8cca-87966ecf6519.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Heart_Stolen-8B-Model_Stock/da872193-1d25-4e8e-bc22-9138a9d121ba.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Heart_Stolen-ALT-8B-Model_Stock/967fdd26-1f8a-40d6-8f7d-ca731c7ef2e3.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Here_We_Go_Again-8B-SLERP/dd615b4c-189e-4361-bcf4-879fd59b28a2.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Howdy-8B-LINEAR/0aeee3e8-00ce-4f95-bbd9-307d93a194a4.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Incidental-8B-Model_Stock/8c583b51-4349-48af-98d9-8eaaf43d60b6.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Irina-8B-model_stock/34aab556-5e97-4ea2-9ada-d17dc3624be2.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Kindling-8B-Model_Stock/fbd9d5e3-15f7-45ce-92fb-368b3bfcc526.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/L3.1-BaeZel-8B-Della/b177e329-ce6b-4bc6-aeac-1c01306e6b1f.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Laughing_Stock-8B-Model_Stock/7f371c11-e8f0-4233-b359-aac39c0a1110.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Lava_Lamp-8B-SLERP/9f758d4e-d121-4688-8ece-8dc67a499811.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/LemonP-8B-Model_Stock/903b8c71-d54d-4ce4-9845-71eb8ca8733a.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Lydia_of_Whiterun-8B-LINEAR/9bdc17bf-7b81-49c8-81f5-c6dfa31b449b.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Matryoshka-8B-LINEAR/28109e00-87c1-4809-a4fc-dddebba52621.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Mercury_In_Retrograde-8b-Model-Stock/6a21381b-426d-4a5d-ad6d-2aeb57ed14c5.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Minthy-8B-Model_Stock/03a8091c-473e-4fbe-af70-35f791a23a0f.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Minthy_ALT-8B-Model_Stock/ed75e9ed-841b-4783-a201-bc72651afd0a.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Minthy_V2-8B-Model_Stock/38cd418c-9770-49d2-8b30-ac47e445cee3.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Minus_Penus-8B-Model_Stock/d49b6a48-ae81-467d-87c5-b17f9ca306f8.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Morphing-8B-Model_Stock/39b7e250-9f71-4833-941e-85692a48b6e6.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Not_Even_My_Final_Form-8B-Model_Stock/c0d102a2-ff8c-45ac-a825-31472b98b871.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Nother_One-8B-Model_Stock/7c5674a8-6a1c-483e-be9c-b0a6d00d3ac4.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Noxis-8B-LINEAR/d34b899e-b067-4c9c-9fa2-439f8b2d589d.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Nullsworn-12B-LINEAR/8c7b2332-510b-42d3-bcbb-e177c35d27d5.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Nwah-8B-Model_Stock/685f107f-e431-4dba-a117-8d6f1dd2c296.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/ONeil-model_stock-8B/e1570804-85b6-4518-a099-5f21ab27d12c.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Oh_Boy-8B-LINEAR/a779ebec-76ab-4a1e-aa4f-d1a6adfe2d5c.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/OrangeJ-8B-Model_Stock/1ed7f6ed-d04d-4cfc-a36a-1ef0f72d4814.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR-lorablated/c901a9ee-069a-4e3e-ac52-3017d67d8800.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR/08317b59-ff74-43c8-bea5-2a266c38816e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/RPMash-8B-Model_Stock/4106d4d3-344a-4c1f-b9ce-a3140d435013.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/RPMash_V3-8B-Model_Stock/2b308fad-8494-4056-8b84-82733cd2710a.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Rusted_Gold-8B-LINEAR/93c867d0-4f10-440c-838c-91d1633fe584.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-LINEAR/1a4a69c5-4acc-4ad9-adb2-bd9cf0fa2875.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-Model_Stock/151226ba-9744-45bc-b923-30df57f7aa3e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Sellen-8B-model_stock/98363657-0793-4eb3-94de-28961afc92ea.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Something-8B-Model_Stock/a32b4ded-6bff-441e-afbd-736e6d8cce5c.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Spring_Dusk-8B-SCE/326bcf4a-02e9-4218-8bf2-55a94a79435e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Summer_Dawn-8B-SCE/145facc2-ab11-4c68-b841-762e0ad9bd5a.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Summer_Dusk-8B-TIES/d3e6aae6-9284-4309-8d8c-02c9e797a58b.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-SCE/6ee8537c-90e8-4455-83ca-c8c375a5ead7.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-TIES/6efbfb38-57e5-46c7-b765-f7d0356afb97.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Sun-8B-Model_Stock/f4d418d9-1089-452d-9c7f-4cc4712e6ac7.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Sweetened_Condensed_Milk-8B-Model_Stock/1c9b325b-92b3-499a-a3ea-026269c63c88.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/TEST02-Ignore/c546ccde-cef3-4de2-a49f-24517d76dde5.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/TEST03-ignore/e85d3ccf-f48d-4e5c-b893-771a107773d4.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/TEST06-ignore/b8d22ade-874e-4ff3-9fcd-dbe14220d48b.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/TEST07-ignore/97e8e7e2-74a4-42a5-a0b1-250e47d3c3e6.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/TEST08-ignore/b2d56bb6-a726-4e47-8bc6-c016a51aac5c.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Trinas_Nectar-8B-model_stock/3366f6d8-41bc-4c2c-a72c-bc0fd7dc8dd2.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/UNTESTED-VENN_1.2-8B-Model_Stock/7ba52efb-3890-4691-8740-9f051f1f645e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/VENN_1.2-8B-Model_Stock/7b192b49-057e-418a-b47d-44b0ec82a6b6.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/WIP-Acacia-8B-Model_Stock/f2120d53-bef6-44d6-84a6-a6f8e3537188.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/WIP_Damascus-8B-TIES/f5408aa9-85c8-46e5-b225-0480b2e18e97.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Wannabe-8B-Model_Stock/c1918f55-286c-4b29-ac53-2ee8f9d36d9e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/What_A_Thrill-8B-Model_Stock/52659d37-67f8-45b8-88e4-11917dc90488.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Winter-8B-SCE/556ae77c-effe-44ab-ac4a-1ad7cbd7c363.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Winter_Dawn-8B-TIES/048fc971-3baf-4740-a132-2f9476d01b7a.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Winter_Dusk-8B-TIES/abd28d25-01e0-474d-be35-08d816d281f5.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Winter_Night-8B-Model_Stock/17f49724-6553-4baa-b354-45ffd0f2c844.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Yafune-8B-Model_Stock/3e60d982-d7d5-432b-962e-b7734cc90534.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Yearn_V3-8B-Model_Stock/79a0fdf3-b432-4598-be62-f9eb57fa5a43.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/ZEUS-8B-V17-Abliterated_ALT/662566e0-2af3-40d6-90de-9b361bcae355.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Zelus-8B-Model_Stock/d81c0035-a0b1-426c-9080-8ccbf745642b.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Zelus_V2-8B-Model_Stock/100bc243-158c-4e5c-918b-1439bf26fee8.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/felix_dies-mistral-7B-model_stock/45e32080-1464-40e0-a232-310fdda967eb.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/hakuchido-8B-MODEL_STOCK/e89b279f-d548-4aa8-b5e5-0bffdd98b840.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/ichor-8B-Model_Stock/777a53f9-891c-4f9e-99a8-bb1988f61f19.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/ichor_1.1-8B-Model_Stock/f15846b1-8eaa-411b-88f7-25064161af4e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/inexpertus-8B-Model_Stock/e803fc85-fb98-4db8-aab0-a63100dcd5fc.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/inexpertus_1.1-8B-LINEAR/50620749-5ecf-41eb-a131-611675560e07.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/inexpertus_1.2-8B-LINEAR/2d40a551-6440-4d71-87e4-639d486c1c5e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/mergekit-nuslerp-nqzkedi/22235942-2e3e-4ef4-b7a0-5800f507571a.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/remember_to_breathe-8b-Model-Stock/ac06867d-3a34-42f6-9e2e-226cf86748f6.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/test/394f1fc8-dc2c-4ff9-9ad0-7b3a8a8ddeb3.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/test_ALT/03e52d4f-78d7-453c-9685-844dd1636904.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/tests_pending-do_not_use_yet/3ce136d5-be81-4b8c-a7dc-4e1346935d35.json
 delete mode 100644 data/hfopenllm_v2/ECE-ILAB-PRYMMAL/ILAB-Merging-3B-V2/fb35accf-0c5d-4f72-8d73-ba366a41a76d.json
 delete mode 100644 data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-14B-v0.2/75e5ca5d-cce1-4463-b398-553399ce6833.json
 delete mode 100644 data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-72B-v0.2/c426bae7-b98d-4343-b419-ac8206196a95.json
 delete mode 100644 data/hfopenllm_v2/Edgerunners/meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16/b17de9f2-6f94-49f6-b908-fa983e8f8f9b.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/gpt-j-6b/58ba7ca1-8cca-4668-836b-824491d9cf01.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/gpt-neo-1.3B/23da100a-13b9-42a7-ba79-234be551d0e4.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/gpt-neo-125m/2d0c12b9-cff8-4366-a3ce-7772e4c098c9.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/gpt-neo-2.7B/4b87eea2-169c-411e-9d15-caf6b7826590.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/gpt-neox-20b/62a3cce2-4ff5-4dc9-beab-a06001fd82d9.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/pythia-1.4b/0e5961e1-af27-4eee-8b9b-c82ee4ab61b1.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/pythia-12b/b62352d4-e3b0-4b4d-8d68-e2d973d820c1.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/pythia-160m/7fadc486-767e-45ef-979d-74ecb858cb99.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/pythia-1b/d0628e6f-a6f3-42eb-b9fc-e880ae8c0688.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/pythia-2.8b/0999a066-1151-4445-b130-00d8fe4a516e.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/pythia-410m/1efc09d8-6a5c-4d48-b76e-2e04ef97b676.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/pythia-6.9b/1a59412f-fe78-4ecf-8951-8f2996dd374f.json
 delete mode 100644 data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-French-Llama-3-8B-v0.4/b5403311-2069-488d-af98-27da14496c15.json
 delete mode 100644 data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B-v0.3/6c10c176-b2b6-4216-91c0-1444944612f7.json
 delete mode 100644 data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B/80ebd92e-d9b6-46ce-b77e-973c3f3f6051.json
 delete mode 100644 data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3.1-8B-v0.9/0418e36f-17ea-46a2-bfeb-91cc0ff719bf.json
 delete mode 100644 data/hfopenllm_v2/EnnoAi/EnnoAi-7B-French-Instruct-202502/4f5ba3fc-694a-45b1-ae9d-2c7d33e41519.json
 delete mode 100644 data/hfopenllm_v2/EnnoAi/EnnoAi-Pro-Llama-3.1-8B-v1.0/8b0d1556-bbd5-49e3-b881-32224bc1aa9a.json
 delete mode 100644 data/hfopenllm_v2/Epiculous/Azure_Dusk-v0.2/524e634f-280c-4f3a-9f1f-bdda19fad740.json
 delete mode 100644 data/hfopenllm_v2/Epiculous/Crimson_Dawn-v0.2/cb82e92b-f207-4fbd-9bfe-43184769cdbd.json
 delete mode 100644 data/hfopenllm_v2/Epiculous/NovaSpark/0b674103-4e55-41f4-accb-b7be73671801.json
 delete mode 100644 data/hfopenllm_v2/Epiculous/Violet_Twilight-v0.2/fa0290e0-723f-4502-90b6-c77007fffc1f.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Alpaca-Llama3.1-8B/c3827ecd-d02a-4464-a098-110f4fb54516.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it-Philos/af9700fe-20c0-4b7c-9f3a-c4d78fab7911.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it/959a4e4d-211c-4e45-94f1-f8f877e0b36f.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Athene-codegemma-2-7b-it-alpaca-v1.3/96a8b3c0-d6bc-41fe-8967-0d798669aa8e.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/DeepPhi-3.5-mini-instruct/ed5d2ca8-d551-493d-8877-348204ef91cc.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/DeepThinkers-Phi4/04e20a14-8346-4801-8515-189861c857cb.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/FineLlama3.1-8B-Instruct/eec2da56-ba0a-418f-afe1-8a46882b9839.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-12B-v1.13a-philosophers/321cf68b-9220-4ada-89da-061341a20a9d.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-12B/86fda025-2345-4a40-9094-223b96b21f13.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200/3c734233-9868-4ba6-83c0-2b63f2ce8980.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta/7f5eca48-0ab9-4ef2-85c2-a7f1fe713afe.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2/f5e0e809-08b8-43dd-a44d-875f365610c3.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto/8d267135-a7e6-4ec5-ae09-66478804bb66.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/4940ed0e-2c1e-4408-9806-49ceed30a69e.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/5f6f7b7c-ef6a-4468-aae5-d7dfc25c5659.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds/5244ee3c-7d65-434a-acfe-cdb277ff5264.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code/eba4644f-d455-4a23-a16f-8ecb038ffe7f.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K/fb270319-7010-4946-b60c-409aebe41aaa.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT/d57bd77a-11cc-497c-b0bb-31c1ffa63dc2.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto/0220984e-fe8c-4e72-bc3e-92b949ffe769.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Math/16482634-ec03-463a-9deb-2230ee955800.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO/4c1db32d-96fc-4a66-b083-530a3e75ad6d.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Mistral-Nemo-Base-2407-v1-DPO2/c0c5c846-395a-47ac-9e8e-e598939f317d.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B-Medical-COT/6b3f6b59-a8eb-48c2-acbc-92e8f34b2dd6.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B/d017e3bf-2abe-4b84-810e-e0eaf973adc3.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-R1.1-Llama-3.1-8B/62a3ecb8-f6d1-429c-807f-5545b2a5897f.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Llama-3.2-3B-Agent007-Coder/748557ce-1a49-4b3a-9c38-9007dc04aafb.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Mistral-Nemo-Instruct-12B-Philosophy-Math/95d43d01-a75e-4af4-a2cc-b60f832071d3.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/OpenReasoner-Llama-3.2-3B-rs1.0/4dc7c889-7839-4047-b48c-33be5b688e72.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy/751851c8-9a7f-4135-a106-eab4efbd0734.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic/2930e30c-9f2e-4248-ae3b-ed7ffbd12f8c.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent/c1acc460-aeb8-4a99-8ca5-376ab60fb74a.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO/33b8b64f-7da5-45aa-bf80-7145ef704229.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT/2662d257-49e2-430d-b44f-b0b347c61271.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.2/870b639b-ee7a-4b13-872b-52657539c836.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.3/6ff20678-a335-4fa8-8126-9f96ce247f34.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO/19c4ea89-896a-4577-a386-c2470eaf743f.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1/22eb2479-16ff-4a56-b9e4-e8835da7ca0e.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math/aca3f1fd-9c46-47f6-81c6-dc56a702c1de.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-0/071ca686-5950-4af4-80f2-969b1008e370.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-Instruct-r01-Reflect/78977c34-33f8-4037-86e0-dfce1d01c3f8.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-R01/480e4294-c8d9-4088-9b8c-7a239d57f683.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2/be9b21e8-90ce-451a-bcaf-2ebc7c72bc34.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2A/b0054dd8-e62c-4d0c-9b18-090851c3a7e2.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2B/985e479b-658a-4548-9b5e-c9c04b8838c1.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2C/d0ef8af4-156d-456d-9e33-b2cdb3f8c04e.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1-V1/5050c787-2f95-4a17-a4b0-c094860627b5.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1_1/bb5c8274-4324-47f2-94c5-d0c831ce0de7.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Athene-codegemma-2-7b-it-alpaca-v1.2/8113a26a-5941-4f3d-872a-bdde5456ad97.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-12B-v1.2/5b60047b-2e85-4a47-a31f-4c07f4bd2c30.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1-8B-Philos/88d79858-3a35-43eb-8da6-95b80b5deef6.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.01-8B-Philos/63266a49-01ea-40f1-83ef-778f391aff2b.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.03-8B-Philos/f0da069a-833f-489a-a923-c79542a3a9a6.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.04-8B-Philos/205b9da8-d561-41ec-946e-1d2f9a43e437.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo/2ea4da56-4b95-4222-a4e2-f57c73e0ee4e.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math/c086f693-cef1-4212-9c17-669b210f4caa.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection/290995f2-9982-4f29-ac74-dc646905206c.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1/c60e65e6-d771-4c53-80d0-c1e09aa39377.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Llama-3.1-8B-Philos-Reflection/fcff202d-3b4f-4ba9-b3f6-1122d8abcac1.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-MathMistral-Nemo-Base-2407-v2dpo/5f0fa37a-e829-402b-b2ab-c68ffa248b6e.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math/a0b4a345-3530-4da2-8403-87259bbd1405.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT/3548f0ea-f3ab-4a0e-9c77-5ae62014ed44.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Phi-3-medium-4k-inst-Philos/707270e3-334b-4eba-84c0-2795ae53d79a.json
 delete mode 100644 data/hfopenllm_v2/Eric111/CatunaMayo-DPO/c827bee3-a181-42bc-9387-ca132d59c8ba.json
 delete mode 100644 data/hfopenllm_v2/Eric111/CatunaMayo/d3e8949b-f6f8-459f-891b-f4900ff806cd.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties-v2/35d5f5e3-74eb-4eea-9f78-b7b8969830a2.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties/4cf4479a-622a-4bc2-86f2-aa526216f24c.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b-Ties/6ed27890-3e61-4c7d-8c94-a78c0b34ba32.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/87b5e360-7867-4edd-b45e-e7bb92a91b69.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/d93116b8-28ff-41ea-8273-56f7ae11cf18.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Qwen2.5-7B-della-test/ba8c2c17-64f6-4cdb-b3b9-8977ce1bdbe2.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Qwen2.5-Coder-7B-Instruct-Ties/5e5602cc-b4de-4247-aa6d-940817fc849b.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Replete-LLM-V3-Llama-3.1-8b/cc5f27f5-36d8-49bb-9c9d-7879598bfe71.json
 delete mode 100644 data/hfopenllm_v2/Etherll/SuperHermes/aec03bd9-808a-4c3f-bbde-40bcac5775fb.json
 delete mode 100644 data/hfopenllm_v2/Eurdem/Defne-llama3.1-8B/b4ae6f0b-8a6b-4c60-8eb2-3e202877bcf5.json
 delete mode 100644 data/hfopenllm_v2/FINGU-AI/Chocolatine-Fusion-14B/c68deb4d-73a8-40ab-b4e5-1773b7ec4ed8.json
 delete mode 100644 data/hfopenllm_v2/FINGU-AI/L3-8B/a93c5674-599b-429c-a322-3c6bc7248f45.json
 delete mode 100644 data/hfopenllm_v2/FINGU-AI/Phi-4-RRStock/5e6374a6-56bd-4bd9-b04b-30ec9cf234bc.json
 delete mode 100644 data/hfopenllm_v2/FINGU-AI/Q-Small-3B/c3d2fc86-a5c4-4e92-bcf9-26096ca32ad4.json
 delete mode 100644 data/hfopenllm_v2/FINGU-AI/QwQ-Buddy-32B-Alpha/1b49cb06-3ee1-4945-aaed-12c868d9e45e.json
 delete mode 100644 data/hfopenllm_v2/FINGU-AI/RomboUltima-32B/65853bb5-ff3e-4880-8c32-ce9aabcadd7b.json
 delete mode 100644 data/hfopenllm_v2/FINGU-AI/Ultimos-32B/7fecc176-debf-4bf7-b3f3-479d05678a1e.json
 delete mode 100644 data/hfopenllm_v2/FallenMerick/Chewy-Lemon-Cookie-11B/3c965626-a264-40db-93e1-cd7659d0662e.json
 delete mode 100644 data/hfopenllm_v2/Felladrin/Llama-160M-Chat-v1/50fa6f0c-d689-4380-b619-253209b5badc.json
 delete mode 100644 data/hfopenllm_v2/Felladrin/Minueza-32M-UltraChat/adb25c88-6113-4307-bbf0-d377f757bc18.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/b9ac5e03-c878-4e46-a89c-1906f3b91dce.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/d6a6badf-4472-44b5-af9e-4282e4406a8e.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/92e62d3a-3091-4538-b6da-ba705e11687a.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/04f5fdc6-f1cd-4b2d-947a-86fee67b3b62.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/5013ccfc-6bc5-4862-898c-1ca781f92572.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb/38fff98c-72b1-453c-a2cf-cf077dd19d10.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed/42911928-ef64-474b-828a-02ce3383773e.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_selected/7989d7d3-c5e9-43c6-80a1-6de51533f9bf.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb/5b9acd52-7eb6-4099-98be-ecd6cae07835.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed/666bef5a-2d62-4743-bff1-07365716ab19.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_selected/85de411c-2308-4824-bd6e-3327eeb6fe3e.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb/df28c4c2-d6a4-4ab0-a1ac-faf00a93de99.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed/6fb37ad0-b41b-4ad7-91a2-79bbb835d445.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_selected/c41df02e-5aff-4de6-a1c4-d45b5585e29d.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed/aa587b4a-9c19-4231-ba72-9b66446460f9.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_selected/be14e75e-4fb1-41aa-b168-1ec23eb305e0.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb/73be4a2b-28c9-4208-8107-3734fea25008.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed/0bf2fa4e-3bcb-46ff-a068-f4c796123c6d.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_selected/9f8fc05a-8658-4ed3-994a-965e6882d242.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb/ced11f6e-490d-42e9-8f3e-00e22cfc2910.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed/70ba788b-fe8c-4667-a859-0fb122de22b9.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_selected/e93f2d5f-7ffc-44b8-b2dc-d07b73de44ab.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb/15cacfe0-bdfb-4b87-a813-bfa70ff71984.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed/cff00e2a-41e3-40d2-aab3-4bb3bd7d0d0e.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_selected/e1eab0cf-2c6d-44b2-8aaf-a75347741529.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2_pretrained_200k_fineweb/ed221db8-cf81-4257-8785-db9381eec5b7.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/b314468b-401a-4318-b022-c966bf3366aa.json
 delete mode 100644 data/hfopenllm_v2/FuJhen/ft-openhermes-25-mistral-7b-irca-dpo-pairs/a0dbb2eb-66c7-48a3-a85c-725b49141edf.json
 delete mode 100644 data/hfopenllm_v2/FuJhen/mistral-instruct-7B-DPO/812a36ec-4928-40a9-9aa8-ee39d7bb02f5.json
 delete mode 100644 data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_e2e/77af2424-0a23-49f3-97b0-316d04a33547.json
 delete mode 100644 data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_viggo/6f422676-2d7e-40ed-a5e3-4afc25564cfc.json
 delete mode 100644 data/hfopenllm_v2/FuseAI/FuseChat-7B-v2.0/43923dd6-838a-4259-a938-7766dfd9c07e.json
 delete mode 100644 data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.1-8B-Instruct/dba94a49-02b0-4e92-bd6c-c6bfc9be3cfb.json
 delete mode 100644 data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.2-3B-Instruct/16a782dc-0795-4281-aad6-4f664a0940ab.json
 delete mode 100644 data/hfopenllm_v2/FuseAI/FuseChat-Qwen-2.5-7B-Instruct/5d24d4ad-9f37-4634-ba23-74fbc74fd298.json
 delete mode 100644 data/hfopenllm_v2/GalrionSoftworks/MN-LooseCannon-12B-v1/043cd315-fcb7-4871-ae79-dee3fdefaef0.json
 delete mode 100644 data/hfopenllm_v2/GalrionSoftworks/MagnusIntellectus-12B-v1/3c377d7e-14bc-4c82-9ada-7560552abbe4.json
 delete mode 100644 data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-2-Merged/43bb650b-8bb7-41b4-866a-cb2dad1499d6.json
 delete mode 100644 data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-Merged/bdf8f907-37ca-41ca-9a4e-f4dd446f895f.json
 delete mode 100644 data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaUltra-Merged/14a1872c-7afd-4cd4-ad87-853e4fc0847e.json
 delete mode 100644 data/hfopenllm_v2/GenVRadmin/llama38bGenZ_Vikas-Merged/887e4ca9-ed48-4b33-b933-f8534a8d0377.json
 delete mode 100644 data/hfopenllm_v2/GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct/c585488d-4043-482f-b1fa-4a61e96f7f0f.json
 delete mode 100644 data/hfopenllm_v2/GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct/d64541f6-19ef-4f04-a991-93efec6fe24f.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1c13e194-8bee-4456-a249-f71e7e34b0eb.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1d3db737-20e7-4da1-a311-e60de0b41c93.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1/7b73d50e-358b-4961-8b58-63765ce5a82a.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2/81dfd69c-cf01-4114-8157-fd09af6f490c.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3/f38240ab-35e4-431e-b4d5-b1b0e1d57c5f.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-14B-Instruct-abliterated-v4/01863b4f-9550-49c3-ad83-74c0bb535eb9.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/edd25437-38bc-443c-9da3-bc041270447e.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/j.o.s.i.e.v4o-1.5b-dpo-stage1-v1/31836d43-5022-488f-ba9e-379195809069.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/josie-3b-v6.0/2a5a3ed6-7137-49e2-a141-497ceba88757.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/0b1c6aa6-b94e-4400-9b0d-c39aa1bcd808.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/69423132-adc9-4b97-b799-15f37de1d7e5.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0/54d5bf0f-7c4c-40b1-bca6-5484ef8e2a04.json
 delete mode 100644 data/hfopenllm_v2/GreenNode/GreenNode-small-9B-it/cfe8f9c7-e9bf-4a17-afa0-d5b8f46d24e7.json
 delete mode 100644 data/hfopenllm_v2/GritLM/GritLM-7B-KTO/7fbc0323-1c78-46b6-a08a-6e5870c64e53.json
 delete mode 100644 data/hfopenllm_v2/GritLM/GritLM-8x7B-KTO/1c769f0d-b99d-4b82-a529-f5264f7b3349.json
 delete mode 100644 data/hfopenllm_v2/Groq/Llama-3-Groq-8B-Tool-Use/a9365685-e299-48e2-931a-c63e123a9e00.json
 delete mode 100644 data/hfopenllm_v2/Gryphe/Pantheon-RP-1.0-8b-Llama-3/bdf2d61a-daa1-4b1f-9245-43ff263540fb.json
 delete mode 100644 data/hfopenllm_v2/Gryphe/Pantheon-RP-1.5-12b-Nemo/f0b4eef9-dab2-48e2-87f8-ad83ec33ec23.json
 delete mode 100644 data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo-KTO/29e10491-8c34-4b7a-a0bd-77f6ca0dc54c.json
 delete mode 100644 data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo/c588d86a-80c4-46d1-93e0-b7fa8491f3b3.json
 delete mode 100644 data/hfopenllm_v2/Gryphe/Pantheon-RP-Pure-1.6.2-22b-Small/0b11eb9a-61c8-4af1-8335-24bef2597e5d.json
 delete mode 100644 data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/7d31e5fd-700a-42a8-bea8-8989e8c52603.json
 delete mode 100644 data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/f993880a-3c7c-4af9-a3ce-3c27207b9a3c.json
 delete mode 100644 data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/2fae7e4a-8c28-4be8-9391-ca79077e32c2.json
 delete mode 100644 data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/436e651e-6f04-44ff-ab3d-db8ed0d639bd.json
 delete mode 100644 data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge/9fbccac2-c840-494e-a24d-a6f0c9a07b88.json
 delete mode 100644 data/hfopenllm_v2/HPAI-BSC/Llama3-Aloe-8B-Alpha/a4ee6a33-df51-4a4e-a13d-45488a094fd7.json
 delete mode 100644 data/hfopenllm_v2/HPAI-BSC/Llama3.1-Aloe-Beta-8B/a3923f10-e64c-4556-9616-4fe7072eff60.json
 delete mode 100644 data/hfopenllm_v2/HPAI-BSC/Qwen2.5-Aloe-Beta-7B/ca15d972-9075-42df-884b-5d069f6ff425.json
 delete mode 100644 data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1.2/905909a5-abef-46bf-9392-c97873e229df.json
 delete mode 100644 data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1/95bd05cf-8f59-409d-a99e-d249bad6c561.json
 delete mode 100644 data/hfopenllm_v2/Hastagaras/L3.2-JametMini-3B-MK.III/76b12246-33f6-4992-a0ab-38704dcf6345.json
 delete mode 100644 data/hfopenllm_v2/Hastagaras/Llama-3.1-Jamet-8B-MK.I/e4415806-0ec0-465a-b28f-9c8741436fb4.json
 delete mode 100644 data/hfopenllm_v2/Hastagaras/Zabuza-8B-Llama-3.1/98e62ab5-d35a-42dd-904b-bed9c50f3745.json
 delete mode 100644 data/hfopenllm_v2/HelpingAI/Cipher-20B/8fb3596e-224e-492b-bdb6-a95a16656eb0.json
 delete mode 100644 data/hfopenllm_v2/HelpingAI/Dhanishtha-Large/154203c4-d86e-4c36-806b-c45c5cc568ce.json
 delete mode 100644 data/hfopenllm_v2/HelpingAI/Priya-10B/e42c01f7-2869-4103-bbfd-81aa5a15c140.json
 delete mode 100644 data/hfopenllm_v2/HelpingAI/Priya-3B/323d2f94-5e04-4627-9f74-129217f53eea.json
 delete mode 100644 data/hfopenllm_v2/HeraiHench/DeepSeek-R1-Qwen-Coder-8B/6bcc284b-8973-47d5-b5b1-1abb7a3242ee.json
 delete mode 100644 data/hfopenllm_v2/HeraiHench/Double-Down-Qwen-Math-7B/691cace3-5316-4f5b-8693-67efb24a0a06.json
 delete mode 100644 data/hfopenllm_v2/HeraiHench/Marge-Qwen-Math-7B/d387b3dc-9e76-44a6-9a9f-132a4fd762b4.json
 delete mode 100644 data/hfopenllm_v2/HeraiHench/Phi-4-slerp-ReasoningRP-14B/f6f515d3-f5e9-4362-be51-bb8fc05527e6.json
 delete mode 100644 data/hfopenllm_v2/HiroseKoichi/Llama-Salad-4x8B-V3/2e1e215f-b622-439f-a13f-531441e25ae3.json
 delete mode 100644 data/hfopenllm_v2/HoangHa/Pensez-Llama3.1-8B/d50d66a9-a0c4-4b82-922c-9d012f1b50a1.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-alpha/ea7292a8-3f07-47be-b8ae-7d352ed1ecb6.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-beta/4eedd6d4-279f-4660-8d71-708a27bb53e0.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-gemma-v0.1/9c0f67d1-f95d-4ca0-a234-2e09ac788f55.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1/e5c0fbc9-f424-4b04-839a-8335adaf89cc.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B-Instruct/d91107fa-eb8d-4d01-90a2-fc9831f337b2.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B/926999bf-1ba6-4321-82b2-fcced4336739.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M-Instruct/57d481bf-0db9-4208-afda-dcd20df13964.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M/eb417e47-fe63-4dc5-b3e5-28782f3782da.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M-Instruct/b0f516dd-7185-4906-87a5-3c6f019894d0.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M/1e562944-a205-4ef7-aff1-3776595d131c.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B-Instruct/6ccaf08d-1b0a-4ca9-941e-a71e2dce5cb4.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B/2064938d-9f05-4740-a4d4-2a2da0eac21d.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/43240184-8245-43ff-a971-678523918fe0.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/b3b854b6-700c-4297-b335-6acc3c385f84.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M/a9d79c6a-f99a-4b60-8e37-ee2cdfe75f30.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/88e1dd78-d3bc-401b-88e9-d963bac181db.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/a41bd607-f319-4063-a6e4-813f43e40568.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M/8629aef1-c673-4b17-a9cc-b361a53bdaa7.json
 delete mode 100644 data/hfopenllm_v2/HumanLLMs/Humanish-LLama3-8B-Instruct/532c927a-dc0c-4e65-8ab0-7b9ddd889d89.json
 delete mode 100644 data/hfopenllm_v2/HumanLLMs/Humanish-Mistral-Nemo-Instruct-2407/843f9927-9865-4066-9cc0-f0522d3b914f.json
 delete mode 100644 data/hfopenllm_v2/HumanLLMs/Humanish-Qwen2.5-7B-Instruct/eeecb2cb-e286-443f-84aa-d825702a4ad8.json
 delete mode 100644 data/hfopenllm_v2/IDEA-CCNL/Ziya-LLaMA-13B-v1/36ab4f5a-b2cf-4d01-8283-9eaf2c90928f.json
 delete mode 100644 data/hfopenllm_v2/INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0/c4e810f1-ffb3-4ece-b445-64e339761530.json
 delete mode 100644 data/hfopenllm_v2/IlyaGusev/gemma-2-2b-it-abliterated/025725b6-0034-48c0-a720-5fc210e5e24b.json
 delete mode 100644 data/hfopenllm_v2/IlyaGusev/gemma-2-9b-it-abliterated/7bdd8928-c336-494e-9c87-de9ecc2749b8.json
 delete mode 100644 data/hfopenllm_v2/Infinirc/Infinirc-Llama3-8B-2G-Release-v1.0/ff7369dc-3ff2-424b-80b0-e06a141b54f3.json
 delete mode 100644 data/hfopenllm_v2/Intel/neural-chat-7b-v3-1/a6dc7253-75fd-4897-be85-8ac89fc11f8e.json
 delete mode 100644 data/hfopenllm_v2/Intel/neural-chat-7b-v3-2/296ceacc-542a-4000-bf9b-ae59b33a53ce.json
 delete mode 100644 data/hfopenllm_v2/Intel/neural-chat-7b-v3-3/13870577-7579-48b4-9c92-202318ca6ecc.json
 delete mode 100644 data/hfopenllm_v2/Intel/neural-chat-7b-v3/6ebd2806-2623-4773-93bd-1036ff01cb8c.json
 delete mode 100644 data/hfopenllm_v2/IntervitensInc/internlm2_5-20b-llamafied/99d6a44b-d556-4674-8ade-a5b30cf99255.json
 delete mode 100644 data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.5/605118a3-316a-46b5-9719-f596e361a2a8.json
 delete mode 100644 data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.7/271d2829-fbd4-438e-9f09-59539af68c8b.json
 delete mode 100644 data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/107bc549-75c1-4272-b567-f8ab9f6cd675.json
 delete mode 100644 data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/dfb451e9-c1c1-45a1-8082-155763366129.json
 delete mode 100644 data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/b2d80977-d079-42ec-b057-5aac530b9d70.json
 delete mode 100644 data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated/16b33b80-3b4b-4edb-b89f-3d93dca8969c.json
 delete mode 100644 data/hfopenllm_v2/J-LAB/Thynk_orpo/63c94e0a-4572-4b8a-bfe0-7f88bb847d7f.json
 delete mode 100644 data/hfopenllm_v2/JackFram/llama-160m/538f2b43-328c-456d-8a40-ff2b37924453.json
 delete mode 100644 data/hfopenllm_v2/JackFram/llama-68m/fb7a68e6-716e-48c6-96c0-d227735f9a7c.json
 delete mode 100644 data/hfopenllm_v2/Jacoby746/Casual-Magnum-34B/3593d4b8-5602-4cca-935f-a76e342f060a.json
 delete mode 100644 data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.1-2x7B/72d503fc-b221-498e-811a-a806769175d6.json
 delete mode 100644 data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.2-2x7B/ad7d9698-d9e6-4f2d-9767-987835626c8c.json
 delete mode 100644 data/hfopenllm_v2/Jacoby746/Proto-Athena-4x7B/98899942-fcf0-41de-8587-44d7429bea47.json
 delete mode 100644 data/hfopenllm_v2/Jacoby746/Proto-Athena-v0.2-4x7B/bb51eb59-88f6-49c2-814a-11b2c80313d0.json
 delete mode 100644 data/hfopenllm_v2/Jacoby746/Proto-Harpy-Blazing-Light-v0.1-2x7B/d8563f36-e299-4186-a5dc-9dae51824e1f.json
 delete mode 100644 data/hfopenllm_v2/Jacoby746/Proto-Harpy-Spark-v0.1-7B/43bc0528-7bc5-4eac-8848-c9995079450f.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-1epoch/ce19893b-a7e1-4f8e-96f2-eb9cee2afeac.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-5epoch/24629e14-d197-4a5b-adff-7840af652f22.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-1epoch/9c3ea35c-2cf7-4c31-8b83-c69df3cd9448.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-5epoch/46548403-6eb5-4f7a-874c-1327420f4cab.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-1epoch/0bd9c061-b7ee-4bc2-9deb-ea7eea012c49.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-5epoch/aa2fe858-111c-45e8-b0d4-0048d7fc7ef7.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1/ad03cae6-b126-4157-a225-9576e4d651d0.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1/0d57b65d-3dd4-4185-b8cf-531105e94b5e.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1/f8882044-6e71-4788-b2ee-f51f85e67ecc.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT/3c8f96c5-af91-4f41-a0b4-6e1b7d55d8ad.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-2ep/e26743b9-4caf-46f8-bd5a-7e4445c850b1.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-3ep/febd4016-3a30-4b26-93e5-f7b556781b9b.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-5ep/ae82125e-94ac-48ca-8240-807e4b7ef9a0.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4/5321fa0b-b010-4e1d-9f20-a97b56f4f937.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-2ep/d25a4602-ea50-4a53-952c-112ba250123b.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-3ep/232e3fc4-5cd2-4515-9e15-acd7d56bc34d.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-5ep/975f54fe-a581-4ce1-b0c1-7becb7605f09.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5/92ae4461-48bc-47fe-a3ad-ea4c3452d395.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-2ep/638e1cc0-9baf-4555-a278-4b21c46af86f.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-3ep/cef4161a-4e1c-4a92-bca8-b07f957a13b1.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-5ep/715b556b-2bc0-4864-b4b1-b7413a5d45bc.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4/7552ad5c-5d1f-478b-a931-036083b2954e.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam/7bb3ae9f-9bb3-4bf2-9d97-d7f4f30697ac.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam/821d67e5-da8d-4383-8825-3bfa72a91fc9.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam/c5bddcba-4a40-4fbb-93e8-aebd06a70a66.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam/dc35237c-606d-4609-927a-566bea767312.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam/3924d1af-e167-4186-a34b-d9b4b8c26d59.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam/f733c4cc-90fc-4b31-bed3-c57dba6d4b6a.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam/08f933a0-b096-4271-890e-0df7e20d1d20.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam/8434e448-ed77-45f2-9c31-39128912f842.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam/d801037b-1eb0-4058-9096-429e5237e015.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam/e0c46f18-598e-402f-8955-68e71fab67cd.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam/4b987cb5-cf7c-4866-8cf0-9926f78c2de9.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam/ec658058-1075-4918-9dc9-fc79d0dcf897.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam/b68baa86-3e1a-4888-98ba-2ecede79b4a7.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam/0b11c8ab-2cfa-425d-9d81-d999f94401db.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam/a3e48db8-3679-4f19-853d-82a73ef49400.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam/7dbf35b2-80c1-4181-80f9-850ea51cead2.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam/231f47db-1662-4313-9ff4-f32883f5615c.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam/c79df898-14c6-4f00-9f65-0d01cd34ed61.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam/2c52917f-c396-410d-bc78-c93c433797fc.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam/0f1d2925-4e1c-495b-94be-f3515fbd53d7.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam/5cbb1972-9895-4689-9f6f-7e0037829a78.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam/6bc42e37-1f31-47cb-97e4-9d0b28b53691.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam/a1573b95-59e6-4ae0-bc12-6ef6fee90b76.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam/78c61b39-3c76-4af9-8d5e-fcd67d6c8779.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam/e4c06400-da86-4448-b421-23476f50bdb3.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam/48f4c2a7-e819-4789-92ea-e02c5e92d3e4.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam/cd9cbbac-f1ca-4193-88cc-e5968cc1bb62.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam/ab3685ab-1795-4a0e-8ee4-4f509616d1b8.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam/9018f443-a63f-4e07-b10b-272f66d1eb0d.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam/548d1536-b941-43a9-a60b-ae5448b70933.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam/99853109-17d9-46fa-a502-e4c977c1fb8f.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam/e171a0a0-f46d-404f-84e8-539155284e17.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam/eadd93e5-5770-4d4a-a1b2-6e732a82ce34.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam/151cb8c4-0a7d-4886-80ea-560902e1f932.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam/1acb97c4-a9d2-4ec8-9486-77eb6857646c.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam/1d803ac5-3ca6-4cb0-bcd1-779eaea1562d.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam/81562e50-23c5-4ef1-b98c-b40625f3b8c6.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam/95fa292a-ee64-4844-9646-ce3cc7f730d2.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam/4d14c584-b5a1-41cd-9605-78088dfebd7f.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam/1415d3d9-d7f8-48ef-8a2f-aa675c4c14db.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam/4b0ab369-e72f-4229-b449-3a21ee9d2c95.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam/478b6c1f-3329-4c9b-9d90-59b8b551c1af.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam/212f8dd2-3c61-45bd-a3de-2326334feb73.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam/9251282e-f72f-406e-a2cf-e7063516f624.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam/91a3c739-7e16-4d21-8879-bb2fd4d4c6ad.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam/aaa78d8f-6050-4b5d-bb67-da6c9d1ee065.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam/1f0430fe-24ff-4ef6-8577-ee5bfa74f18b.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam/f374772b-2685-41e2-a455-9002e48e3739.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam/6db801f8-5253-47c0-b87e-6779bff42f6b.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam/0d704671-c0b6-4296-85b5-eaf972d6be6a.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam/7e31545f-0865-4843-914b-a71f8a84314f.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam/431c7130-5a19-4a71-8a92-fea9726769ac.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam/ca850c4a-14d0-4145-9977-0d33e6e3e362.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam/7389caa3-6d8f-43e3-b3f2-d9320e56f621.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam/1e822b0f-0d80-4613-983b-ebd2e6fbfcd6.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam/1206f592-e6f7-4e7d-83cd-cbe82b37ec58.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam/e4085c6a-bc16-4328-a724-4b9838b55faa.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam/b929b955-1fbb-43d0-add1-4d58fdc4097c.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam/df723a0f-9a32-42f3-9421-780159f7d821.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep/c1046d2c-0b5b-4ab7-b173-8d5b5ecbc07d.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-3ep/60c02070-7554-4764-8a02-841ca75a0d5c.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam/d243f226-149b-4824-837e-e80ab68bae9d.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep/4f9361d0-2ad9-44da-a1d9-876d43451ae6.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep/6c6e9ebc-f83d-48d5-b69f-be43d4167a0e.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam/7cd2c0da-15b8-4ad6-8cad-feb68631c079.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep/36b84cf2-d221-4e9a-b728-37dc2bf7e1d6.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep/1fd0d1db-1d75-4b10-bae8-33023c2c7466.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep/c6c02512-6c91-4818-a084-c48915fd83de.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5/326affa2-9ea4-4fc9-b60f-d2abeb7493c3.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-2ep/b3a190d1-5b86-4439-a21e-1f118239db82.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-3ep/b37a7db5-b26f-4a82-b27c-6c3a2ba72fda.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-5ep/05a59445-b816-4982-9b1a-1c2394ffbaa9.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5/ff952579-e92d-4af8-9497-f49fed5efba0.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-2ep/b541ede0-6de9-4557-8280-43567fd3dd96.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-3ep/8514f601-0bb2-4639-90cc-29e96088e7de.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-5ep/57e6d0cf-943a-4b83-a1f4-4f03b5066523.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5/ec205127-21c0-4edf-bb3a-ec8ccac4fcdb.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-DPO-1epoch_v1/14b260e6-4300-43ec-b7af-587a2f5b03fb.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-MDPO-1epoch_v1/53de1fc9-7097-4103-b731-588a7bf39f80.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT/1a1031c5-3ec2-4d12-93eb-e0a3b0448ed4.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam/51b62d59-f39c-49ca-af0a-73df6440e29d.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam/622a0ae1-0eb5-49f0-bc44-d396c7233e27.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam/71291a41-283e-42ca-b192-7b759e3c3712.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam/7e504fef-b304-4c1a-856d-06e56a8869d7.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam/f8258f5e-8826-4fe1-b9d3-61708e79d4ab.json
 delete
mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam/099ce031-1e11-4a07-bac1-03bef9b915d6.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam/75ff25fd-e5f7-4380-b192-cbc8a8ee95aa.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam/cbc43c7a-d8ac-4b03-a383-703f7fa51757.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam/72d7f252-1bff-40ad-9ec8-1ac2a2e02a8e.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam/5eb10878-11e6-43ad-9bb5-658a3495129c.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam/23b29cd4-cfd0-49f1-8959-c3aa8be9722f.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-6-3ep_0alp_0lam/03db2532-f8e0-41e9-ac0c-ff2913f4b12a.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-7-3ep_0alp_0lam/273f0d50-aa4e-4469-8360-2ce0a2e1a850.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-1ep_0alp_0lam/79a48e79-d59b-4f86-a8f4-3af174a9ee0b.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-2ep_0alp_0lam/9da9a0e6-257a-41f6-b3a3-e3279a4924db.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-3ep_0alp_0lam/dfed058c-48b2-4e1e-9a29-624771e3e9dd.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-1ep_0alp_0lam/bcb53a8a-1670-400c-aab6-bd8ed2ebcdf4.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-2ep_0alp_0lam/8438a108-0d5d-48b6-b73a-981d13329daa.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-3ep_0alp_0lam/88616292-1e38-4481-af30-6b60e28fb097.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-1ep_0alp_0lam/44094907-0b09-4706-a117-116a7e10a6e5.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-2ep_0alp_0lam/d19e8078-87e9-4760-9b91-6b5f478820e1.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-3ep_0alp_0lam/896464f1-01bc-4370-8d90-3368323b2908.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-1ep_0alp_0lam/9889f0b9-9051-485c-bd44-32b1e56b865c.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-3ep_0alp_0lam/6563ce79-6df4-4c78-89e2-064f1250d898.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam/b1778755-e6e6-47e2-925d-44d786c4ff62.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam/3ae923b8-e9f4-472e-8d5e-54fa5f42ce01.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam/40831e23-0a9e-4bdc-a365-9399b6b82ff9.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam/4a60fa82-34dc-4b0c-9102-65adac5039e4.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam/75ff2c43-dd19-48ae-9ba3-f99cdbadda1c.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam/d7962833-660a-4b9b-9836-8a2f3251f38e.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam/ad8ecabf-a868-496e-892b-582efb54fa6a.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam/49f25d3d-80c9-4723-8fa9-1501d44d70aa.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam/70ea520c-3e0c-4412-9dbe-40a00801335c.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam/8e7f8bad-812b-4f6c-8dea-1cf44584c300.json delete mode 100644 
data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam/3b39a8f0-c5ba-4f74-9d27-bf5b389e038c.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam/702a14d5-a7fd-4926-ab26-e4c3b7f5eda7.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam/20e5d087-7b20-4a39-81da-7334354b61f0.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam/4c5a769c-0472-402c-8e97-d24e5b302bac.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam/96166735-ed03-4931-81c9-d3daed1913d9.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam/06d9b1e3-d054-4fa5-bf1f-9d6149e5111c.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam/776fd8d8-9846-4359-97d4-2340425d1315.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam/197ae1c5-c9b1-4912-91a3-8ccacddc1be6.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam/1fffd3d9-1c6b-4965-84e6-980bb0a13af3.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam/57e8aaf0-f10b-4024-9f93-7b7f13f3ab10.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam/304d5bee-df2d-40fc-b4a0-e3d99178f4bd.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam/6126d30d-e2dd-4b8b-9cb3-acdc76084bbb.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam/fc7284d9-a73f-4562-a781-5cb87247183f.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam/26ab447c-a850-4197-983a-a0dca4532029.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam/ee9e2131-aa99-49e1-9814-f0664614354b.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_3e-6-1ep_3vpo_const/23c472f7-f060-4a69-8f72-12490675825a.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam/04172bef-c06b-4c08-b2af-9e1fe4d97664.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_10vpo_const/3436355a-d2fe-411f-a764-4cb8284deb4c.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_1vpo_const/265655c0-2ead-4dd7-8c7e-4bee69d51bce.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_3vpo_const/645cae82-9e7b-4d1b-b944-e3783089c1c1.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam/ab658117-7c6b-428f-8f60-bf88a1d8a5bc.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_1vpo_const/03c4b5ce-3b22-4d9f-bf60-b626b52a114b.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_3vpo_const/ce7e3a31-c65b-4521-b685-fcbd067c75d9.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam/adb53e2c-5dee-4840-8eae-e0186c6e103f.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_10vpo_const/ba89563d-f53a-4bf0-91e1-92ac950523d8.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_1vpo_const/3fc0ad8d-4bb2-401a-9baf-b94b39b7e1aa.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_30vpo_const/ed816bcb-bbe9-48ae-a6ac-3603779a985f.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_3vpo_const/f347ed24-066a-4cba-8478-f03628cb2b5b.json delete mode 100644 
data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam/ffddfea0-d17e-44e7-8931-a9601e9cb26b.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_10vpo_const/ec351fa1-78c2-48c6-83f0-7c2a9b2f0731.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_1vpo_const/a0038c34-130b-49dc-a93f-94706a3dad50.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_30vpo_const/cbd5ea42-1e5b-4984-bdcf-e60fbfb9d692.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_3vpo_const/b902e2b2-a0b3-4467-b076-b98717c40d74.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1/4c749665-59ff-49df-a193-0262f66e6003.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3/c99899c6-95e1-4dea-ac12-f8df49728a3b.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1/13deca9f-073e-444b-bf79-35e816f7c312.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1/c8adc0a5-f4bf-4f88-984c-aba506eae6a9.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3/b146daaf-ce1f-4520-bc19-21ce8679b220.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/Llama-3-Instruct-8B-SimPO-v0.2/45e1d037-1ed0-472c-a311-c651fde270fc.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun/3f4ce54a-01f3-4c23-a4ba-22d47e0344dc.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log/470d52be-9dbd-4714-b004-f65cc82d245f.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log/c836fd05-1969-439c-91e1-fd0cab816f6c.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4/14774c6b-eb03-4abc-92df-1e7a196ca8a4.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun/5293ae0c-8022-44d4-b2f5-4f5390dff93e.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log/9020f91f-a8f0-447d-af68-247aa81a25c6.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log/0cd6837a-8c3f-4529-9ea0-8755e1725467.json delete mode 100644 data/hfopenllm_v2/Joseph717171/Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32/7cb17011-cf77-4e86-b67f-84e6ff4b8086.json delete mode 100644 data/hfopenllm_v2/Joseph717171/Llama-3.1-SuperNova-8B-Lite_TIES_with_Base/086831f9-c677-428b-a997-4da58733633c.json delete mode 100644 data/hfopenllm_v2/Josephgflowers/Cinder-Phi-2-V1-F16-gguf/d71893b8-b82c-490b-a700-b579d64e0610.json delete mode 100644 data/hfopenllm_v2/Josephgflowers/Differential-Attention-Liquid-Metal-Tinyllama/9893689f-c27d-4148-a27f-cd07b07e98b7.json delete mode 100644 data/hfopenllm_v2/Josephgflowers/TinyLlama-Cinder-Agent-v1/90f2df23-a9ec-44be-ade5-89b59cb7368a.json delete mode 100644 data/hfopenllm_v2/Josephgflowers/TinyLlama-v1.1-Cinders-World/afd545da-390a-478a-b0f5-ea819f088f27.json delete mode 100644 data/hfopenllm_v2/Josephgflowers/TinyLlama_v1.1_math_code-world-test-1/ce776f68-856f-4aee-b7e4-e55d15e8d714.json delete mode 100644 
data/hfopenllm_v2/Josephgflowers/Tinyllama-STEM-Cinder-Agent-v1/9b015729-524c-44f3-9c2c-c42981d7a61e.json delete mode 100644 data/hfopenllm_v2/Josephgflowers/Tinyllama-r1/56a54ffc-4692-496c-95df-8e4ad19d4d95.json delete mode 100644 data/hfopenllm_v2/JungZoona/T3Q-Qwen2.5-14B-Instruct-1M-e3/4b105969-2ce5-4c62-89ef-efd392c2ca89.json delete mode 100644 data/hfopenllm_v2/JungZoona/T3Q-qwen2.5-14b-v1.0-e3/31af79b1-48c1-4399-9d16-8582c92996ee.json delete mode 100644 data/hfopenllm_v2/Junhoee/Qwen-Megumin/59a67f29-cb7d-497c-b7bb-1764a665ae33.json delete mode 100644 data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-1415/fe57367c-74b7-483e-af54-4f404cbea75b.json delete mode 100644 data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-500/fda2277b-1513-416e-b586-ed05920a0bb4.json delete mode 100644 data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-final/b3dde216-f80a-4664-aadc-b5f5dd3e5895.json delete mode 100644 data/hfopenllm_v2/KSU-HW-SEC/Llama3.1-70b-SVA-FT-1000step/07ed6241-fd1a-46eb-91fd-92a4a8f6bd15.json delete mode 100644 data/hfopenllm_v2/Khetterman/DarkAtom-12B-v3/ba76c356-cd6a-4636-8ab1-18bb9df69881.json delete mode 100644 data/hfopenllm_v2/Khetterman/Kosmos-8B-v1/c6ae54a1-2821-48d1-b689-bbb85aaa70a6.json delete mode 100644 data/hfopenllm_v2/Kimargin/GPT-NEO-1.3B-wiki/6f296f0e-80ca-49b7-94e7-cb45b795c715.json delete mode 100644 data/hfopenllm_v2/KingNish/Qwen2.5-0.5b-Test-ft/b5509e11-820a-4ad4-8c6a-0294762502a8.json delete mode 100644 data/hfopenllm_v2/KingNish/Reasoning-0.5b/90d73665-8d83-4e74-ab7d-29b1d3b6181b.json delete mode 100644 data/hfopenllm_v2/KingNish/Reasoning-Llama-3b-v0.1/72387647-cbac-4b72-9c22-db7029a39457.json delete mode 100644 data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.1/6219ec01-4b6a-4acd-aee1-96c3e8e48643.json delete mode 100644 data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.2/5c323d7c-25cd-4718-8a1f-54d986cadaf2.json delete mode 100644 data/hfopenllm_v2/KingNish/qwen-1b-continued-v2/adfab21a-941b-4efc-8b63-fdfb3074ba9b.json delete mode 100644 data/hfopenllm_v2/KingNish/qwen-1b-continued/350d00a4-7501-4130-a069-323530bc9729.json delete mode 100644 data/hfopenllm_v2/Kquant03/CognitiveFusion2-4x7B-BF16/ea809d28-178e-4a0b-ab5a-34739077c5ff.json delete mode 100644 data/hfopenllm_v2/Kquant03/L3-Pneuma-8B/243d5ccd-58f3-4da5-8718-553f3f456490.json delete mode 100644 data/hfopenllm_v2/Krystalan/DRT-o1-14B/a45537a7-76a6-4855-b83b-abe965f13460.json delete mode 100644 data/hfopenllm_v2/Krystalan/DRT-o1-7B/9be911b6-b9f4-47b1-849d-62eb20c9e944.json delete mode 100644 data/hfopenllm_v2/Kukedlc/NeuralExperiment-7b-MagicCoder-v7.5/33d7d5f0-cbee-4a26-b5e8-48bdd12492cf.json delete mode 100644 data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-DT-v0.1/4355fbdd-ac72-4f26-8e07-b7e8d774d238.json delete mode 100644 data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-ORPO-v0.3/4bffc633-e20c-4874-b7db-d1b7dabb8070.json delete mode 100644 data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.1/2d5c844d-d950-4254-bac2-0a986659c541.json delete mode 100644 data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.3/f6e74b3c-9ee4-40c3-bf92-35d965503a04.json delete mode 100644 data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7b-v0.4-slerp/8f1d2600-7347-48b8-9759-11570598459d.json delete mode 100644 data/hfopenllm_v2/Kukedlc/Qwen-2.5-7b-Spanish-o1-CoT/cd653bfd-2c06-4224-aeeb-bf591995a69e.json delete mode 100644 data/hfopenllm_v2/Kumar955/Hemanth-llm/cdf1fcc7-429d-44bd-b76c-d26ee743f6fe.json delete mode 100644 data/hfopenllm_v2/L-RAGE/3_PRYMMAL-ECE-7B-SLERP-V1/4828bd36-5453-4383-8985-08d04a7ebecd.json delete mode 100644 
data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki100p/4c2baa59-c2f1-4779-9d21-1f69c0821968.json delete mode 100644 data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki10p/555c1079-c4d0-4b9e-9d2d-769e7ba32429.json delete mode 100644 data/hfopenllm_v2/LEESM/llama-3-8b-bnb-4b-kowiki231101/58a4a1c6-0ee4-4524-9ca1-b40870f1d600.json delete mode 100644 data/hfopenllm_v2/LEESM/llama-3-Korean-Bllossom-8B-trexlab-oki10p/eea2a38a-4f1b-48d0-894c-09974894f264.json delete mode 100644 data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/3d8063ab-0ad5-43e4-83ff-90b46dee766f.json delete mode 100644 data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct/da5e0284-7c44-42d4-a110-a23880de277f.json delete mode 100644 data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-32B-Instruct/bef017bb-47b1-48e4-93c4-3b222a16af7a.json delete mode 100644 data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct/401c83b0-b7d2-4987-9e46-f127fdbb595f.json delete mode 100644 data/hfopenllm_v2/LLM360/K2-Chat/c6fde59b-73ed-4179-a907-076be068b262.json delete mode 100644 data/hfopenllm_v2/LLM360/K2/90997fea-6c67-493e-bd8e-5327cfb33ea4.json delete mode 100644 data/hfopenllm_v2/LLM4Binary/llm4decompile-1.3b-v2/08957d63-7462-44ff-9dd8-060a5801a31b.json delete mode 100644 data/hfopenllm_v2/Lambent/qwen2.5-reinstruct-alternate-lumen-14B/a434f569-e7d6-4464-afa8-6104be43fa06.json delete mode 100644 data/hfopenllm_v2/Langboat/Mengzi3-8B-Chat/e32ed251-e817-409f-b4c3-8f168f1ff822.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBA100/1d9a65a3-d2bb-48a7-8a00-8e4a79c36db2.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.0/608398da-ae2a-4be2-aaf9-6ec8899aa63d.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.2/80e04641-be7d-4351-a4f6-1318981ef834.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.3/e74222c6-636c-4075-8d4d-30c73fa70fda.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.6/aed80361-9304-44a0-934a-52976d7f1bf3.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.61/709bd280-b03e-4908-808f-34566bc968f4.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.62/66c495b3-4b09-42ad-b742-4d753c3bde7a.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.63/e24f7be6-3051-4990-8b93-121aec5402eb.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.64/0321571b-4246-4490-bd6c-7b106eb8e15a.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1/54dbf947-ab18-40dd-9cd7-a496289b2e72.json delete mode 100644 data/hfopenllm_v2/LenguajeNaturalAI/leniachat-gemma-2b-v0/d841e204-ed6a-439d-8408-d5cfb3b38dae.json delete mode 100644 data/hfopenllm_v2/LenguajeNaturalAI/leniachat-qwen2-1.5B-v0/96b57891-83e3-4948-ad48-64a2a370e166.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/CheckPoint_A/30301818-6dad-45f9-acfb-a68ccc7c0609.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/CheckPoint_B/50743107-30de-4c5d-bf83-cc003af8a5db.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/CheckPoint_C/625ee1b3-e0a1-4a86-83a4-6e66b380f864.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/CheckPoint_R1/89fda762-1989-4850-837c-f79ef538c58c.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/LCARS_AI_001/1de1f906-0e36-4f79-b159-16ef8ee33ab3.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/LCARS_AI_1x4_003_SuperAI/d8588222-9e4b-47c1-9f86-92f47c9c8e38.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/LCARS_AI_StarTrek_Computer/15e6e6e6-39fa-424f-ba12-5f209cd4b2cc.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/LCARS_TOP_SCORE/81225b85-1523-49c1-b770-897112d2e6ae.json delete mode 100644 
data/hfopenllm_v2/LeroyDyer/Mixtral_AI_SwahiliTron_7b/254deaf7-a253-4d41-a10d-1143f86b288c.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI/ba0b66f5-724a-4a6b-ac20-a36d530a8b4b.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI_001/eed0b3b4-e277-49ee-aed5-f3599b2d5653.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_CyberTron_Ultra_7b/96a21b6e-ed47-40fb-85cd-15924330e60d.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_001_M2/f41f5471-6384-4510-85d2-41f236082583.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_002/2728eccc-525f-4350-901b-dbc352c78014.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_001/3e7ae935-46c3-427c-8713-41c659c1828a.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_006/66782676-c942-4aff-b754-b96cd96cf1f9.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_007/941a9e27-2ac4-4dab-a6d0-cb9319c79a27.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_009_CHAT/caf93f75-530e-4f4d-9cc0-2cf9b0a7f2ff.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_010_CHAT/d3ca0458-ee97-4a4c-a6a9-066880ffefb5.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT/615bf89b-9357-46f4-82ed-f49b0021da01.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML/06398630-23ad-4000-8ea2-fcca230568d7.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1/bdfa30f8-da0f-418f-adaf-caafda4c81a5.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/bd5e550c-5355-4e01-bafc-2ca89899253a.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/f842ad5b-24f0-419b-9d65-5a6ff1f5e04b.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_MX/3a09590f-28f3-4161-8a93-d42cec62aa90.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/0f6b76ca-c4b8-40b2-a3af-2ea1c3650933.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/f276ad54-4e3b-4718-ae1f-0479565e4565.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_RP/dec20396-6555-4773-bf02-2cd1fcedda89.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_TextVision/eebc33e1-0016-4adf-815a-72653a34c01b.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M1/803c3898-c1a6-4832-ac3a-a86139489810.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M2/bfaa3d3e-66fd-4477-85af-4b83f13ff05b.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M3/99debdd2-1dea-4eb6-be5c-c144656cfe20.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_12/ad67bb88-7f74-4eb4-b771-0b3b60be4416.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_14/af2f579d-1e8a-47d8-8e44-a599bee83e37.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_001/763c840e-ea73-453e-8e54-5f4fd6fda9cd.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_002/4fb40ac4-a637-4b9a-b69d-ba551c0f0938.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MUSR/ffc4ef41-4a28-4816-be54-8ffd8e153073.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MasterCoder/f75fe902-f1c7-4e6c-87d6-128688db8d94.json delete mode 100644 
data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_001/dbd3098b-4532-441b-a81c-072c52579be6.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_003/438e4aa3-5e02-446e-bd3a-07ef724d24ff.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent/027fdc55-61eb-416c-b6ad-4408912d151b.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Student/37a4895d-def5-494d-9b62-d8c97ba9350b.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Teacher/0d53c27e-962c-428f-b540-35ab027883a8.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_001/6f7b2d91-24d6-442c-93a5-9afc88e9a308.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_002/21793520-7d1a-4040-bb96-fa7fe98ae580.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Coder/59d53c40-5b16-4a70-a693-5fb554cf7614.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Math/b28a569c-6bdf-4547-a2ce-c3e224764be3.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_MathMaster/2de129c8-2259-4367-a619-85d9e8f61e06.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Student_Coder/c242030f-fb2b-42dc-a5d1-687273b17282.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Teacher_Coder/3b3fdb16-b6e1-40c8-9ac0-02f1f2207eb7.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Top_Student/ef6e8e0d-7ba4-45ea-aaf7-617f68f2e97c.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X1/f8c131a4-1fee-4694-8753-88853418ef4b.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X2/27dec9ff-fb18-43dd-949f-7c0587a5858f.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_RP_R1/060df34d-ab67-43e1-bd56-ebaceb77abd3.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_BIBLE_002/a6357673-3daa-4593-8593-2b65a7d5477e.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatML_002/121d4877-1955-48db-a23a-6b0ad0623b9e.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA/1f1eab02-219e-4ad8-af50-e103541e1c9d.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA_003/b4cccfb3-1c17-48a3-a211-a26c44de757f.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_TEMP_/05e97a86-681d-42a2-8a47-beade25d8fc9.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_Top_Teacher_/6c0899b4-f066-45f6-827d-11c535ef0634.json delete mode 100644 data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.0/f9660557-b9f6-4ecc-b260-c245f0e62b5b.json delete mode 100644 data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.1/89168032-5840-4c2c-821e-b3d717ade46f.json delete mode 100644 data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.2/10d0aa63-67d9-4dba-9bdc-db7ab3b4547d.json delete mode 100644 data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V1/6f66ae5b-8cb6-4263-98a4-4a1eddfaca10.json delete mode 100644 data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V2/5e715199-7030-47b4-89c6-83ba0968c07c.json delete mode 100644 data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V1/3fca39e8-443d-47da-a858-83a68c18eec9.json delete mode 100644 data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V2/b7518bd2-d3af-49e6-823a-f8d507e8e60f.json delete mode 100644 data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V3/fa399f16-1652-430c-be19-afaf5ab96be1.json delete mode 100644 
data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP/cbe5032b-122c-4a0b-a099-50e998a4bc77.json delete mode 100644 data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-1B-SLERP-V1/fd8c3209-dcc0-4d27-a3aa-d0f76ef86f8d.json delete mode 100644 data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-7B-SLERP-V8/1a18d49c-ad7b-4823-abbc-7191e9d659cd.json delete mode 100644 data/hfopenllm_v2/LilRg/10PRYMMAL-3B-slerp/9e2c614e-1104-43a6-9e8f-b7851562e01a.json delete mode 100644 data/hfopenllm_v2/LilRg/ECE-1B-merge-PRYMMAL/7d4b83ab-9c9d-46e5-8cbf-b8afcf781230.json delete mode 100644 data/hfopenllm_v2/LilRg/ECE_Finetunning/a42b5d7e-be7f-4cde-aaf0-001e2cf05a44.json delete mode 100644 data/hfopenllm_v2/LilRg/PRYMMAL-6B-slerp/21f6688c-be52-4352-9c95-d37c0a5f6c94.json delete mode 100644 data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V3/e92ba586-7bee-4a9b-b388-e35efde3d36f.json delete mode 100644 data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V4/45ed0bb3-efbf-4a32-9735-d814aa08790a.json delete mode 100644 data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V5/eff28375-89a7-4970-9342-428b07d0c6f4.json delete mode 100644 data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V6/23877e30-b8fb-45ea-a803-47df757ea909.json delete mode 100644 data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V7/8bc25d04-9cc5-4551-a9c5-ce185c7ad974.json delete mode 100644 data/hfopenllm_v2/LilRg/PRYMMAL-slerp-Merge/d2d4b5a5-109d-4d26-a166-3d97b341584e.json delete mode 100644 data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v2-merged/ac404d92-7a06-4758-ab1d-fcf840c2b995.json delete mode 100644 data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v3-merged/95ea7fbf-d3f2-4fc1-ba17-05549f6e4d25.json delete mode 100644 data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged/c101e272-24d2-44db-9b0f-2ed4d17cec41.json delete mode 100644 data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged/2cb789c7-dddf-42b2-8fdf-4cbd5132946c.json delete mode 100644 data/hfopenllm_v2/LimYeri/CodeMind-Llama3.1-8B-unsloth-merged/a414aefd-ce24-49a9-b431-0c6014ebfbd8.json delete mode 100644 data/hfopenllm_v2/Locutusque/CollectiveLM-Falcon-3-7B/91fcb6a3-d351-48c8-87e8-e2a06642e925.json delete mode 100644 data/hfopenllm_v2/Locutusque/Hercules-6.0-Llama-3.1-8B/3cd90efa-ddf0-43c4-884c-84337ded14b2.json delete mode 100644 data/hfopenllm_v2/Locutusque/Hercules-6.1-Llama-3.1-8B/c66c21e9-a332-40f9-ae87-bdd78a25d753.json delete mode 100644 data/hfopenllm_v2/Locutusque/Llama-3-NeuralHercules-5.0-8B/0b4def91-29df-45d9-8dd4-c4097ec47ba3.json delete mode 100644 data/hfopenllm_v2/Locutusque/Llama-3-Yggdrasil-2.0-8B/2cbf258c-369e-4b1c-863f-43cf97c3a7a4.json delete mode 100644 data/hfopenllm_v2/Locutusque/TinyMistral-248M-v2.5/8372889e-f9cd-4cf7-aec0-8e18d5c627e3.json delete mode 100644 data/hfopenllm_v2/Luni/StarDust-12b-v1/ce4cc270-57da-4d08-9130-62508b409cb2.json delete mode 100644 data/hfopenllm_v2/Luni/StarDust-12b-v2/4cfedb8f-0e47-4008-9bc5-fb15e4afa607.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v3/de3c949d-bab5-4430-bdd1-48e1b7860934.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v4/011e53cd-409f-479b-9c3d-bfce75a1277b.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v5/1ff40e45-5be4-4625-9f66-5599a829903d.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt/fed97d94-2949-4383-8f25-fa79bd413508.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6/f4820bc8-7dfd-4439-af95-21b6cc9367ac.json delete mode 100644 
data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase/36e576bb-de50-49ec-a91f-f134c11bbe38.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7/0edd388b-7a1b-4334-9b72-52d84653ff67.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.5/b3199674-328e-41a0-9aa4-bf39aec735bc.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.6/52db4d79-7040-4525-934e-0f33e4acec63.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.7/ee34821e-9182-433f-a8b0-745711e23738.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.8/10ef0990-5356-432f-b24c-dd107188ec5f.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.9/47de680d-33b1-4441-92da-4b97a5fc513f.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8/96ac0351-2ade-4d76-bcf9-bc0f633f8694.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9-stock/31aae266-c14b-451f-8bab-62ee7d5d382e.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.1/f6edb102-e867-46d1-afdc-3c45166bd510.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.2/8b7756cc-9af3-4f98-84ac-7fef4c1bdaa0.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9/dcf33a22-5e57-4476-a2cb-ebd60407a920.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-OriginalFusion/15659480-be0b-41c8-a463-873be444b194.json delete mode 100644 data/hfopenllm_v2/Lyte/Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3/0444c1bf-a3d3-4d23-bc6c-0a98c4dc1e9d.json delete mode 100644 data/hfopenllm_v2/Lyte/Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04/93aa3a13-5069-410f-a1df-6944e0231e0e.json delete mode 100644 data/hfopenllm_v2/Lyte/Llama-3.2-3B-Overthinker/427ea7d0-c1f1-4cfe-b6a7-555262a7a317.json delete mode 100644 data/hfopenllm_v2/M4-ai/TinyMistral-248M-v3/c6dbe372-7a3c-487c-87c0-fb324c39f8c9.json delete mode 100644 data/hfopenllm_v2/MEscriva/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/cf8d99c8-8790-4bdf-bfc2-1a6d1fe35916.json delete mode 100644 data/hfopenllm_v2/MLP-KTLim/llama-3-Korean-Bllossom-8B/5b5d42d7-8012-46f1-826f-32d839806048.json delete mode 100644 data/hfopenllm_v2/MTSAIR/Cotype-Nano/5e1bf2cb-55c4-4806-89af-cb9953c7c1b1.json delete mode 100644 data/hfopenllm_v2/MTSAIR/MultiVerse_70B/21ee4b33-9829-4cca-9603-c30fd4a1f7ff.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.1/c6c14a8b-0e9f-4b97-b9f3-27c7250fb8f2.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.3/6586fa94-9f43-4814-8c8a-8ed244ac94e7.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/df7d7db2-867e-47f0-9abf-d71b79e97630.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/e2502e7e-3a10-49f3-b5c6-b20496fed998.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.3/51cde18f-09b0-4b66-a962-811ee49e192f.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.1/4ea48b42-8026-4799-b35d-46757fd2753f.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-v0.1/52e9b4ae-9119-4f26-87e4-6532d1148ecd.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-Chat-v0.1/4bda68c0-cc09-4945-961b-48776b7b5fc8.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-SFT-v0.1/18ea0ad0-a216-4906-a96c-c8b040398dbd.json 
delete mode 100644 data/hfopenllm_v2/MagusCorp/grpo_lora_enem_llama3_7b/1e2321f6-93bd-4acf-9f5b-c82807a40233.json delete mode 100644 data/hfopenllm_v2/ManoloPueblo/ContentCuisine_1-7B-slerp/13032961-52a1-43cf-b69d-1802c43e1bcc.json delete mode 100644 data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC2/9d444061-2c29-499a-8906-77ef58aba34d.json delete mode 100644 data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC3/1ffdf6b0-b3a3-432a-a0e4-69b4d447bb76.json delete mode 100644 data/hfopenllm_v2/MarinaraSpaghetti/NemoReRemix-12B/8ce733ea-e6e9-4f9b-ab28-f93202507265.json delete mode 100644 data/hfopenllm_v2/MarinaraSpaghetti/Nemomix-v4.0-12B/0e88aa91-609c-4d2d-9296-25b06eeb0342.json delete mode 100644 data/hfopenllm_v2/Marsouuu/MiniMathExpert-2_61B-ECE-PRYMMAL-Martial/3e235ea0-3f04-4d99-9db2-7cafcbdbac6f.json delete mode 100644 data/hfopenllm_v2/Marsouuu/MiniQwenMathExpert-ECE-PRYMMAL-Martial/5e31a55c-f222-4192-b031-27bb40ba56fa.json delete mode 100644 data/hfopenllm_v2/Marsouuu/MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial/11fd4b70-4ea7-4bee-8caf-8921d4c89f24.json delete mode 100644 data/hfopenllm_v2/Marsouuu/general3B-ECE-PRYMMAL-Martial/8e721067-898d-45ca-b4f5-9f523c4ce3d3.json delete mode 100644 data/hfopenllm_v2/Marsouuu/general3Bv2-ECE-PRYMMAL-Martial/be5d5480-ce4c-4ade-8c6a-c08cd2826909.json delete mode 100644 data/hfopenllm_v2/Marsouuu/lareneg1_78B-ECE-PRYMMAL-Martial/54dec074-29f8-4863-be37-2c08f6f2c3cb.json delete mode 100644 data/hfopenllm_v2/Marsouuu/lareneg3B-ECE-PRYMMAL-Martial/88a15025-556b-469d-be77-c773f2c61038.json delete mode 100644 data/hfopenllm_v2/Marsouuu/lareneg3Bv2-ECE-PRYMMAL-Martial/b4f4596b-17e5-40bf-ae60-0b17492ba9f8.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.1/97ce858e-a64f-4881-b6d0-0a2c0814336d.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.2/1becd83e-e9b8-49c1-a137-80c5a8dbdf0d.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Llama-3-70B-Instruct-v0.1/337bb321-9c6e-4751-9c9b-d8ba0120dd07.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.10/cfa95cc9-5bb1-4921-97c7-078f2f929a2f.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.8/6d5ba3c4-a0c2-40cd-9766-68d36d21c5b6.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.9/6cc4404a-f3e1-47b9-b56b-34e4269e1261.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow/8d820e43-ff42-4247-9ad0-4ed8e70672b4.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.1/d858ce8e-6a4b-46b1-8d51-03ebc2d8aaec.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.8/9813dd88-ff70-4d9e-86c5-9b73444275c5.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.1-llama3.1-70b/ac677432-e7d1-4439-9c05-426059c285ef.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3-4b/018f270f-3cfe-403c-a236-483038a0b04e.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3.5-4b/718a40ea-26b1-4cf4-9584-57be798640ae.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-72b/207a28a9-ae24-4a31-be95-96296b2e466d.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-7b/72efedb8-d456-41ed-b1ae-4887cb6c18f8.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2.5-72b/ac91fb37-5742-4a3d-b93a-86c63b90cad5.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.1-rys-78b/c71d025d-e954-4420-b397-e07c3644d1f4.json delete mode 100644 
data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3-70b/968c3759-de5f-4255-ba95-cafc7a3c70a7.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3.1-70b/5e23b2f7-33f7-4e49-b73a-a02b8650ee0d.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.2-phi3-4b/1b6c64f6-acf8-4cff-bcae-6e8b3725c6f1.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-72b/7908f572-8886-4add-ae84-b4ec0ec17c26.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-7b/9e04ec5c-2208-4569-9b63-4768ed4262b9.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2.5-72b/ee2c8beb-6566-4b19-91d0-8e48c12a3fdf.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.2-rys-78b/c7579616-0c21-443a-a149-0c51a0ae92ac.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3-70b/ef7a1429-db2f-433b-a606-339a9d868e7a.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3.1-70b/f531e13c-79ed-45da-a246-857fd2c884c1.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.3-phi3-4b/0f525d93-663a-442c-9a51-1ad3a5054172.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-72b/15af21e1-3193-47fa-a3fc-1f087216d4d9.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-7b/67b270d9-3422-4770-9957-7bde65acca0a.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.3-rys-78b/e2d38bcc-9133-4051-82d0-4e4fd66e00f8.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.4-llama3-70b/4ff256af-73c7-4a5a-96da-19546a786c59.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.4-qwen2-7b/225cbeef-1d0d-40fc-949d-4ba6696fb690.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.4-rys-78b/24fcd662-5abb-4bf8-b8df-1c21b048cd92.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.5-qwen2-7b/7badcb45-7826-4fd1-b964-c697fbda76cc.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.6-qwen2-7b/bfb532f1-3319-46ff-80ae-0ca783a18bb6.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.7-qwen2-7b/ea304515-b41f-4e96-a0ec-78c897ebf9a4.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.1-baguette-3b/1fe79ea5-1922-4a5e-8857-1c832353b0a6.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-3b/9098d70f-cbcd-4f6c-bcba-0b1da743396e.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-78b/df4ed9e0-30bc-4a3f-b7a2-8955cbb38d31.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.1-llamaloi-3b/f68957d5-20a1-438f-9931-6a787aaed467.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.2-baguette-3b/416e0c04-9119-4230-ba71-b0f47e2d4997.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-3b/d57780e2-154e-437d-ac2f-0007e1f9140e.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-78b/027d464b-1375-4de7-aa57-e1473d16ba89.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.3-baguette-3b/a81f20fa-57e8-498c-a162-6d8a9be09ee6.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.3-instruct-3b/d72ddbff-8ff7-446f-a74a-10a46bce6e3e.json delete mode 100644 data/hfopenllm_v2/Minami-su/Amara-o1-7B-Qwen/f681d612-f574-4641-b34e-95b6de97f9e8.json delete mode 100644 data/hfopenllm_v2/Minami-su/Amara-o2-7B-Qwen/cae1adaf-e424-4dcd-943b-5bbb708aca57.json delete mode 100644 data/hfopenllm_v2/Minami-su/test-7B-00/969ac825-92f2-448c-899a-226e69dee377.json delete mode 100644 
data/hfopenllm_v2/Minami-su/test-7B-01/e108ad28-c155-4162-852c-0f588a136bdc.json delete mode 100644 data/hfopenllm_v2/Minami-su/test-v2-7B-00/93cfeba9-7d31-45b4-a6e2-99a5f318f5b3.json delete mode 100644 data/hfopenllm_v2/ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1/c1b16b84-9392-48f3-b483-0a9786925506.json delete mode 100644 data/hfopenllm_v2/ModelSpace/GemmaX2-28-9B-v0.1/b0c6e08d-b426-49d5-8a66-ee3d70131b62.json delete mode 100644 data/hfopenllm_v2/MoonRide/Llama-3.2-3B-Khelavaster/6a6651a3-b34e-404d-ac25-42c151fb9ba3.json delete mode 100644 data/hfopenllm_v2/Mostafa8Mehrabi/llama-3.2-1b-Insomnia-ChatBot-merged/da63b789-5571-4ed8-976e-146d385b18e2.json delete mode 100644 data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLong-8b-v4i/87b900e7-3bab-4e60-b0ef-349667cb2656.json delete mode 100644 data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLongBASE-pt8-unaligned-8b/c9fd4740-4990-4174-b782-9b63c34d6407.json delete mode 100644 data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1211-3B/2582a049-e940-408b-b2d9-7a7bdf470e49.json delete mode 100644 data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct-V2/99310118-d2ec-4647-85db-fcc22aee9161.json delete mode 100644 data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct/bedd12e4-da18-4ca6-ba51-6d13e1c80bae.json delete mode 100644 data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1222-3B-Instruct/6767e14a-bbfa-4a0d-8120-1f48a565474e.json delete mode 100644 data/hfopenllm_v2/MultivexAI/Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF/70260aac-1bbf-4913-9dcc-58633d055314.json delete mode 100644 data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1.1/fba6e1a2-c197-4731-91ea-f6d059ba8b16.json delete mode 100644 data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1/22e74d0c-70d6-43c5-be4d-62842d93fedf.json delete mode 100644 data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v2/f7c33065-1da1-4da4-81c7-f2c9307b6e9b.json delete mode 100644 data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v1.1/ecdb4661-426a-46be-aefc-7e04483cebc0.json delete mode 100644 data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v2/236976b3-af46-45ac-a8a5-f5897e3468a1.json delete mode 100644 data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v-0.1.0/fd175296-a5f6-4914-80e9-b8b75bc659de.json delete mode 100644 data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v0.1.0/d910bbaa-d55c-4b00-9320-856a8a6713c0.json delete mode 100644 data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.3/99a5f123-5d2e-469b-884e-c9a64c6bc197.json delete mode 100644 data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.4/ed17a715-f0ae-461c-9618-ac952c450ec5.json delete mode 100644 data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-instruct-v0.5.0/3dd2a474-9ea8-4e26-8986-5bcc67c78c39.json delete mode 100644 data/hfopenllm_v2/NAPS-ai/naps-llama-3_1_instruct-v0.6.0/b39e14a6-c05f-4e88-b2d4-63a199aa61a1.json delete mode 100644 data/hfopenllm_v2/NAPS-ai/naps-llama3.1-70B-v0.2-fp16/39893637-552a-48d8-9b83-433415eb26c3.json delete mode 100644 data/hfopenllm_v2/NCSOFT/Llama-VARCO-8B-Instruct/f9549713-f487-4e26-bfeb-ec6d394b7014.json delete mode 100644 data/hfopenllm_v2/NJS26/NJS_777/02579c41-f117-4412-9c00-ee7db3e9ab97.json delete mode 100644 data/hfopenllm_v2/NLPark/AnFeng_v3.1-Avocet/bfa1d761-00aa-4438-a5de-972d934c63d5.json delete mode 100644 data/hfopenllm_v2/NLPark/B-and-W_Flycatcher-3AD1E/20a84d88-05c2-4e02-8c84-2afa84cc659f.json delete mode 100644 data/hfopenllm_v2/NLPark/Shi-Ci-Robin-Test_3AD80/84eedce3-3a93-4630-b914-aa281fd2efda.json delete mode 100644 data/hfopenllm_v2/NTQAI/NxMobileLM-1.5B-SFT/b3b7b62f-ac82-4ef9-9634-afb81645ec19.json delete mode 
100644 data/hfopenllm_v2/NTQAI/Nxcode-CQ-7B-orpo/283c5166-b9c5-4d20-9653-0cd0346d87c1.json delete mode 100644 data/hfopenllm_v2/NYTK/PULI-GPTrio/478b54cd-6410-41e5-8a53-4e46bcd9d7af.json delete mode 100644 data/hfopenllm_v2/NYTK/PULI-LlumiX-32K/de2ae7a9-93eb-4149-b3ff-b5b7dfba29c4.json delete mode 100644 data/hfopenllm_v2/Naveenpoliasetty/llama3-8B-V2/ef5aa9db-804b-4a53-9c22-9c99f6c69eeb.json delete mode 100644 data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-Instruct/553fd36d-08dd-46a3-ab04-77b9039e7921.json delete mode 100644 data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-sft/e2bae853-cc0f-456a-a635-98d5f87ac47c.json delete mode 100644 data/hfopenllm_v2/Nekochu/Llama-3.1-8B-German-ORPO/d6c5f196-c97b-4a0a-81b0-59143ec4b10e.json delete mode 100644 data/hfopenllm_v2/Nekochu/Llama-3.1-8B-french-DPO/5d92e02f-b590-4b6b-8c64-30690f79e916.json delete mode 100644 data/hfopenllm_v2/Nekochu/Luminia-13B-v3/e10f38df-b5d5-47c6-924f-563c6f8a6616.json delete mode 100644 data/hfopenllm_v2/Nekochu/Luminia-8B-RP/27257dc9-750c-4673-8865-986434bc5c0e.json delete mode 100644 data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-12B/e599f3f8-e5eb-4bfe-a102-efc5a967434d.json delete mode 100644 data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-8B/8e56f2dd-49d0-4eff-beea-53d01cd96f0e.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Dolphin3.0-Llama3.1-1B-abliterated/f1a2b5d0-2c8a-4bbc-8bc5-0484485c2dad.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_Prev_v1.0/2c12ee67-0c77-4cb2-9e88-1c731ed55c3f.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0/567f8f54-225f-4d9b-be06-f24091adc1e6.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DobHerWild_R1_v1.1R/ebb59730-9522-4c45-8f42-c0d941fd728c.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.01/2c44fa8c-ebd3-4ea6-8578-61da38965c09.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.02/3ef26b8c-6bfb-457b-a160-a65c3cc8b0c6.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.03/0ab721ba-fbda-44ca-a349-1d3abfaabe62.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.01/2fea1128-4f0c-40d8-be87-72c42c0648fb.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.02/db9dc9d2-4aa2-43d0-9f2e-15fbd05af62c.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.03/28399fd0-840c-49d3-8179-407ed83d3bfc.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.10/d7108c13-e14a-4366-9a39-204f853b1bee.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.01/56152d05-9273-4701-8c0a-723e2cab618d.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.03/55d2f23d-cb6c-42d2-8b57-837451d3c6df.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_V1.01/7479ae87-e795-4e20-848a-291614176def.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolerstormed_V1.04/04ceb40e-bde8-487b-9d29-dc8f681af9be.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedash_R1_V1.04/e26b00b0-d9df-4ce2-a649-b19f8957b8ce.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.01/9954194c-69b5-4eb4-8b32-859845548cb0.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.03/2afbc279-242a-4276-85f0-facd29c2d89b.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_V1.01/ba307ad4-3647-4785-9bf1-cd4dacf3c71f.json delete mode 100644 
data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Mediver_V1.01/d03c73ca-7364-4517-aea4-f0ac564c49df.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Medusa_v1.01/1dd4b82a-ca80-4c9c-8800-f97ab2b9cbe7.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_0.2_R1/f2363099-c39a-4874-bf77-ccc0fa087680.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_V1.01/596eeee8-3600-4f8a-8888-978b610eb2ca.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Stormeder_v1.04/595ddba1-c450-4b69-85b7-0e3118c8c6c7.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Typhoon_v1.03/64890314-bba0-4fb2-8c21-38b413cff4c8.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.1/470b8b0d-fbaf-408c-a28e-57d1b294f8a8.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.11/00a1579e-8636-4eca-9a63-c0b067a5f3dc.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Dolto_0.1/a52cc4c9-6d60-4083-ac77-591e247d86c9.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1.01/ac5c321a-d35a-4e0f-a1be-bcc0b7109f91.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1/c4d11b01-ae5b-4198-b102-07160f100a41.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OpenTree_R1_0.1/19405ead-2263-4613-8053-43beeafb4bfc.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OrcaSun_V1/6c698a60-a813-4be7-b55f-b684029b492d.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_RandomLego_RP_R1_0.1/b67c4a44-7787-45e2-b88c-5d7e8e496fa3.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_SunOrca_V1/a20a529e-c52e-41b7-a8ee-909167048bfb.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Sydonia_0.1/2735e6f4-839f-4ab1-8ede-3447891b1b26.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Syneridol_0.2/e74e7e7f-8550-4cba-97cd-2626c82d6b29.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.1/14f4c00d-8915-413d-8e85-79f395127682.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.11/9119b586-d3b2-4ce0-a243-d584e2087184.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v1/629f3f1a-f8ee-4d1b-b604-7bbd35c6517b.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2.1/a6ac828c-904b-413a-a5fa-a5ed06a28143.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2/251a3ef9-c7ae-4d79-8a60-4bc021a3f001.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_Halo_0.1/962b48a3-23d7-4104-b34d-4e5c2af31d58.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_MagLight_0.1/e4b0be31-6f9a-4a57-b433-e561da9bd827.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Qwen_2.5_3b_Smarteaz_0.01a/9a31f208-b7d8-4baa-b96e-99926ecb35af.json delete mode 100644 data/hfopenllm_v2/Nexesenex/pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL/8d933df1-60cb-471d-bfc3-b11c93150203.json delete mode 100644 data/hfopenllm_v2/Nexusflow/NexusRaven-V2-13B/35315c3a-ec06-433a-b3fa-ae7a4a59b7ea.json delete mode 100644 data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-1epoch/3530db9a-0d61-4cf8-9fff-b15f6488c845.json delete mode 100644 data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-dolphin-r1-200/7d9901e0-eafe-4d49-a5bb-fab059708bcb.json delete mode 100644 data/hfopenllm_v2/NikolaSigmoid/DeepSeek-R1-Distill-Qwen-1.5B-500/ee7f9025-bb2c-4902-b8e2-bfac2b63d2fd.json delete mode 100644 
data/hfopenllm_v2/NikolaSigmoid/acemath-200/6157f79e-2673-4ad6-99d7-e5cf5e4e1db2.json delete mode 100644 data/hfopenllm_v2/NikolaSigmoid/phi-4-14b/0aa7572c-1aa6-4997-a2a2-3b557fbde639.json delete mode 100644 data/hfopenllm_v2/NikolaSigmoid/phi-4-1steps/6f5df760-2d3e-47b1-b55e-4031a5f11d41.json delete mode 100644 data/hfopenllm_v2/NikolaSigmoid/phi-4-300steps/ac676b03-c3ce-4ff1-83fc-5c8db82f1497.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Captain-Eris-BMO_Violent-GRPO-v0.420/2229cdf8-3ecb-4f11-8824-9c3bfbf6f968.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Captain-Eris_BMO-Violent-12B/95ebc5b8-a541-4fca-9e7c-692720e73362.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-GRPO-v0.420/09a2508d-a171-493f-9ff2-e7f375815c91.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-V0.420-12B/12a4a921-5859-4fd6-9d64-677a7d8ef696.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Captain_BMO-12B/b79f12d0-cdfc-4c9d-a88b-40612dcbf64d.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Hathor_Stable-v0.2-L3-8B/d162cf7c-3ef4-420f-aab4-789a98b1195a.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Hathor_Tahsin-L3-8B-v0.85/7e49018e-5e2d-4cdb-be5b-2ac04ec84bf5.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Nera_Noctis-12B/24677f2a-ea89-4289-bcb6-13699de9782f.json delete mode 100644 data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.1/3e09df3c-2224-4a29-8e55-18a485db2b25.json delete mode 100644 data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.2/cc0bd236-8fc4-43d3-a18f-4b2afb112946.json delete mode 100644 data/hfopenllm_v2/Norquinal/Alpha/5afd4c0f-b61d-452f-8c48-d298780d91d5.json delete mode 100644 data/hfopenllm_v2/Norquinal/Bravo/eac52141-4fd8-4e21-9c78-920ab8933e5a.json delete mode 100644 data/hfopenllm_v2/Norquinal/Charlie/8449837f-64ac-4293-b1f8-210e62779202.json delete mode 100644 data/hfopenllm_v2/Norquinal/Delta/ab8a665c-8234-484f-a8a9-8ee79d73edff.json delete mode 100644 data/hfopenllm_v2/Norquinal/Echo/a954242f-41a6-49d7-a71d-3bfe940cdb92.json delete mode 100644 data/hfopenllm_v2/Norquinal/Foxtrot/6d1c518f-3f42-49eb-9208-b30e27e7e87e.json delete mode 100644 data/hfopenllm_v2/Norquinal/Golf/87931db7-42a4-48df-b5a5-8bd934061dbe.json delete mode 100644 data/hfopenllm_v2/Norquinal/Hotel/54088dbc-04cc-4b35-b4e1-e495b7cfd47f.json delete mode 100644 data/hfopenllm_v2/NotASI/FineTome-Llama3.2-1B-0929/7129efad-8ab2-4f7a-b6ed-055989b3e131.json delete mode 100644 data/hfopenllm_v2/NotASI/FineTome-Llama3.2-3B-1002/cfc6f85f-e4b6-4164-b7eb-4efb888e1ba5.json delete mode 100644 data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-1B-1007/0f053a45-cd79-4e51-9b4c-ae5c51006c17.json delete mode 100644 data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-3B-1007/d8002b35-1454-4635-a31e-b419c7000b53.json delete mode 100644 data/hfopenllm_v2/NousResearch/DeepHermes-3-Mistral-24B-Preview/4c08530e-d529-49a1-a3fe-2351c422981a.json delete mode 100644 data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Llama-3-8B/d16879dc-7ed7-49c4-aca6-4c9cd3b3a350.json delete mode 100644 data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Mistral-7B/70656b13-e0a2-4ef4-af43-0d9995d57af6.json delete mode 100644 data/hfopenllm_v2/NousResearch/Hermes-2-Theta-Llama-3-8B/6544f1ca-02a6-4e58-98f0-e19cc6082682.json delete mode 100644 data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-70B/5cd3796f-fb31-49c1-a974-019c5c5b20ae.json delete mode 100644 data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-8B/49eff9ad-90c9-43b1-a1f5-cf371ac4b39b.json delete mode 100644 
data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.2-3B/59720f7e-7e09-483f-8332-8dc7aa19ae78.json delete mode 100644 data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/a3a89e4a-0589-4776-a1da-227552482e94.json delete mode 100644 data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/b3c04d1f-80e3-4d86-9779-c5e4bbce6f35.json delete mode 100644 data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT/448fda35-bfdc-42ae-90f9-d44383e0a454.json delete mode 100644 data/hfopenllm_v2/NousResearch/Nous-Hermes-2-SOLAR-10.7B/0d97542e-82b6-4f27-9822-62b67e7690c2.json delete mode 100644 data/hfopenllm_v2/NousResearch/Nous-Hermes-llama-2-7b/2725bd69-839d-4427-8e05-0e289fff70de.json delete mode 100644 data/hfopenllm_v2/NousResearch/Yarn-Llama-2-13b-128k/adb71488-adb8-4848-bf1d-aecd04cb6718.json delete mode 100644 data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-128k/c7736577-c4c3-4233-9308-a4bb9b2dbb89.json delete mode 100644 data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-64k/76fe52f4-9fa5-4ccb-8c92-7bd9eb9886ee.json delete mode 100644 data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-128k/1d92e45f-c5a5-4dd6-a61f-8e0f7246117a.json delete mode 100644 data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-64k/5e1513f1-4375-4380-85fa-b96a419c013b.json delete mode 100644 data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-32k/fadbf3b2-283a-4f8e-9acf-463d75924b97.json delete mode 100644 data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-64k/c04ffe5b-c313-4249-83bb-bbe07ad6fc69.json delete mode 100644 data/hfopenllm_v2/Novaciano/ASTAROTH-3.2-1B/a9aa164e-386b-4987-9f49-2dde64ade45c.json delete mode 100644 data/hfopenllm_v2/Novaciano/BLAST_PROCESSING-3.2-1B/e4c1b3ef-e1db-4eca-b818-f3b1680cc5f0.json delete mode 100644 data/hfopenllm_v2/Novaciano/Cerberus-3.2-1B/1ab95edc-ea3c-4d3f-9f59-dc7f7468adb9.json delete mode 100644 data/hfopenllm_v2/Novaciano/Cultist-3.2-1B/80a81bbc-6edf-48b9-afb7-e4e0a03753d8.json delete mode 100644 data/hfopenllm_v2/Novaciano/FuseChat-3.2-1B-GRPO_Creative_RP/afb24bf8-3c47-4278-9b84-19b05017745b.json delete mode 100644 data/hfopenllm_v2/Novaciano/Fusetrix-3.2-1B-GRPO_RP_Creative/4f8cda4d-959b-41ab-a79d-d2b35968eb89.json delete mode 100644 data/hfopenllm_v2/Novaciano/Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP/2818aa8c-5c73-4de9-bcbe-fd8f68e8bc6b.json delete mode 100644 data/hfopenllm_v2/Novaciano/HarmfulProject-3.2-1B/6a683ead-0f3e-449b-9ae1-8afc9f1ab33d.json delete mode 100644 data/hfopenllm_v2/Novaciano/LEWD-Mental-Cultist-3.2-1B/38cb02a8-862d-40e1-922a-e65f537df87e.json delete mode 100644 data/hfopenllm_v2/Novaciano/La_Mejor_Mezcla-3.2-1B/f816e2a7-2629-4abe-9ed0-3d1299e95194.json delete mode 100644 data/hfopenllm_v2/Novaciano/Sigil-Of-Satan-3.2-1B/286fae5b-544a-4033-9092-d633fc80f47b.json delete mode 100644 data/hfopenllm_v2/NucleusAI/nucleus-22B-token-500B/93477bf6-ea00-418b-8a2f-975a9554263e.json delete mode 100644 data/hfopenllm_v2/NyxKrage/Microsoft_Phi-4/3d7c6576-f99c-4bb3-94fa-4f713e2898f6.json delete mode 100644 data/hfopenllm_v2/OEvortex/Emotional-llama-8B/d1e9a242-941f-4461-b75b-7043c2c01ef7.json delete mode 100644 data/hfopenllm_v2/OEvortex/HelpingAI-15B/e39661af-ad93-41d7-8892-1230064f1a1c.json delete mode 100644 data/hfopenllm_v2/OEvortex/HelpingAI-3B-reloaded/595b61b2-5220-48f6-91a0-3aa0d37c63d8.json delete mode 100644 data/hfopenllm_v2/OEvortex/HelpingAI2-9B/3173263e-2a42-4e8d-956e-8175ef464e76.json delete mode 100644 data/hfopenllm_v2/OEvortex/HelpingAI2.5-10B/f77f8291-1573-4fb6-a984-1cc099c09621.json delete mode 100644 
data/hfopenllm_v2/OliveiraJLT/Sagui-7B-Instruct-v0.1/c4681e14-513c-4e5e-af8c-88ca11849176.json delete mode 100644 data/hfopenllm_v2/Omkar1102/code-yi/0c220edd-2563-4fec-99a4-ef8c210ca5ce.json delete mode 100644 data/hfopenllm_v2/Omkar1102/code-yi/bd7ef5a7-aa75-4eb4-8860-aec63f8bf9d1.json delete mode 100644 data/hfopenllm_v2/OmnicromsBrain/NeuralStar_FusionWriter_4x7b/85c20522-03c0-4dac-a1c8-2945e4bf0e0e.json delete mode 100644 data/hfopenllm_v2/OnlyCheeini/greesychat-turbo/f180fddd-077f-43f9-b2d9-38c5f33be44d.json delete mode 100644 data/hfopenllm_v2/Open-Orca/Mistral-7B-OpenOrca/ef384329-8406-4767-ac1a-3eba3131f726.json delete mode 100644 data/hfopenllm_v2/OpenAssistant/oasst-sft-1-pythia-12b/2ddeae27-77d3-413c-a6e1-9de0f3980c4e.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-falcon3-10b-v24.2-131k/38b2dbbe-be86-4ef0-a39b-89841f662141.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-70b-v21.2-32k/999a8091-22bd-4c08-bee1-772202e7edde.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.1-8k/fda91d98-d259-430c-929b-78852cab64ec.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.2-32k/535bfa4f-ab63-4832-9f17-7b245ff2b2af.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-70b-v22.1-131k/681a6cc5-5519-4b13-8b50-93adcab4a3f7.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.2-131k/141dd12c-6901-4a96-a051-f35647ddcc73.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.3-131k/5b095779-aacc-41f3-9a3f-83f64a1c0d4c.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-1b-v23.1-131k/7a88c95a-b253-4f36-8fde-1b0158bbf0b6.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-3b-v23.2-131k/7938a00e-4e11-4223-a900-fa53df168ab7.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.3-70b-v24.1-131k/8f966b4e-1baf-445f-9f10-4ba6b47aaf9b.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/a334d998-21a5-4108-96e3-9935507a9f8f.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.1-131k/941e27c6-81da-4ce1-b1c8-544c1426cd11.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.2-131k/e409a374-685b-482d-82e4-2436dca37309.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.1-200k/84713625-97b6-4fad-982d-41b5c500d73a.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.3-200k/b7edd9ab-a018-4b2f-9b01-b56cbe98abda.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-7b-v23.1-200k/ec896115-21ef-4337-9fdd-32a04c574a05.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.1-200k/d8e5f49b-7bf3-41d4-a91e-c566219609f6.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.2-200k/ce1a92a3-6bec-410f-ab42-c567c5d23856.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k/0a125470-b50f-4ca0-90dc-1f6b69c3ccd4.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-zero-14b-v22.3-32k/aeee0165-ac7e-4da6-8102-ba60f43587de.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-zero-3b-v21.2-32k/b47b8666-2556-45df-ba5b-9a5e94186784.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-zero-56b-v21.2-32k/0bde5d57-39be-4497-a2a8-d08d3c8d65f4.json delete mode 100644 data/hfopenllm_v2/OpenGenerativeAI/Bifrost-14B/86599961-3ec2-4837-89a4-809f1dd7226c.json delete mode 100644 
data/hfopenllm_v2/OpenGenerativeAI/Bifrost/dc3ca25e-41b2-4206-afaa-7d2d10fd27a7.json delete mode 100644 data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-human-data/cd77d407-3be3-4b84-8a73-34a15744de93.json delete mode 100644 data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-v1.1/1cd20db5-0225-4724-b1f9-7c32eae456e1.json delete mode 100644 data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct/dfc45dc3-51e6-454b-aee9-ea6b0714f0ca.json delete mode 100644 data/hfopenllm_v2/OpenLLM-France/Lucie-7B/3da2a408-672c-47b8-be32-61f56a15e9f3.json delete mode 100644 data/hfopenllm_v2/OpenLeecher/llama3-8b-lima/94700c3c-f18d-4f96-a794-65bcf483fca9.json delete mode 100644 data/hfopenllm_v2/OpenScholar/Llama-3.1_OpenScholar-8B/6f3481d4-076f-45bd-8564-d485109c7a63.json delete mode 100644 data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2/9f5ca3b2-747a-4fd0-b382-bf7ef503ba25.json delete mode 100644 data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored/f1932041-263a-4841-9c8b-c6cc9fa50c21.json delete mode 100644 data/hfopenllm_v2/Orion-zhen/Qwen2.5-7B-Instruct-Uncensored/691bef38-bc9e-4f8d-b774-9d7c62eec72b.json delete mode 100644 data/hfopenllm_v2/Orion-zhen/phi-4-abliterated/5795f693-9ebc-47c6-9d2c-185dd0d32044.json delete mode 100644 data/hfopenllm_v2/P0x0/Astra-v1-12B/eb83f474-0d3d-488c-bc0f-93e5d1dfb2f3.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/L3.2-Instruct-Thinking-v0.1-1B/f93b2053-11c4-4868-860f-90fbfe8288fc.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-Instruct-Interleaved-Zeroed-13B/8984fe95-9fd3-48ff-aa5f-18df63ecd6bb.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest-8B/a0f6f5de-578c-4290-85b5-c51aed985074.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest2-8B/8ccc76ff-25c9-4706-b6a8-31b49f8be813.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B/924f8b31-506d-4df2-8a7b-d0cd66d55f6d.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B/8e7dfd9f-350d-406c-811d-453f1744dd53.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B/b713d1d2-351f-43a1-b77d-27723e1d4267.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMixBread-v0.1-3B/322a9442-174f-4223-b839-6f8f9664d5e5.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/Qwen2.5-RomboTiesTest-7B/b12e71d1-c435-4172-a28f-38e26791dadb.json delete mode 100644 data/hfopenllm_v2/PJMixers/LLaMa-3-CursedStock-v2.0-8B/ad33b0e8-39c8-4118-81bd-bc86b482f122.json delete mode 100644 data/hfopenllm_v2/Parissa3/test-model/db8a7864-293b-45e9-995b-5301071c902d.json delete mode 100644 data/hfopenllm_v2/Pinkstack/PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B/31e3beea-28dc-4b47-a5e9-5fafc89226db.json delete mode 100644 data/hfopenllm_v2/Pinkstack/SuperThoughts-CoT-14B-16k-o1-QwQ/49315a95-394f-4508-8e6c-7c1d5547c257.json delete mode 100644 data/hfopenllm_v2/Pinkstack/Superthoughts-lite-1.8B-experimental-o1/375d3a94-97af-47ef-82af-afd7581663d4.json delete mode 100644 data/hfopenllm_v2/Pinkstack/Superthoughts-lite-v1/77cfe896-4aa1-4bcd-a39a-f437c3f7e738.json delete mode 100644 data/hfopenllm_v2/PocketDoc/Dans-Instruct-CoreCurriculum-12b/3d69ec7d-9999-4e16-8dc9-99fad35e156e.json delete mode 100644 data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.1.0-12b/d2a7459b-8a12-4529-b978-c7237979f16b.json delete mode 100644 data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.2.0-24b/e7a228ad-69de-471a-9f31-6bdc7221999c.json 
delete mode 100644 data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-v1.0.0-8b/9196ae39-adb0-4d53-8399-0ccd4d628065.json delete mode 100644 data/hfopenllm_v2/PocketDoc/Dans-SakuraKaze-V1.0.0-12b/ea318f99-a1ab-41ed-ae5d-39c62ac40e1b.json delete mode 100644 data/hfopenllm_v2/PowerInfer/SmallThinker-3B-Preview/05f69fd6-a77e-478d-ad86-3e83e615e892.json delete mode 100644 data/hfopenllm_v2/PranavHarshan/LaMistral-V4/5b8e9508-befb-4674-bd84-9c722a0864ce.json delete mode 100644 data/hfopenllm_v2/PranavHarshan/MedNarra-X1/8beb3730-23e8-4b89-933d-2d3f1a1d1365.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Appended/07417712-1933-4920-8964-67ba74bf6d01.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Interleaved/ae4cc05d-a65a-4f18-a99c-f133603686d1.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_32K-PoSE/54df4d3e-0ef0-4e30-aa46-b47a4589a34c.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Appended/a717d466-9157-4991-8459-f39847d914a2.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Interleaved/15a8789b-27de-49d1-b3e5-9b1fc9b5694e.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Appended/921562fe-cc21-4ff3-93de-a62e1d4bf7e7.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Interleaved/863969d9-e567-43cc-a0a9-7f80eaba374a.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_9.86B_44Layers-Appended/2987fa45-363e-4a07-8e9f-db01586a135b.json delete mode 100644 data/hfopenllm_v2/Pretergeek/openchat-3.5-0106_Rebased_Mistral-7B-v0.2/3488de21-d9a6-49e8-ba8f-d9beee9bdabe.json delete mode 100644 data/hfopenllm_v2/PrimeIntellect/INTELLECT-1-Instruct/0cacf042-6b62-4b67-8821-97cd703788d0.json delete mode 100644 data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/9f0dfceb-1332-447a-bf6f-6c6c40686a6f.json delete mode 100644 data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/c1308f95-6d55-4ff6-b14e-1bd09b467d99.json delete mode 100644 data/hfopenllm_v2/PuxAI/LUA_model/4ab16120-8d39-4dea-aa76-5c249506848d.json delete mode 100644 data/hfopenllm_v2/PygmalionAI/pygmalion-6b/f9647ea0-6464-4aa0-b1ea-a994a7bcca3c.json delete mode 100644 data/hfopenllm_v2/Q-bert/MetaMath-1B/c5ef47ab-2e73-43d6-b9ea-1ee7e50d9df8.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/1up-14b/9ef7a4a0-b751-45ff-ab1f-d50687a3f4c3.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Adamant-14B-sce/8b303795-557b-4fa1-bbc6-d36bd77ee739.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Alice-14B/7fec288e-0b0d-45c0-b0e6-17b905cd7ea3.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Alien-CoT-14B-sce/5a09783b-82da-43ae-a607-2cfea550d931.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Aura-8B-Linear/6c2d191a-a2d1-459c-b2e2-5766bec62ce7.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/121cb5fc-2fa2-4718-b325-c40014802e40.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/8bbfa040-b16e-4116-ad3e-b3e4e58a7de6.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Charlie-8B-Linear/c8891914-c9fb-4b4d-9592-826f04520e7b.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Chromatic-8b-sce/e77ffcb3-c7d8-4700-b4ea-fe4e5ba94223.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/CoT_Phi/da237415-f34e-4cbb-9a94-3ff621f3df8d.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Dyson-14b/479f3bfa-d614-46a9-88c7-9891852b0d8c.json delete mode 100644 
data/hfopenllm_v2/Quazim0t0/Edu-14B-Linear/f5f0c7da-fb03-4023-81a7-801b0729a19d.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Fugazi14b/40f51424-2922-498d-bbbc-d500667a8554.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/GZA-14B-sce/4f25d177-6bcf-4864-87a4-1beb21a7373d.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Geedorah-14B/b160ab1f-be6b-4dfa-8fa9-36fc65a64782.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/GivingTree-8b-sce/d497a7e3-11c2-4e0c-8788-091caabede56.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/GuiltySpark-14B-ties/4a55bcf2-e1c1-4fce-8f79-472dae869b26.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Halo-14B-sce/5b00dd5e-0ad3-4ea0-aa0d-2327d610e6a6.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Heretic1.5b/1c80d383-1ccb-4f32-a63d-dd3954fe5f6b.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Hyde-14b-sce/75065074-7ef6-41ac-be7c-496cc458640a.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Imagine-v0.5-16bit/49a0287b-48d7-44db-bf20-a084919d332f.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Imbue-14b/7b2861ee-58f9-4ac9-99ee-2ec663e1b157.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Insom/628542f9-fac6-42a7-8ec5-5cd93f977a7e.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/InspectorDeck-14B-sce/5b0924ae-cf52-4245-a687-91e4b1742c16.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Jekyl-8b-sce/459c2b98-c3af-4334-a4bc-13334efe49b8.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Jigsaw-14B-Linear/b2780aa3-d299-4180-8441-dd54e94255cb.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Katana-8b-sce/f55d398d-0555-4e89-a37c-def04741a0dd.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Knot-CoT-14B-sce/63caf8f8-9e55-4ef6-ae76-ee7184a50675.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Lineage-14B/f82ccde3-bd3b-499c-8b8c-182822392cea.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Lo-Phi-14b/8a52fb4a-d6ae-4c8d-aed0-2137e0a83ea1.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Loke-14B-sce/b7cbc2fb-2c52-4c13-9266-52103421f2ee.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/MFDOOM-14B/f4474361-e897-4dbb-a89e-5451a4724474.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/MFGRIMM-14B/de257b5e-4629-4f8a-b08d-d2ca372593e2.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Math_Phi4_Reason/a37aada3-104a-488a-898f-245ff257de46.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Mithril-14B-sce/d9d655d1-d94c-483a-a3a2-ca196e1391d1.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Mononoke-14B-sce/77bf7126-0cb9-43ef-8d23-5f1395f91642.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Motion-8B-Linear/73f410be-3084-4994-8406-f8ac70880626.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Mouse-9B/24caad7a-15fa-4820-91cc-0f544a34d173.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Nova-14b-sce/e087b221-f813-4688-8d98-17980f98ac5b.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/NovaScotia-14b-stock/f4d03bff-3b34-497f-a17f-0379bc562f11.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/2ca21612-ea90-41f3-b618-3ea81c09c3ae.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/d4dc2088-9911-4966-afe9-022df89dd522.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Oasis-14B-ties/ad03a075-8f24-46f6-ae04-5a04eb7061c1.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Origami-14B-sce/2d1da226-e65c-48a0-aabb-46b1cf670a82.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill.16bit/7fb3a035-2b83-4a58-818f-16fe6d9a8ab3.json delete mode 100644 
data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill_v1.5.1-Tensors/87018726-9f81-47b1-883e-609afea7fb37.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Phi4Basis-14B-sce/292b9333-96c7-4fc7-bf35-78bbce9f10d3.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Ponder-14B-linear/b44224c3-ed2c-4120-9e2a-e6286358a4da.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/RZA-14B-sce/f7a2c9af-c55c-4307-bfef-1ca709525d82.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Rosemary-14b/d9655f35-edfd-4c53-b359-559870e8019e.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Rune-14b/afdd962d-652a-4395-92f7-c16dc874a779.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/SZA-14B-sce/2594e917-3ebd-428b-8f36-cb0da668695d.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Sake-20b/91a86644-ad96-4c66-8691-1c0b531b572c.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Spok-14b-sce/331f56ce-5e45-46d8-9143-3f66be20b699.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Sumatra-20b/6138ebe0-8483-4cfb-8d95-b334bb09e831.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/SuperNova14b/4d16dd47-42d1-4ea6-8f1b-dc50648bceab.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/TB0-8B-sce/a6b0f2bf-08da-472f-b858-8be967a44cdc.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/TBL-8B-sce/57c7553d-f3e5-4a31-8c16-66aae570d8ec.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/ThinkPhi1.1-Tensors/58c31bdd-f86f-4fbb-8549-191bb9f46f02.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Venti-20b/dd25c1dd-0edf-44ca-b18c-633dbd47368f.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Venti-Blend-sce/2a030613-b5f7-4393-ac39-d2d072c913dc.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Vine-14b-sce/f8c73290-c400-4f1f-a00a-516592497b0d.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Wendy-14B/b31908fc-5e7e-45d6-835f-4e86a05b23fb.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Wu-14b-sce/4320cb98-7f9f-4510-bb88-448ce231bae8.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/bloom-14b-stock/28b986d1-2e67-4462-9165-6cb8f260b6c6.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/caramel-14B/fe1e21cb-7934-4022-a74a-777172310021.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/graphite-14b-sce/90871638-b828-484d-8822-95ffceb20909.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/mocha-14B/04a98dfb-8e96-444c-8df4-ed7cf72a26ea.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/mosaic-14b-sce/8c5c22af-f230-4d34-b80d-f42ef27e1675.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/tesseract-14b-stock/f3466a90-541b-4a08-a9c6-d5a79b2299b0.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/time-14b-stock/ef9ee5ae-d92b-4143-af1b-d62a7c3c7fd4.json delete mode 100644 data/hfopenllm_v2/Qwen/QwQ-32B-Preview/859af708-ac37-4749-bc06-73d92338d1f5.json delete mode 100644 data/hfopenllm_v2/Qwen/QwQ-32B/e274380d-e0f7-47c3-afc3-e603e6cecf9e.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-0.5B-Chat/19810be8-ea81-4db5-9854-1830b05a5732.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-0.5B/1258c282-3672-4b42-9d4d-117568e17bf5.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-1.8B-Chat/9b9f6e01-238e-4893-b398-4e1c83c44dfa.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-1.8B/b267621b-dbba-4c4a-bb9f-fa85734d0f59.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-110B-Chat/a7e4e787-8e95-48a0-9d50-53ba9f05cd1c.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-110B/3d39dcab-55df-4ad3-bdc8-03ae684e4390.json delete mode 100644 
data/hfopenllm_v2/Qwen/Qwen1.5-14B-Chat/1b499881-9edb-4626-a919-977393d6bef1.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-14B/84b8970c-6c29-4ee1-93b8-c97e4a7c4950.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-32B-Chat/2e070663-2622-4a8e-bd39-7f0ef9df399e.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-32B/047fa91e-2dc7-4881-8254-3dfbd4a2ff1b.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-4B-Chat/6d73016e-078e-4ffe-b2ae-5b829d1456df.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-4B/0b68b5bd-d22c-4194-9ddf-f22e9181f84d.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-7B-Chat/03d51d90-fd15-42b7-ad5f-c7326cc642a7.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-7B/d3e5c939-c53a-49d6-80cd-34420dbb176a.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B-Chat/ab321358-26f9-4577-a5fb-1f5d4b8784b4.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B/a43aae68-f12c-4a6d-b846-c498cf35f6cd.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-0.5B-Instruct/b84615c0-43c4-49ec-83fe-5d3f8e6026af.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-0.5B/7e687d24-9e12-4ecf-b283-e222efb9473a.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-1.5B-Instruct/4aea143c-28fd-48bb-b911-37ac3fe58220.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-1.5B/34a8daec-bfff-4cf4-9011-0542b30c1d10.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-57B-A14B-Instruct/3e919d7b-53db-41fb-ac93-224e2768b9c6.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-57B-A14B/66becca1-d92b-409f-ab56-44d05cac66fd.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-72B-Instruct/6293b269-7c4c-44da-bd85-e51954c173a1.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-72B/add3b058-e7bc-4b7b-bb98-0d7039979072.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-7B-Instruct/db0b6b3f-e5a9-4367-ab87-e58d5c6ccd81.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-7B/54b055d0-80ae-4bba-b729-bd77b3ec7502.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-Math-72B-Instruct/5c22d0b3-5082-4c6e-865c-71da03cf9378.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-Math-7B/f8e5ee9f-519d-4ed8-bd2a-88897075f401.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-VL-72B-Instruct/b74c3215-7bd5-42d1-9193-f4c9c6a8bec2.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-VL-7B-Instruct/27df1e06-463b-4519-87eb-a1666ad3f98c.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9d975b05-7bee-462d-a33a-afa0d5af94d4.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9ef9135a-473e-43a5-a460-fd3ec50226f9.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-0.5B/c57cae01-328e-447b-8945-e3cd2c4b8a7b.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-1.5B-Instruct/494c86cf-7f37-49d8-8160-b81859552c87.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-1.5B/6de5e76e-4297-4bcd-b06e-f63fa28da0e0.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct-1M/9b10cd14-82f3-4b36-a4be-5092127d68c3.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct/bbd94181-0523-4543-80a7-056b041e03b7.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-14B/e10d8573-e201-460e-a931-49a1b13ceeea.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-32B-Instruct/e2ca9477-2414-4b8a-8d22-68f9ced54ae5.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-32B/831246b8-5433-48e6-ba11-8a4239373106.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-3B-Instruct/8277994c-8bf5-4ece-9f34-4fe9a4310bbf.json delete mode 100644 
data/hfopenllm_v2/Qwen/Qwen2.5-3B/5aabc7c5-eb3a-42e0-8b40-0a08004f6e1a.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-72B-Instruct/cbb73c83-ad94-4973-9bf5-a5e7ca4d1653.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-72B/3ed06a16-d5fe-43d3-a369-f4ed29fb3a5d.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct-1M/fc817789-2f44-4d2b-b40e-2422fe33d104.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct/5e1c8723-7c43-4d8f-8c7c-386c2eb6b9cf.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-7B/b6740747-19ac-4a9c-892f-6556013ddc8b.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B-Instruct/3263ab46-09ae-4c24-9332-b6874d0d0330.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B/a8706a7e-5693-4768-a955-a448549d2e77.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B-Instruct/3c932329-0440-4799-886f-10bc4a5aeb09.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B/b1e42d9d-827d-4109-8d1b-182694033b21.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/0c6f0d92-3ee0-48d7-b3fc-70149911a51d.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/73b07681-8e10-414e-8922-650908f9cf6a.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B/8b1549f8-0602-4538-842c-abe9dca7baff.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Math-1.5B-Instruct/ad395ad4-0f9f-4b49-83c9-b89fa6b6dd89.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Math-72B-Instruct/14c01681-fbef-49c4-b737-a7baaa02d393.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B-Instruct/3ad495c0-da8e-4776-8d05-bc7dce1fe120.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B/0762ca9e-f0d4-408e-9992-e91a10e0e65f.json delete mode 100644 data/hfopenllm_v2/RDson/WomboCombo-R1-Coder-14B-Preview/ec6c1d05-cea7-445c-bed3-9eee1e1ff03d.json delete mode 100644 data/hfopenllm_v2/RESMPDEV/EVA-Qwen2.5-1.5B-FRFR/1fc39812-77fb-4d0c-b9fb-706e94c40afe.json delete mode 100644 data/hfopenllm_v2/RESMPDEV/Qwen2-Wukong-0.5B/fdc3c502-53ad-4bf7-85ce-51eaed72754b.json delete mode 100644 data/hfopenllm_v2/RLHFlow/ArmoRM-Llama3-8B-v0.1/3f74c1c7-f349-4193-95cf-b0033112fea0.json delete mode 100644 data/hfopenllm_v2/RLHFlow/LLaMA3-iterative-DPO-final/36a803da-83ab-4c49-8855-9344aaa7a68b.json delete mode 100644 data/hfopenllm_v2/RWKV/rwkv-raven-14b/df986996-249e-49f9-b074-91e8dcdf62e2.json delete mode 100644 data/hfopenllm_v2/Rakuten/RakutenAI-2.0-mini-instruct/90f007e9-e323-4a82-b276-ac1b928030ca.json delete mode 100644 data/hfopenllm_v2/Rakuten/RakutenAI-7B-chat/2b627f93-5cc7-4a5e-b682-d129396362e5.json delete mode 100644 data/hfopenllm_v2/Rakuten/RakutenAI-7B/2fde07ac-d218-4cc6-947e-8ceb87eedbee.json delete mode 100644 data/hfopenllm_v2/Replete-AI/L3-Pneuma-8B/2a141bfe-4632-4058-a232-1f2c5540c41f.json delete mode 100644 data/hfopenllm_v2/Replete-AI/L3.1-Pneuma-8B/fa2d74a5-e8f6-4a1c-9310-a9b16c2e59d1.json delete mode 100644 data/hfopenllm_v2/Replete-AI/Llama3-8B-Instruct-Replete-Adapted/c7c0ceff-9273-4cc3-8f8e-bd93181590ba.json delete mode 100644 data/hfopenllm_v2/Replete-AI/Replete-Coder-Instruct-8b-Merged/c439478a-1734-4038-aa8b-bb2d12ec022d.json delete mode 100644 data/hfopenllm_v2/Replete-AI/Replete-Coder-Llama3-8B/4a36f73a-9495-4ea2-863c-220b8ca6bf99.json delete mode 100644 data/hfopenllm_v2/Replete-AI/Replete-Coder-Qwen2-1.5b/faa9d3b9-343a-4a9e-82c5-6bc81bc87b9c.json delete mode 100644 data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/a55bf380-d567-4228-b30c-57e9df31e844.json delete mode 100644 
data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/dfd92311-4f3d-4355-8ccf-a59f29914b8f.json delete mode 100644 data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b_Beta-Preview/d98e190e-5b5f-46eb-b701-e32d2dbef3a0.json delete mode 100644 data/hfopenllm_v2/Replete-AI/Replete-LLM-V2-Llama-3.1-8b/32edb764-2a42-4efe-ac86-9eda81942b84.json delete mode 100644 data/hfopenllm_v2/RezVortex/JAJUKA-WEWILLNEVERFORGETYOU-3B/36855ebd-2030-4d5d-9c42-ca049244e694.json delete mode 100644 data/hfopenllm_v2/RezVortex/Jajuka-3b/9651a0a1-4004-42f3-ad8f-2aebb38ec967.json delete mode 100644 data/hfopenllm_v2/Ro-xe/FMixIA-7B-DARE-0/a59e55dc-e2b5-43be-8469-49eee0e98d55.json delete mode 100644 data/hfopenllm_v2/Ro-xe/FMixIA-7B-SLERP-27/a956e306-f184-4dbc-ac7a-3793ae735801.json delete mode 100644 data/hfopenllm_v2/Ro-xe/FMixIA-7B-TIES-1/c05cc6ce-12fd-491d-b41b-57cc14b6d34a.json delete mode 100644 data/hfopenllm_v2/Ro-xe/FMixIA-FrankenMerge-9.5B-PT-9/415875b7-fe10-47e7-aca0-029c2f51c067.json delete mode 100644 data/hfopenllm_v2/Rombo-Org/Rombo-LLM-V2.5-Qwen-7b/c505ee64-3d3b-48e2-9c8a-f59609a758e9.json delete mode 100644 data/hfopenllm_v2/RubielLabarta/LogoS-7Bx2-MoE-13B-v0.2/00003185-c291-40c5-bba1-f87eae0afc08.json delete mode 100644 data/hfopenllm_v2/SaisExperiments/Evil-Alpaca-3B-L3.2/328f61d7-677b-4a06-b464-0da42153f9ae.json delete mode 100644 data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Opus-Instruct/9cb5b8fd-062c-4161-9301-640980d21b9f.json delete mode 100644 data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Stheno-Filtered/09284b75-a2f9-40ea-8135-7aa61c626fa2.json delete mode 100644 data/hfopenllm_v2/SaisExperiments/Not-So-Small-Alpaca-24B/e2502331-6ac3-43bc-8218-259b44333283.json delete mode 100644 data/hfopenllm_v2/SaisExperiments/QwOwO-7B-V1/8dde454d-aa48-4ee1-b5c6-f3353087d492.json delete mode 100644 data/hfopenllm_v2/SaisExperiments/RightSheep-Llama3.2-3B/662c8ed2-2407-4606-ac1e-ec7ade185d2d.json delete mode 100644 data/hfopenllm_v2/Sakalti/Anemoi-3B/332aef8c-7c62-463e-ba3c-07ae0205d457.json delete mode 100644 data/hfopenllm_v2/Sakalti/Euphrates-14B/cfdfcf21-e445-430e-a295-946cb8c3fce9.json delete mode 100644 data/hfopenllm_v2/Sakalti/Llama3.2-3B-Uranus-1/a5606b92-aa2d-44e3-a92c-47d0b38fef9c.json delete mode 100644 data/hfopenllm_v2/Sakalti/Magro-7B-v1.1/465d473c-ef28-4725-8cac-02f2a031b22c.json delete mode 100644 data/hfopenllm_v2/Sakalti/Neptuno-3B/2c636544-8676-4eee-8bcd-d623be0275be.json delete mode 100644 data/hfopenllm_v2/Sakalti/Neptuno-Alpha/8b332fac-1cfa-498b-853a-52ec5492ddc7.json delete mode 100644 data/hfopenllm_v2/Sakalti/Oxyge1-33B/2bf1b38b-e90b-4fa8-b19e-47d93ff9ab4e.json delete mode 100644 data/hfopenllm_v2/Sakalti/Phi3.5-Comets-3.8B/69bb0243-75b2-4858-ba6b-5e70cfb516a7.json delete mode 100644 data/hfopenllm_v2/Sakalti/Qwen2.5-1B-Instruct/4bb7e325-8741-4c09-81f6-9efdb30ef5a5.json delete mode 100644 data/hfopenllm_v2/Sakalti/QwenTest-7/87878b74-22ce-4554-914c-03e486d13de3.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-0.5B/5030f8d4-f216-4f78-84f1-dd03b0324bb0.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha-1.1/c5e244fd-e85e-4fbb-9703-b8e733fb91bf.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha/38261a01-62df-42b2-9b1d-f924598e70ef.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-1.7B/5736f0b5-3903-4774-a84a-c3db260d36e4.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-14B/70134d58-972e-49c9-8cde-4ba2691d3dc3.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-2.4B/d4bb1440-2064-4752-bcb3-c9cec234fd1b.json delete mode 100644 
data/hfopenllm_v2/Sakalti/SJT-24B-Alpha/d9e6059e-d20b-4465-b7ba-2ee3a72562b6.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-2B-V1.1/f8b02d65-c8a0-43eb-b48e-d1e1f7f363d6.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-2B/7bf23db0-877c-4700-95c8-e35dee5e57b4.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-3.7B/07f8351e-c7c6-463f-9e91-ee1d3bb2b35c.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-4B/8535ffae-f39d-46ed-89bb-a1656885db91.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-7.5B/5e832121-9a67-44d9-973d-fffdb1b37975.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-7B-V1.1-Multilingal/92d3f67d-a026-49e3-a440-68c10fb358ae.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-7B-V1.1/9d0baaef-bd31-4a96-bb2a-e92b62b748d2.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-8B-V1.1/489e8e84-5e30-46fa-a421-f52308f051e7.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-8B/a208f807-c930-4e81-8ebd-dcbb4db76442.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-900M/4956539d-a255-4c56-877f-257e463fa3e4.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-Moe2x7.5B/3451eb65-020c-4e34-9128-7410e6b293cd.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJTPass-2/b5cd0061-e4dd-4049-a51e-b16490e69120.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJTPass-4/c4686af6-0b7b-4df3-9152-14a3ef087b7f.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJTPass-5/155885ca-11e7-4cd2-b26c-53e001e2a6f9.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saba-Passthrough-2/d9ca5411-def6-43b3-a522-595131d8e5e6.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saba1-1.8B/e54553ab-0897-4cb5-9213-5bb72758d2b5.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saba1-7B/eed48cdc-18db-4c03-84bf-d2d50e3328b0.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saba1.5-1.5B/d7952aef-37e2-4c15-a1a4-598690773bbb.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saba1.5-Pro-3B/5e1e1376-bb22-4fc9-a1d6-3f2fe7d302b9.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saba2-14B-Preview/cfdae559-f3f1-4a78-b4cc-fbfb8bb37b16.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saba2-3B/a12208ce-e9e1-4476-8054-0d565efad92c.json delete mode 100644 data/hfopenllm_v2/Sakalti/Sailor-japanese/f46e1eeb-8b8b-4d47-9510-445109b5518b.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saka-1.5B/7dc4970f-ce35-4ffa-9052-2ab40abb1e55.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saka-14B/823e886a-1431-4078-81a3-4b941983461d.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saka-24B/583609f0-de5b-43cd-a667-bb2c36679fd2.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saka-7.2B/2d2cea8b-167e-4d63-b01c-537f372672f9.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saka-7.6B/f584f596-3a17-404a-81a2-3033ad38cad6.json delete mode 100644 data/hfopenllm_v2/Sakalti/SakaMoe-3x1.6B-Instruct/ebb0930f-92be-4e1b-a2a6-779f69d2151c.json delete mode 100644 data/hfopenllm_v2/Sakalti/SakalFusion-7B-Alpha/b8926567-e208-442e-8ba8-c6dd4ecc5c4a.json delete mode 100644 data/hfopenllm_v2/Sakalti/SakalFusion-7B-Beta/4bf6efe1-81fc-48f6-96ba-8df9ffbef2f2.json delete mode 100644 data/hfopenllm_v2/Sakalti/Tara-3.8B-v1.1/05ffcb7a-2694-4276-bf45-73e1110bc494.json delete mode 100644 data/hfopenllm_v2/Sakalti/light-1.1-3B/dc3b944b-a57a-44ab-87ac-8e1882b7bcce.json delete mode 100644 data/hfopenllm_v2/Sakalti/light-3B/154f70b4-d77c-4d1b-b85c-bc81fe8162bd.json delete mode 100644 data/hfopenllm_v2/Sakalti/light-3b-beta/998316d2-389a-4ce0-b0b0-0430c1361de7.json delete mode 100644 
data/hfopenllm_v2/Sakalti/light-7b-beta/ce803cde-6e23-433c-a4d2-38c5cb5ba14b.json delete mode 100644 data/hfopenllm_v2/Sakalti/llama-3-yanyuedao-8b-instruct/2519485b-47cd-497c-a349-9e69db0266f3.json delete mode 100644 data/hfopenllm_v2/Sakalti/magro-7B/56d86e26-4ee6-4652-9b7b-a538238a24d4.json delete mode 100644 data/hfopenllm_v2/Sakalti/mergekit-01/416b89e4-5e8a-4131-9403-e8967a4127b8.json delete mode 100644 data/hfopenllm_v2/Sakalti/mergekit-della_linear-vmeykci/347a90e8-d8b7-4266-8242-ceac865796a0.json delete mode 100644 data/hfopenllm_v2/Sakalti/model-3/389f7ab8-b30e-4d0c-b9a4-625e74a1f73f.json delete mode 100644 data/hfopenllm_v2/Sakalti/qwen2.5-2.3B/6ae33b7f-53a1-45c5-8b0b-d462188c3f9d.json delete mode 100644 data/hfopenllm_v2/Sakalti/tara-3.8B/d96fb0b2-7cba-4cc4-a5f4-b8a451754857.json delete mode 100644 data/hfopenllm_v2/Sakalti/ultiima-14B-v0.2/f8d362f6-eafc-4d11-bc40-d169d69d3a95.json delete mode 100644 data/hfopenllm_v2/Sakalti/ultiima-14B-v0.3/4bacd3dd-44c2-42d8-98c0-3eeb920dc0f0.json delete mode 100644 data/hfopenllm_v2/Sakalti/ultiima-14B-v0.4/de073f45-0d14-4f8a-9d3b-d4fd961186b8.json delete mode 100644 data/hfopenllm_v2/Sakalti/ultiima-14B/fd88d234-b3f9-4f48-896c-af58f1a69880.json delete mode 100644 data/hfopenllm_v2/Sakalti/ultiima-32B/273745b1-3761-463e-b9ab-7860968064eb.json delete mode 100644 data/hfopenllm_v2/Sakalti/ultiima-72B-v1.5/101d84d3-e741-4eb2-bd8a-db6c12022fe2.json delete mode 100644 data/hfopenllm_v2/Sakalti/ultiima-72B/9c82deca-1998-4506-b038-c5dd592324d8.json delete mode 100644 data/hfopenllm_v2/Salesforce/LLaMA-3-8B-SFR-Iterative-DPO-R/da620a94-4c0d-4c50-9619-10e12001fb5d.json delete mode 100644 data/hfopenllm_v2/SanjiWatsuki/Kunoichi-DPO-v2-7B/51dade8f-34e7-4237-8691-22655249bf76.json delete mode 100644 data/hfopenllm_v2/SanjiWatsuki/Silicon-Maid-7B/cdd59385-0a54-4ca1-b24d-9316a70f2875.json delete mode 100644 data/hfopenllm_v2/Sao10K/70B-L3.3-Cirrus-x1/514a3103-e8a1-49e8-b9da-a85963f5b3dd.json delete mode 100644 data/hfopenllm_v2/Sao10K/Fimbulvetr-11B-v2/daafaafa-1e00-4433-95f3-91c169598ebd.json delete mode 100644 data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/50e53ad5-8693-44c1-b5c7-45b91d7e0ae4.json delete mode 100644 data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/bda5d02f-7973-41a3-8f8e-4e33a12b74e0.json delete mode 100644 data/hfopenllm_v2/Sao10K/L3-8B-Lunaris-v1/99ff5ca5-4409-4d9c-9ec0-4cf392afeff2.json delete mode 100644 data/hfopenllm_v2/Sao10K/L3-8B-Niitama-v1/362f5875-4dbc-4e68-90ce-789f692bb533.json delete mode 100644 data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.2/fdb5faf6-2cdd-42bb-b154-d6e93b2348bf.json delete mode 100644 data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.3-32K/93f829b8-b8d9-4389-a210-2a38c3a30edb.json delete mode 100644 data/hfopenllm_v2/Sao10K/MN-12B-Lyra-v3/6ec3554d-377b-4bf6-88ef-8a4c9e70f485.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V1-32B/70d749cf-2e92-4847-86de-7964fc8eb990.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V2-32B/623f2b04-6cd7-4ea0-8844-badb0ff6c9c6.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V3-32B/e1aca741-2765-4e47-b6a1-49f3d9532432.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V4-32B/4f42366e-e6aa-4974-9a40-5781e350616d.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V5-32B/4ec2231d-c012-4ad3-830c-8ff86c977202.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V6-32B/1d2e5513-bd0c-4795-8487-f5266c6e368f.json delete mode 100644 
data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V2-27B/104172b7-86f5-410a-a454-63e1cfbeb87f.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V3-27B/d28e04ac-7d18-43fb-80b8-82c0662fec79.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-22B/20bb3819-9d85-4d84-99ba-65e33965f0c5.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-27B/3a4bdf58-0137-4d85-b567-59b3fed3dad5.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Superb-27B/04f843ba-947c-4732-979c-2aeae7d34e5a.json delete mode 100644 data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2.5/173a31d3-7d12-4ab1-a963-005a81aee767.json delete mode 100644 data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2/d0555736-b614-43ca-91d7-8264e3566872.json delete mode 100644 data/hfopenllm_v2/SeaLLMs/SeaLLMs-v3-7B-Chat/4b7b13b7-4aee-4462-87e6-aa6c15068236.json delete mode 100644 data/hfopenllm_v2/SenseLLM/ReflectionCoder-CL-34B/4b1f9ce5-bb12-42e3-b0e0-afaa784b0c4c.json delete mode 100644 data/hfopenllm_v2/SenseLLM/ReflectionCoder-DS-33B/acbcd5a5-bcd8-4209-b35f-425feada7e8b.json delete mode 100644 data/hfopenllm_v2/SentientAGI/Dobby-Mini-Leashed-Llama-3.1-8B/cb9a415f-1a02-46ad-a731-bf825ddd78ae.json delete mode 100644 data/hfopenllm_v2/SentientAGI/Dobby-Mini-Unhinged-Llama-3.1-8B/92cde6db-47f4-43c6-9ad5-643c35faa226.json delete mode 100644 data/hfopenllm_v2/SeppeV/SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo/5e88a037-f9bd-4b39-944f-f0781bb7884f.json delete mode 100644 data/hfopenllm_v2/Sharathhebbar24/SSH_355M/d4b08f5d-5add-49f4-b8db-c1a12e0a5313.json delete mode 100644 data/hfopenllm_v2/Sharathhebbar24/chat_gpt2_dpo/ac5adf39-f0a4-439b-9873-9141e0a554b1.json delete mode 100644 data/hfopenllm_v2/Shreyash2010/Uma-4x4B-Instruct-v0.1/62965c92-cdf4-4a3b-b035-990abaab615c.json delete mode 100644 data/hfopenllm_v2/Sicarius-Prototyping/Brainy_LLAMA/3866ece8-d70a-4061-9e86-0798ecd98bd6.json delete mode 100644 data/hfopenllm_v2/Sicarius-Prototyping/Micropenis_1B/ff484d0e-bb14-4a80-ae29-2351b03cf278.json delete mode 100644 data/hfopenllm_v2/Sicarius-Prototyping/bacon_and_food/06ac1718-fe71-4e05-a47f-1200e067336c.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/2B-ad/4ddb1616-7889-45ef-96de-823fee338e1d.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/2B_or_not_2B/487dd91b-5bc4-4355-90d3-c82ecc789ab3.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Dusk_Rainbow/a74e86d9-8b94-4f60-8f0c-73cc4b04d905.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Eximius_Persona_5B/9a9239ab-9e0e-449b-bd1b-6ec280fad505.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Impish_LLAMA_3B/2c710cd5-75a6-46b7-8356-212da7bf864d.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Impish_Mind_8B/377d5240-73b5-48d0-bbdc-0960ad1d9069.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_14B-1M/9f31a6da-c5bd-4143-b2f9-715c0e9f7b74.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_7B-1M/104a0157-c614-44cf-b6cc-9f15dab4b187.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/LLAMA-3_8B_Unaligned_BETA/bb379093-c169-44bd-ac86-edb8ab8fc225.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Phi-Line_14B/e29001c0-17c0-4deb-8ca2-ce9ad06d8cb3.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Phi-lthy4/43d87bf5-2620-4f8e-a8b6-f86fc157d987.json delete mode 100644 
data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncencored/735d9d75-d9d1-4553-b7cf-f8e7c2e65218.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored/0c6dcc87-343c-4973-a589-3e3393829184.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored_Instruct/7c1d1657-e9ae-433f-be9d-523431bfc7ae.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Redemption_Wind_24B/0b2d9a65-c028-4f4b-a280-dc0c35ac9516.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Winged_Imp_8B/e87e1d3f-1476-499d-a9f3-b6463b429262.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Wingless_Imp_8B/246e8450-3c53-4bde-99bb-5663f751e88e.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Zion_Alpha/496b9e45-2f64-456e-b35e-12a94c5643b1.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/dn_ep02/05890047-a95a-433e-b6b6-fb037592cdd1.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora-epoch1/4a30580c-1d25-49d4-984d-2d28ef3a5656.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora/696d7966-d140-4f43-91df-54f02247b34f.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch3/fdf10ab8-e3f9-49e6-8fd0-ed116868c217.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch5/9ac16d1f-d894-414d-8a14-110e971d0ba6.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch3/2eb01e0e-8f7b-4956-9a2d-b32ecaa936f6.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch5/3b221b0e-6158-471f-bcd2-b09514f28bd7.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch1/c8af8428-aab6-4d19-b185-2b437c0334fa.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch2/c617d12b-c37f-47ef-9704-e19774c67aeb.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch3/577f31e2-1808-45e2-a528-5933019cfa85.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-3B-Instruct/7bd7f5c8-be9e-473e-be18-03ad22a195ee.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000/5036a549-5583-4775-935a-1a12b6de3e7d.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000/5c0ffff9-542c-424e-88e9-89584e686e12.json delete mode 100644 data/hfopenllm_v2/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/5c6a045d-2c90-4938-9185-9c1a0f82903a.json delete mode 100644 data/hfopenllm_v2/Skywork/Skywork-o1-Open-Llama-3.1-8B/02480176-2058-4e71-a970-9698be8d235e.json delete mode 100644 data/hfopenllm_v2/Solshine/Brimful-merged-replete/4be1e5b4-254c-4287-907d-cc845042de37.json delete mode 100644 data/hfopenllm_v2/Solshine/Llama-3-1-big-thoughtful-passthrough-merge-2/21b51852-5cad-414e-92d5-31878f025d67.json delete mode 100644 data/hfopenllm_v2/Sorawiz/Gemma-9B-Base/9eb07d4a-1f01-4696-9137-d477ffca43be.json delete mode 100644 data/hfopenllm_v2/Sorawiz/Gemma-Creative-9B-Base/4236485b-aa92-4bc4-a652-17ed3231ecf4.json delete mode 100644 data/hfopenllm_v2/Sourjayon/DeepSeek-R1-8b-Sify/9c0d6b71-8c6a-4294-961c-972a002b847f.json delete mode 100644 data/hfopenllm_v2/Sourjayon/DeepSeek-R1-ForumNXT/d1e906d5-8f0d-49c2-88c3-cf71774de600.json delete mode 100644 data/hfopenllm_v2/SpaceYL/ECE_Poirot/798e4f83-6262-4d5b-a854-6ff114167209.json delete mode 100644 data/hfopenllm_v2/Spestly/Athena-1-3B/dd2603d5-e99e-4778-95d0-159c788626cf.json delete mode 100644 
data/hfopenllm_v2/Spestly/Atlas-Pro-1.5B-Preview/41c71990-e79d-447f-b082-63c96fd67a1f.json delete mode 100644 data/hfopenllm_v2/Spestly/Atlas-Pro-7B-Preview/b9e25948-2871-4b6c-933b-8a731e48e81b.json delete mode 100644 data/hfopenllm_v2/Stark2008/GutenLaserPi/7c70df74-2bc2-40e0-b0f4-77be1a7e044c.json delete mode 100644 data/hfopenllm_v2/Stark2008/LayleleFlamPi/ea71bdd5-3aa1-4d26-9256-5aeb2f79fa8c.json delete mode 100644 data/hfopenllm_v2/Stark2008/VisFlamCat/b0e9c0ca-cd56-42c8-96ed-477884bfd9f9.json delete mode 100644 data/hfopenllm_v2/Steelskull/L3.3-MS-Nevoria-70b/7395fcde-49dd-47f4-a8ea-463eda40f5e3.json delete mode 100644 data/hfopenllm_v2/Steelskull/L3.3-Nevoria-R1-70b/a130087f-566f-4405-b662-1102f1664c49.json delete mode 100644 data/hfopenllm_v2/StelleX/Qwen2.5_Math_7B_Cot/3be58cf3-4761-4459-9f3c-eabf812a3c19.json delete mode 100644 data/hfopenllm_v2/StelleX/Vorisatex-7B-preview/dbdd71ad-db5b-4b4b-8856-68b55adbe127.json delete mode 100644 data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Instruct/da159a16-48a0-45e3-ad4d-bdc9e8b5288c.json delete mode 100644 data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Reinforced/77d5f51e-5ad2-42a6-a32c-060cd844b949.json delete mode 100644 data/hfopenllm_v2/SultanR/SmolTulu-1.7b-it-v0/724cc582-cc83-474b-9606-70dbc22f3581.json delete mode 100644 data/hfopenllm_v2/Supichi/BBA-123/8a1b2aae-d717-4b49-8ed2-a7ee2cee1940.json delete mode 100644 data/hfopenllm_v2/Supichi/BBA99/0dfb062d-a6ec-42a6-a9f9-6f6424bbdf0c.json delete mode 100644 data/hfopenllm_v2/Supichi/BBAIK29/ab2512fa-2335-4817-9a76-3259690bbc67.json delete mode 100644 data/hfopenllm_v2/Supichi/BBAI_135_Gemma/fe7f1442-b7db-42d5-bc83-b8afd1d0c802.json delete mode 100644 data/hfopenllm_v2/Supichi/BBAI_250_Xia0_gZ/0e14484a-69d7-423e-bf6c-33d0992f408c.json delete mode 100644 data/hfopenllm_v2/Supichi/BBAI_275_Tsunami_gZ/881eaa2c-af5f-4e84-8807-d0835c10ebd2.json delete mode 100644 data/hfopenllm_v2/Supichi/BBAI_525_Tsu_gZ_Xia0/ef8a7079-9d13-42b7-ab2d-b72df5ae5d95.json delete mode 100644 data/hfopenllm_v2/Supichi/BBAI_78B_Calme_3_1_Ties/db8d3fc4-58f4-4f07-8c27-c73a4a4719fb.json delete mode 100644 data/hfopenllm_v2/Supichi/BBAI_QWEEN_V000000_LUMEN_14B/0c44a429-e705-4794-b702-1a731e52df90.json delete mode 100644 data/hfopenllm_v2/Supichi/HF_TOKEN/92b3d2c1-61f4-432a-82a7-43b4367f7ef0.json delete mode 100644 data/hfopenllm_v2/Supichi/NJS26/5703e81d-055c-459b-8202-80ec382a8d5b.json delete mode 100644 data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.0/f6260b6e-52a2-4142-93ba-5393807fa0d4.json delete mode 100644 data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.1/83b84506-4826-48de-a6fe-2af6ae5d425a.json delete mode 100644 data/hfopenllm_v2/Syed-Hasan-8503/Phi-3-mini-4K-instruct-cpo-simpo/7483e260-9853-4d3f-aa10-187796d96de9.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V1-P1/f9925806-4252-44e8-b67e-917737572bd4.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V1-P2/70470e6c-8d66-4249-b762-a5a2e3589a53.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V1-P3/d3abfe3c-ebfe-4dfd-b0db-93c14d32c585.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V2/a35b06bc-d759-421a-94cf-f408a98e9273.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V3/bbac659c-7cf8-41d4-98d4-ded4c471bd98.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V4/0c73f3a0-0a92-4b1c-abfa-6eb77138dacd.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V5/a7ab6f16-717f-4567-8057-a4a18e1a1e77.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V6/2abe2c9d-032d-469e-852b-114eca5e84f8.json delete mode 100644 
data/hfopenllm_v2/T145/KRONOS-8B-V7/2e8a83dc-c760-4f42-a361-e02cf3a65427.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V8/743dfe64-e7cd-493e-817d-8d5fcdc2ea24.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V9/4e37c90b-65a8-4b71-bfc2-d63541fb8962.json delete mode 100644 data/hfopenllm_v2/T145/Llama-3.1-8B-Instruct-Zeus/2e34d74e-1b69-4daf-8bee-77e5357fd439.json delete mode 100644 data/hfopenllm_v2/T145/Llama-3.1-8B-Zeus/0646e2f7-d2e6-42d3-8f09-f8daee302709.json delete mode 100644 data/hfopenllm_v2/T145/Meta-Llama-3.1-8B-Instruct-TIES/c66b1ff8-9c04-4f9c-b83e-088f31f79590.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V10/1bd2affc-9970-4149-b52b-51549b1f0029.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V11/f0479d74-4684-4b41-a63b-16d7fe0e3290.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V12/95deb890-a15d-4c71-8151-ed45c3dfb87f.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V13-abliterated/1c07fc4c-a773-4e03-bb14-7144e7815c01.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V13/e7e8388e-db3c-4881-b67c-5177c60562b9.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V14/c4923208-2a47-45f2-a74a-4483e4b99bee.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V15/b5f06a78-5b57-45a5-93be-4f3c1b36f208.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V16/835f19d3-515c-4bc4-ab96-5cb5bece45dc.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V2/7dd96382-6fc1-4a39-924b-d9034b5b0839.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V4/77a666a2-a9b2-43cc-8e64-67172f4ab6c8.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated/e3eae267-46ab-4433-a8f3-2a2f8448299b.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V17/e31308c4-8eb2-4a72-8127-18049d58b814.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V18/c7098a7a-e865-4ecd-b511-abeb2c0872bd.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V19/b3a8c734-e63a-47f7-af2c-a3b6518802fa.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V2-ORPO/35937965-2791-4f75-8954-5a2280381c91.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V2-abliterated/4ab806fe-738d-4f5b-89e4-004134d2f7fe.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V2/a937e27e-b757-4de7-b679-01ac29d8bb22.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V20/1d906aab-33a6-4ffe-8a63-694482d83d09.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V21/9e101298-6482-4ae8-83e4-b948ba8fa550.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V22/3818710d-80a9-4e7d-90e3-f06afffb71ac.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V23/a18ec0c4-6f3f-4904-b69c-e40770df169e.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V24/529c2bd4-6b8e-4e3c-8737-c0b794444d13.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V25/9e994362-a1d1-48f7-9db1-dd9d532b9f35.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V26/cf35b7db-f675-4362-8916-36b0582b64f4.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V27/79ee7e34-36cd-4024-8978-86c1b059ae5f.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V28/9ec4fb99-ed4d-416e-9342-0c036aadd35d.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V29/8788e4fa-04c5-4f7c-bb4e-523287901f71.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V2L1/18097bf4-5149-40e9-9850-558c3f143ed8.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V2L2/b5942721-5c30-4c49-a6e1-fb5419539652.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V3/76d27de3-0309-4e4b-8d0d-0e402bde0a31.json 
delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V30/5c0553ff-4910-45a9-aa8d-3a76af098403.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V4/fd97d1d9-a1b5-429d-b73d-1ea92ae1d61c.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V6/f77aa103-5a09-409c-ad72-7992b6049f94.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V7/0afdaa1d-c1e7-4283-a2b3-f459c09df4a9.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V8/044ed79b-0c54-4a7a-94ba-a3f999adeb0d.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V9/ac6b884d-62ea-4ff5-8eee-cfce08869030.json delete mode 100644 data/hfopenllm_v2/T145/qwen-2.5-3B-merge-test/8ffa696e-adef-4808-ba0e-bb04921a433d.json delete mode 100644 data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m-hf/8a2cfa62-5f13-447e-8d0f-2503e4962ac5.json delete mode 100644 data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m/4f24fc46-3686-41fa-bf25-a0e39b252cc9.json delete mode 100644 data/hfopenllm_v2/THUDM/glm-4-9b-chat-hf/b1375cb4-b0d5-4cb4-ad43-394ebd1a481f.json delete mode 100644 data/hfopenllm_v2/THUDM/glm-4-9b-chat/4ce062da-acfc-4684-95c2-679cbe5a697b.json delete mode 100644 data/hfopenllm_v2/THUDM/glm-4-9b/3d785765-befa-4e53-8672-769f7bb87dcd.json delete mode 100644 data/hfopenllm_v2/TIGER-Lab/AceCodeRM-7B/ab0d3a24-19db-4d00-892e-bcb7c0f2f30f.json delete mode 100644 data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-7B-Ins-Rule/31f0b186-1805-42ff-86cf-d8455a66d538.json delete mode 100644 data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Base-Rule/ed6b3e7e-d294-420d-b9b9-460a52cd0239.json delete mode 100644 data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Ins-Rule/91dec0c0-9854-4790-a0a5-e17d19636f17.json delete mode 100644 data/hfopenllm_v2/TIGER-Lab/MAmmoTH2-7B-Plus/599616fb-26c1-47e3-a98b-9ad922a95c08.json delete mode 100644 data/hfopenllm_v2/TIGER-Lab/Qwen2.5-Math-7B-CFT/aeee4365-c34d-46b9-8c98-29976010bb62.json delete mode 100644 data/hfopenllm_v2/TTTXXX01/Mistral-7B-Base-SimPO2-5e-7/1ec68708-94c9-4561-bb99-7f211d7a9950.json delete mode 100644 data/hfopenllm_v2/Tarek07/Progenitor-V1.1-LLaMa-70B/0b53e7b4-0e91-40a2-911b-cd0d415e9fad.json delete mode 100644 data/hfopenllm_v2/Tarek07/Thalassic-Alpha-LLaMa-70B/91bcd646-fe3d-458b-a426-a6a8863d69a0.json delete mode 100644 data/hfopenllm_v2/TeeZee/DoubleBagel-57B-v1.0/2e0458cc-e092-4770-bd80-00dff169d754.json delete mode 100644 data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa-2.0/d56ef415-0edf-4fde-8277-ae44b4bb4ed2.json delete mode 100644 data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-7b-finetuned-sft-Navarasa-2.0/a0a1beb8-ee9a-4e88-b939-6e0104ed76a7.json delete mode 100644 data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B-Instruct/f9b7c3ee-ea8b-42f0-a55a-6171d4e3d0ea.json delete mode 100644 data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B/2c8c6c6a-ce95-4d11-a33a-d547859fee11.json delete mode 100644 data/hfopenllm_v2/TencentARC/MetaMath-Mistral-Pro/47858744-3378-4ed4-9101-8acbc3a53cda.json delete mode 100644 data/hfopenllm_v2/TencentARC/Mistral_Pro_8B_v0.1/2aaeaaa7-89ed-4666-b0a5-8c1320ec4ec5.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Cydonia-22B-v1.2/23ae6a72-5a1f-4961-8662-feb4d8ad8a26.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Gemmasutra-9B-v1/312ec315-6175-4f99-8741-97d97eb26b47.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Gemmasutra-Mini-2B-v1/7869bbe3-fd17-4e6d-9546-94d3df5e83ef.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Llama-3SOME-8B-v2/68c9fb85-f90e-442f-aa96-458dabe30b39.json delete mode 100644 
data/hfopenllm_v2/TheDrummer/Ministrations-8B-v1/6891d1dd-0e1a-42e8-9206-64a4c71854f9.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Rocinante-12B-v1/c62eb6b3-2a3d-45bd-acdf-bad717e51766.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v1/55d4a6ae-44e5-4a1b-9509-299fbc6c3a36.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v2/227e3e19-29d6-414f-b538-9f6f89d47677.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v3/e922ac2c-e8d0-48f2-99fc-da70c925136c.json delete mode 100644 data/hfopenllm_v2/TheDrunkenSnail/Daughter-of-Rhodia-12B/59f93c1c-3712-4ee2-a3d2-999e5acc2ee5.json delete mode 100644 data/hfopenllm_v2/TheDrunkenSnail/Mother-of-Rhodia-12B/a98dcf1e-6abb-402b-9e0c-da7c23b74bde.json delete mode 100644 data/hfopenllm_v2/TheDrunkenSnail/Son-of-Rhodia/a889f561-0d8a-4345-9131-0a897ec215ac.json delete mode 100644 data/hfopenllm_v2/TheHierophant/Underground-Cognitive-V0.3-test/6402facc-6258-43a4-a0fd-78e21765c504.json delete mode 100644 data/hfopenllm_v2/TheTsar1209/nemo-carpmuscle-v0.1/29fbd2e0-e08a-48f4-905e-d2aa54886915.json delete mode 100644 data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-r-v0.3/313e0379-d3ea-4f5a-8e06-4b0a94317487.json delete mode 100644 data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.1/f326fbd0-5f92-4324-a587-1f08cf7da208.json delete mode 100644 data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.2/d61310e9-5267-4a87-8e24-ae25172cd64e.json delete mode 100644 data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.3/60953e5e-523d-43c0-ad00-f746308030b1.json delete mode 100644 data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4.1/5afd8861-d7cb-45cd-af1b-6db966cb56e0.json delete mode 100644 data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4/c3972df1-4414-4c71-b473-fb9459cf085b.json delete mode 100644 data/hfopenllm_v2/Tijmen2/cosmosage-v3/b89d54b7-2329-4608-b9f6-07017e63f1cd.json delete mode 100644 data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.1/50389350-af23-41ba-af46-5ffe338ff9d2.json delete mode 100644 data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.5/b8f8f045-2306-43ad-8fa0-6a8bdb494db6.json delete mode 100644 data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.6/7cd59011-75d7-4497-956c-322d5d609c5f.json delete mode 100644 data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v1.0/1313d865-9c5b-45d2-ad64-629c65f07f2c.json delete mode 100644 data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/0efc2583-bf21-4b60-96cc-716928768eb1.json delete mode 100644 data/hfopenllm_v2/TinyLlama/TinyLlama_v1.1/be0a2737-19a0-4401-998a-a03663467133.json delete mode 100644 data/hfopenllm_v2/ToastyPigeon/Sto-vo-kor-12B/71720e07-2de0-4402-bdfd-102150c61765.json delete mode 100644 data/hfopenllm_v2/Trappu/Magnum-Picaro-0.7-v2-12b/38c84c69-5cdb-4f24-820d-4b39c5b118ff.json delete mode 100644 data/hfopenllm_v2/Trappu/Nemo-Picaro-12B/de9d274d-f213-4037-9711-3e9d3dbbcc96.json delete mode 100644 data/hfopenllm_v2/Tremontaine/L3-12B-Lunaris-v1/92381da4-b9d1-43c4-a5c9-59f375017e11.json delete mode 100644 data/hfopenllm_v2/Triangle104/Annunaki-12b/44ab6a50-027d-47df-a518-5aa944eb2a61.json delete mode 100644 data/hfopenllm_v2/Triangle104/BigTalker-Lite-8B/2a1947d7-74e0-43d0-931d-b2862348e90a.json delete mode 100644 data/hfopenllm_v2/Triangle104/Chatty-Harry_V2.0/3677b71c-387d-4182-b15d-c3525bc7bc36.json delete mode 100644 data/hfopenllm_v2/Triangle104/Chatty-Harry_V3.0/6b125a8e-5b53-48ca-8875-926249879f39.json delete mode 100644 data/hfopenllm_v2/Triangle104/Chronos-Prism_V1.0/af851d4b-69d4-49a9-a160-a180146c3963.json delete mode 
delete mode 100644 data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1/7aa6ce37-c0e4-48ce-b9db-f158ac47d366.json
delete mode 100644 data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1_TIES/1bce093e-27c0-41ad-aad6-b656f6773ed5.json
delete mode 100644 data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-10B-Harmony/5c6cffab-ef72-4e12-808c-c26ee8ec6999.json
delete mode 100644 data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-14B-Harmony_V0.1/e288a874-f750-4a90-be07-616094c220cf.json
delete mode 100644 data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-7B-RP/0607da8d-3f4e-468a-91a6-b975261a87c0.json
delete mode 100644 data/hfopenllm_v2/Triangle104/DS-R1-Llama-8B-Harmony/be2cc2fd-c8e7-4421-b8c8-d3b937272d0d.json
delete mode 100644 data/hfopenllm_v2/Triangle104/DSR1-Distill-Llama-Lit-8B/15ffe64e-72fd-4e65-8632-babf137a386d.json
delete mode 100644 data/hfopenllm_v2/Triangle104/DSR1-Distill-Qwen-7B-RP/ce1c0d4f-f5a3-49e7-ab77-65ff51bbd0ca.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Dark-Chivalry_V1.0/b5afab38-13ba-4abd-9d04-a433c41061c5.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B/a862c2a5-f66b-4d09-ac57-6cbe565f9f35.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B_TIES/d8254f6c-8110-44d3-800e-101fc731d779.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Distilled-Whiskey-8b/ccbcd5a7-2b98-4d90-ace1-3ad5971a5f18.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Dolphin3-Llama3.2-Smart/c208b19b-4ecf-4fad-b931-54f65d4b711b.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Gemmadevi-Stock-10B/debaf4a0-c734-47ea-bea0-2ddc65dc397d.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT-Summary/0eeb5962-ccc0-407b-92e6-7cf17c00941f.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT/4b60e863-482c-4f91-8cd1-6c993d3c5988.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Hermes3-L3.1-DirtyHarry-8B/f5f0bc72-427d-4703-aab1-1bb1bea73895.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Herodotos-14B/aae7f543-7b5b-435f-a506-e3ab901a8c5a.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Herodotos-14B_V0.1/6e6ff4c3-3cfd-4790-80c4-544d9cbe47e2.json
delete mode 100644 data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink/3ee76278-89d4-44fb-a449-717534b00161.json
delete mode 100644 data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink_v0.r1/fa2854d3-9e2f-4f79-ac8c-e1cb5a638745.json
delete mode 100644 data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesBlackroot/9ddaa721-bf3a-416a-9be8-291188793cc9.json
delete mode 100644 data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesInk/d659077d-7261-4c69-862c-d61be21662a2.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Llama3.1-Allades-Lit-8b/e87ba227-c55e-4666-949d-b45913f8336b.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Llama3.1-cc-Lit-8b/077f683a-af6f-4a71-b599-b9b269546b7c.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Minerva-1.5b/54808b08-d10d-4a06-ab60-8d99039311b8.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Minerva-1.5b_V0.2/138e6fdb-7092-4ee6-be82-7bb86c1fc759.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Minerva-10b/1b27423f-62cc-4189-a293-5af84ef1f2c8.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Minerva-14b-V0.1/f5468512-d2c7-4486-9d31-bef61225af52.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Minerva-14b/0e0ec1a9-76aa-4d7e-9c0e-946d6b000a6a.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Minerva-7b/07b87b98-0d61-4479-937f-7447565b4631.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Minerva-8b/85b11b91-d686-49e9-8db0-971dd7cafb75.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Mistral-Redemption-Arc/21bac032-a092-4afa-8d29-ebdefb3a0650.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Mistral-Small-24b-Harmony/29e3a687-429f-4f33-ae5f-48db85127364.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.1/d98493a6-f237-4565-8508-9e4cc3188d2d.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.2/2def6fbd-7488-4e9f-a822-2405d4f7a315.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Pantheon_ChatWaifu_V0.2/819143d4-9538-48b9-b7af-128bc15c518a.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Phi-4-AbliteratedRP/c29d47af-a9de-4edb-acac-6763c0d44ca3.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Phi4-RP-o1-Ablit/22bf3fb7-9235-4a57-b8fd-c85b12047b0e.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Phi4-RP-o1/2bea7014-460d-470b-918f-468b58d70fd6.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Porpoise-R1-Llama3.2-3b/3927a5dd-002b-441a-b769-ba68547cd5f3.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-14B-Instruct-1M-Harmony/476fc734-dedd-4192-aa59-eb2f9dabf16b.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-AthensCOT/817e2fbe-0866-489f-b987-391228a68c53.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-CodeR1-3B/f25f5eb1-ff22-4be3-a639-a9d25207078f.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-EVACOT-7b/f71d1c31-184b-46be-a288-bdc92f0ebe09.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-EvaHumane-RP/0d9547b3-7bef-4815-9c44-7d714fe81bbb.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-Humane-RP/22dbc5a2-0ff6-4566-9bfd-e5ce314be597.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-Instruct-1M_Harmony/afedb249-f1a5-42d6-b6c0-54b2cc303f64.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-R1-3B/61b1bf5e-6aa4-4e90-af2c-dcf5fc9903f2.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-R1-7B/c0adc04c-1e02-4891-a5a1-1fab0ddf18ca.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Robo-Gutenberg_V1.0/cc57e6f0-ab55-4ab9-983c-63d74632d016.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.0/0d3c5fdb-c4a5-4436-b9d4-f0f42cb4db96.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.1/a6ec2934-e9fd-481d-8f00-932603bc6e0a.json
delete mode 100644 data/hfopenllm_v2/Triangle104/RomboHermes3-R1-Llama3.2-3b/e2553c93-60df-4126-9e64-ecd4a5003389.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Rombos-Novasky-7B_V1c/e7c2fb42-e82a-4dac-9cc3-a9f41ab54e0f.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Set-70b/a807ee8c-509e-4b6d-a414-df24444d8a0a.json
delete mode 100644 data/hfopenllm_v2/Tsunami-th/Tsunami-0.5-7B-Instruct/2199024b-7944-4950-8335-32a536efad02.json
delete mode 100644 data/hfopenllm_v2/Tsunami-th/Tsunami-0.5x-7B-Instruct/97919c86-6161-4548-95b9-d44263a29f8a.json
delete mode 100644 data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-14B-Instruct/c40c1a46-2e30-4cf1-bcf3-a316a793fbcd.json
delete mode 100644 data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-7B-Instruct/c1294268-b5f5-4d64-b91a-147f58a21a47.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter1/2b029e6d-a0b8-4b6c-b62d-144b8dc4f739.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter2/b926ca6c-60c9-4353-9671-0453b46d0222.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3/44db30b4-2010-4f96-a39e-9ccc8568374f.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter1/2210d673-d417-46be-aeca-de48cd846e01.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter2/892d27cc-dfb3-40c7-ae0f-a7cd06784808.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/49b3f293-721d-4d44-9748-88d1ce275050.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/70fb41fe-46af-49e3-8270-5882e12f710f.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter1/13e2489f-9d96-4f68-8e22-c937604c2145.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter2/0c386ea0-4706-4a6f-994c-b6ee21dbce92.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter3/a8d5a193-6c87-4b5b-8ea3-b3ab78e73104.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO/4018f4bd-492a-4814-9a7a-1f0c376f2d2e.json
delete mode 100644 data/hfopenllm_v2/UKzExecution/LlamaExecutor-8B-3.0.5/568072cb-118d-41af-bfe8-fa14cb4c7348.json
delete mode 100644 data/hfopenllm_v2/Unbabel/TowerInstruct-Mistral-7B-v0.2/a6d08766-8c36-41bf-8bbc-acdfdc3f8e23.json
delete mode 100644 data/hfopenllm_v2/Undi95/MG-FinalMix-72B/2504fed5-c8a1-4ffc-8ce5-9559aa8c4325.json
delete mode 100644 data/hfopenllm_v2/Undi95/Phi4-abliterated/359dde31-d9dc-4c22-b829-77df652dcc73.json
delete mode 100644 data/hfopenllm_v2/V3N0M/Jenna-Tiny-2.0/34a79823-b993-402a-89a7-538e126ee02a.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-70b-Instruct/f392c5c3-9bee-4111-9a22-6a1b706fd2ad.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct/73bbdd22-4e5f-496b-b39f-290d8e0d2aa4.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-70b-Instruct/72a66eae-9c94-40e3-b3c9-211303e5cba8.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-8b-Instruct/ef7390b5-599b-4354-805b-9486e4ce34fa.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-1.5b/57f964c3-0504-4b60-9539-ce0e369816ea.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-HerO/4e6c0336-5d94-4417-a194-92a4d6f38481.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-LaserChat/fe38dea8-92f4-4fb2-afdf-c5932d7c9e27.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-2b/5ced7497-5a05-40d2-80cb-cae63ca62022.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-7b/52a66aaa-193a-48ca-b693-4dcab811eaa3.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct/e0e4bcef-cb73-436b-9353-b18ade293e8b.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Nemo-12b-Instruct/1ae45791-7e47-4083-bd72-4530fa26893c.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Phi-3-medium/b2731f04-a9bd-4e36-a545-85be5b66f5a7.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-SOLAR-Instruct/ed6de552-d04b-4d51-8456-610e2cb41d85.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-2b-it/3e08a589-d2b3-487b-900e-85725522a2e4.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-9b-it/b2717503-d081-40ee-b1ed-fcadaf239049.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-DPO/9915eb01-5c45-42b6-82a3-ad782411642f.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-SFT/190eb7ca-46db-4e1d-8b71-9bb20af74ede.json
delete mode 100644 data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B-r-v-0.1/86b9077d-9ec3-411d-84c5-326ba97742c1.json
delete mode 100644 data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/18bfa50c-20be-4027-8ee7-f6cd1411c882.json
delete mode 100644 data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/eb1a099a-48c7-412b-b62f-143537c41f06.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3-70B-Fireplace/e530a4b7-c2f6-4bad-bab5-2895e950ed63.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3-70B-ShiningValiant2/52ad7152-feea-46a6-b2d8-20e1a70514ce.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-70B-ShiningValiant2/a61162a6-ef3e-46f4-8aa2-241547fadea2.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/9f208aef-8544-47c8-bb1f-a3841aff208b.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/da237ab6-df39-460f-9efc-e1649e1ac202.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Enigma/c81b3193-9d01-4590-8b72-da97aa3c9dc4.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Esper2/1a9ffe50-69ae-48bc-b636-89431391eb37.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/b0c67359-1da0-4f55-aa1c-f54f88038bd7.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/c700798b-583a-41be-94dd-382669bb495f.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/3c0b9735-2ef1-4f27-b94a-f246eb57b73c.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/e8c9501b-c985-4b78-a902-a1a030c72e60.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Enigma/df978fce-3373-4073-8c44-d6a83df1d9d1.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Esper2/e46ee8d9-81af-4259-8fef-3d3113fb6168.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-ShiningValiant2/aa6ab404-89ef-4336-b811-7c8064e26107.json
delete mode 100644 data/hfopenllm_v2/Vikhrmodels/Vikhr-Llama3.1-8B-Instruct-R-21-09-24/a14e6c79-4a78-4c02-a7ca-35e783f32be1.json
delete mode 100644 data/hfopenllm_v2/Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24/ba1fb85b-bbc0-46ac-95d7-e61b91f65c2b.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/Bagel-Hermes-2x34B/f6312fc7-c7a8-45dc-a57c-91f56b4ca28a.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/Bagel-Hermes-34B-Slerp/335f5c32-f3f0-4a16-8c9d-8f07b2aae54a.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/Einstein-v4-7B/b7c7a907-7ecc-4d5b-bc6f-8b8d82954b21.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/Einstein-v6.1-Llama3-8B/112f01a2-f0fb-4257-86bf-61c9a184eb92.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/Einstein-v6.1-developed-by-Weyaxi-Llama3-8B/2d9410d6-7162-4811-bf7d-9de2c2b48fd2.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/Einstein-v7-Qwen2-7B/16ff8fa3-4676-473c-99ad-908ddb59d8ed.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/Einstein-v8-Llama3.2-1B/9b153ac9-f95b-419b-b7f9-beccd769ddad.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/SauerkrautLM-UNA-SOLAR-Instruct/8a5df3c2-eb71-4e12-b013-fb43685f2916.json
delete mode 100644 data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.0/35fa3213-5c08-4b19-ae76-237fdd25444e.json
delete mode 100644 data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.2/242ce55f-1471-435e-bcd7-d28b5fc87fc4.json
delete mode 100644 data/hfopenllm_v2/WizardLMTeam/WizardLM-70B-V1.0/95f509f2-5e67-404a-968d-f7488d684e32.json
delete mode 100644 data/hfopenllm_v2/Wladastic/Mini-Think-Base-1B/bcbcdfe9-0663-417c-9a29-60906e63db8f.json
delete mode 100644 data/hfopenllm_v2/Xclbr7/Arcanum-12b/d95a7493-2f99-4c10-8067-711c7388af7d.json
delete mode 100644 data/hfopenllm_v2/Xclbr7/Hyena-12b/789848a0-6d8a-4583-93c3-a72df74d0071.json
delete mode 100644 data/hfopenllm_v2/Xclbr7/caliburn-12b/14af87df-0fc5-46e1-9d0b-c25c8b6a7ce7.json
delete mode 100644 data/hfopenllm_v2/Xclbr7/caliburn-v2-12b/379f559f-9bfa-444f-b477-562c25b4c299.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER-v0.2/effb6a3d-c98f-4c3a-be77-902c61cda21b.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER/6c1c1405-afa4-412d-ba1f-49dc1cac4509.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Phi-4-Megatron-Empathetic/6f4ed7c2-c775-4fd2-8600-4cea523f53e4.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Phi-4-mini-UNOFFICAL/5fd5206b-186a-43b9-a4f4-07e75aa0293a.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-7B-MS-Destroyer/b707ecbf-0658-4226-803d-53456d16d54b.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview-v0.2/dca1ee57-5e86-4532-a2f3-ac6a619ca576.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview/1233476a-7839-4a22-a7ca-1d0f237d8888.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Medium-Censored/5c4bdeca-5ef8-4002-8f82-67d49b5ff722.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small-AnniversaryEdition/18f5fd6c-2b79-4d48-b7e9-18845db16271.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small/a9039374-fa5a-4b8b-800f-5f4651cf812d.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Ultra-1.5B-25.02-Exp/3f9704b4-bf25-40da-b6dc-b927c3569f40.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Reflection-L3.2-JametMiniMix-3B/a8f858d8-a792-409f-b79d-948a19e2aa87.json
delete mode 100644 data/hfopenllm_v2/Xkev/Llama-3.2V-11B-cot/5c34a168-b8cf-436b-a3b7-a2d1feadffb9.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-1M-YOYO-V3/77092cfe-9820-45e8-94c5-31d27f1daa7c.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0505/cab8fed8-de68-4fa5-b4fc-d9483fc56571.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0510-v2/a8103350-b208-4856-8e7b-8ea8918ba0d1.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0805/e849c03c-c569-4059-8fc5-6a98cf391342.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005-v2/f1d8bffa-61fc-47d5-85cf-48cebcb31af5.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005/97bdb352-2e9d-4cc5-8b70-55348ef3a217.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010-v2/78053a33-24c8-4e9f-8791-f127f21eec1c.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/03082966-87ba-4560-a784-5d8677003500.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/97f26b20-db66-4a30-ba2a-c18a31081271.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-SCE/85f9ccda-8c47-4fa1-9d47-e9da4730b077.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p1/2a57d6f4-643b-4b30-8d67-03032d454887.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p2/d333f360-c1c3-4916-8480-4a1fc490875a.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4/37a41261-a7b0-44b2-916f-770cdfa0ad39.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest-V2/c46cd6cc-b56d-44c5-a03c-b49381ba3462.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest/612b6226-c25d-42e0-bcd7-be7faa844530.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-it-restore/2fc7a4d6-88e0-4f11-9110-dc53942870a4.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-7B-it-restore/34665752-58d8-48ee-81a6-f1a068c23026.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-Coder-14B-YOYO-1010/cc0767b5-4aaa-4418-8f68-72a721323e9c.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V2/ea507a41-1654-4515-94cc-ce2e38800c61.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V3/c44e773f-4cca-4780-bdd4-f486e65c18e0.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V4/f8a46bda-d53b-484e-8832-7939f7d0762d.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B/c3968a2d-4a9a-4f62-8bea-a3b4b6dcd378.json
delete mode 100644 data/hfopenllm_v2/Yash21/TinyYi-7B-Test/da18242c-d6bb-4a0a-a2f9-2e42099f4e8a.json
delete mode 100644 data/hfopenllm_v2/Youlln/1PARAMMYL-8B-ModelStock/ac078124-85d9-4715-bf7c-1428b1063732.json
delete mode 100644 data/hfopenllm_v2/Youlln/2PRYMMAL-Yi1.5-6B-SLERP/9c1dcd75-8491-4890-ac6f-000868099a3e.json
delete mode 100644 data/hfopenllm_v2/Youlln/3PRYMMAL-PHI3-3B-SLERP/7850fc57-49c7-4124-b7c6-e1e7bb2bc726.json
delete mode 100644 data/hfopenllm_v2/Youlln/4PRYMMAL-GEMMA2-9B-SLERP/8f38374e-f373-4639-9278-24441ebd0325.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-12B/c007938e-3427-4896-8493-1500abdfbd2b.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-15B/df81dc0d-6c72-49e9-862b-02e9b6642cb6.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3-MUSR/46c96d8e-568c-48f8-a74b-9dd4b4195037.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3/1f4f7181-8a81-49f4-9e81-925d5d69a37c.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V4-MUSR/3ea343b6-93f6-4c61-a164-3db95d13cbdf.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V2/a9ea8bb5-05fc-4da3-8e00-f53ab8ea6af5.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V3/0ea74ce5-43c9-43eb-92bc-3d928062d9e0.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V1/6896faa7-7204-4091-8f4e-9cc0b53d673a.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V2/88064453-fd8c-4bd9-adf1-39f43972bec1.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-7B-SLERP-V4/a18ade45-acba-4059-b969-445e529a82e2.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5-FT/6c0e4132-71e7-44af-95fc-83b0a6be2a82.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5B-Youri/5d9ab422-4f4f-460d-bd39-51266b43d7e5.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL1B-FT-V1/cda03c45-0782-40cc-a17d-67d808657b83.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-Qwen0.5B-FT-V2/50f5451b-41c4-4ba5-8bee-ee8a2deb7e79.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE.EIFFEIL.ia-0.5B-SLERP/cf758994-6e94-434d-bf68-74cca188b5e8.json
delete mode 100644 data/hfopenllm_v2/YoungPanda/qwenqwen/611f9549-0788-44e9-8125-18df06cd80d6.json
delete mode 100644 data/hfopenllm_v2/Yuma42/KangalKhan-RawRuby-7B/59cf23ba-027d-4bac-a0e1-526376396b4d.json
delete mode 100644 data/hfopenllm_v2/Yuma42/Llama3.1-IgneousIguana-8B/1f02bbd3-ddaf-4db6-b7f8-31bad8ffac66.json
delete mode 100644 data/hfopenllm_v2/Yuma42/Llama3.1-SuperHawk-8B/1e737e28-d926-43e8-9e4c-e39fa91d7977.json
delete mode 100644 data/hfopenllm_v2/Z1-Coder/Z1-Coder-7B/43ef8eee-5d8a-47e7-ac71-1a898421370a.json
delete mode 100644 data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-dpo-avg/d8d03c71-942f-4aff-8a5e-5c265c639b44.json
delete mode 100644 data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-rpo-avg/96262938-1146-4993-92a1-a2ddb2519f8a.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/L3-Aspire-Heart-Matrix-8B/292d7cfb-3e3c-47d8-8cca-33507f9ff081.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-AthenaSky-MegaMix/3f29c10f-57ef-435b-85df-2cae30ae72fa.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-RainbowLight-EtherealMix/d7f022fe-86cb-4e4e-a672-62c2dc8cffd3.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SpecialTitanFusion/baa35c90-c494-4dff-af28-cb549e40bed8.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperNova-EtherealHermes/2fdc3186-6791-4550-ac4f-a1a5a5a1d514.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperTulu-LexiNova/f687df8b-42b5-4d94-b741-1b516d9221b2.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Qwen-2.5-Aether-SlerpFusion-7B/c3a8a952-6869-4eee-a59f-4ae33ac72986.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-CelestialHarmony-1M/a7a74117-71e4-49b2-bd65-add82c9165d8.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix/04ee694c-0c89-4f25-b10f-315a24743ba2.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerCreative-Mix/47fd4acb-acc3-4f12-8af5-c425d3754c38.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-Qandora-CySec/e19577f5-d1ba-45ad-8500-d18ae2b14440.json
delete mode 100644 data/hfopenllm_v2/ZeusLabs/L3-Aethora-15B-V2/e86443cd-453b-4ca0-8e7e-054764fe4bb9.json
delete mode 100644 data/hfopenllm_v2/ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3/24cd9977-f3fb-4619-aea1-59e1a36b2a5e.json
delete mode 100644 data/hfopenllm_v2/aaditya/Llama3-OpenBioLLM-70B/1401f0d9-6f4c-41d2-819f-eb9487c5c1e6.json
delete mode 100644 data/hfopenllm_v2/abacusai/Dracarys-72B-Instruct/4b1f2aab-ef92-4231-9bdd-96918b26914c.json
delete mode 100644 data/hfopenllm_v2/abacusai/Liberated-Qwen1.5-14B/4956e127-14a1-405e-a0e0-76fe94ea727b.json
delete mode 100644 data/hfopenllm_v2/abacusai/Llama-3-Smaug-8B/90fb6e40-88f7-4ce2-ae99-308d87e69718.json
delete mode 100644 data/hfopenllm_v2/abacusai/Smaug-34B-v0.1/cdad0f08-1c60-4493-bed0-9733894b367a.json
delete mode 100644 data/hfopenllm_v2/abacusai/Smaug-72B-v0.1/8e83b4f7-736f-4e03-8256-2a1fc421b04f.json
delete mode 100644 data/hfopenllm_v2/abacusai/Smaug-Llama-3-70B-Instruct-32K/f0d6639d-8485-4bcd-b069-046a747dfbfa.json
delete mode 100644 data/hfopenllm_v2/abacusai/Smaug-Mixtral-v0.1/d1fe36ba-04f8-4110-8c39-81d393c4cbfc.json
delete mode 100644 data/hfopenllm_v2/abacusai/Smaug-Qwen2-72B-Instruct/5a8ab5fb-ec1e-490c-b643-e3b9d49f5d34.json
delete mode 100644 data/hfopenllm_v2/abacusai/bigstral-12b-32k/de944f89-d2d4-4b01-b4b5-e7cbd1d8d1ae.json
delete mode 100644 data/hfopenllm_v2/abacusai/bigyi-15b/db96601a-2f7f-438f-915b-55fee0e0d1d1.json
delete mode 100644 data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/27912f7d-7033-4b7c-b93a-af1673ce4a9b.json
delete mode 100644 data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/da58a484-4a45-4a70-a651-031ada8023d5.json
delete mode 100644 data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v1/e8bd221d-8a89-4e3c-8815-0bff27574053.json
delete mode 100644 data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v2/ffc21c2a-59fb-4ad8-88a4-930879b6eba0.json
delete mode 100644 data/hfopenllm_v2/abhishek/autotrain-llama3-orpo-v2/1e506afa-0d08-45d6-9242-b06104aa67e8.json
delete mode 100644 data/hfopenllm_v2/abhishek/autotrain-vr4a1-e5mms/7d66bb93-cb2f-4be6-b133-1f0325be58e1.json
delete mode 100644 data/hfopenllm_v2/abideen/MedPhi-4-14B-v1/936f3c5f-7817-4118-96c8-e4061d4560fb.json
delete mode 100644 data/hfopenllm_v2/adamo1139/Yi-34B-200K-AEZAKMI-v2/7d36ceed-2a1b-4b20-88ae-0a609cc161e9.json
delete mode 100644 data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/77cace56-503f-4531-a4eb-0178a68cc283.json
delete mode 100644 data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/9e49b710-2413-42f3-8943-bc9dbf68cb3c.json
delete mode 100644 data/hfopenllm_v2/aevalone/distill_qw_test/9a5b3564-97df-4661-a171-37322386ac4d.json
delete mode 100644 data/hfopenllm_v2/agentlans/Gemma2-9B-AdvancedFuse/0fc0450d-cdf1-44b5-a809-202d1dd6b5e3.json
delete mode 100644 data/hfopenllm_v2/agentlans/Llama-3.2-1B-Instruct-CrashCourse12K/7f06c78c-f95e-4e50-aa57-da0579adcdae.json
delete mode 100644 data/hfopenllm_v2/agentlans/Llama3.1-8B-drill/06e55e47-9995-4fa2-877a-c728e9f9f1a1.json
delete mode 100644 data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish-Instruct/39af1e0a-d1e3-4372-bc18-d07f3dff09f0.json
delete mode 100644 data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish/f32d59d6-8ab9-4b7d-ad9d-f62ce6d559bd.json
delete mode 100644 data/hfopenllm_v2/agentlans/Llama3.1-LexiHermes-SuperStorm/7ddc3aef-c6c5-4d04-8473-3b3bba219d7f.json
delete mode 100644 data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse-CrashCourse12K/ce80ac07-22d2-4883-ac6c-40b080e00b81.json
delete mode 100644 data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse/cbece170-f872-485f-a6c2-5db17ced73bc.json
delete mode 100644 data/hfopenllm_v2/agentlans/Qwen2.5-0.5B-Instruct-CrashCourse-dropout/c1fd751b-c6c3-4350-9618-f4b4840e1b69.json
delete mode 100644 data/hfopenllm_v2/ahmeda335/13_outOf_32_pruned_layers_llama3.1-8b/bfd28b91-3a72-4417-b52b-804d2cbae12f.json
delete mode 100644 data/hfopenllm_v2/ai21labs/Jamba-v0.1/32c26cbc-3697-47a6-bd12-18187df9dda9.json
delete mode 100644 data/hfopenllm_v2/ai4bharat/Airavata/02280b9f-bc01-4e44-9d09-1e4ae8c0438b.json
delete mode 100644 data/hfopenllm_v2/aixonlab/Aether-12b/a57d2d49-5ccf-48f5-8035-b1d480c80f40.json
delete mode 100644 data/hfopenllm_v2/aixonlab/Grey-12b/6b5a3c69-f8dd-4952-96fc-b6e4dec1ed9d.json
delete mode 100644 data/hfopenllm_v2/aixonlab/Zara-14b-v1.2/fe0665dd-b976-4d90-b16b-6c2acfef15ff.json
delete mode 100644 data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-First/8c6bdc44-fd29-45e7-b161-2c8e07ef2935.json
delete mode 100644 data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-Last/e7c70ff9-59ad-4d09-8af0-ef9cf16d1dfa.json
delete mode 100644 data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-First/26c4c993-ae49-42a0-be0a-f157be9f7d58.json
delete mode 100644 data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-Last/19adf124-c120-4e97-80cf-49c40a66eb81.json
delete mode 100644 data/hfopenllm_v2/akhadangi/Llama3.2.1B.BaseFiT/66bc5d38-8d25-4934-bce8-41ce4ea0e385.json
delete mode 100644 data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/541eafe5-807e-44b0-b652-a0752210fc71.json
delete mode 100644 data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/845a2484-9f17-4c0e-b06b-6250992298bc.json
delete mode 100644 data/hfopenllm_v2/alcholjung/llama3_medical_tuned/e62b6b26-5f3c-42c9-9541-bb8b23caee66.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-DPO/ec773b66-24fd-4b6f-ac9c-ebcd355e4be7.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-SFT/a70b8356-94ce-4f0d-b44a-2215076eed5e.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/b182807d-587e-4702-bf30-dab11983b8db.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/c1f0944a-c44c-42e9-90ba-a847509cbd66.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-DPO/64bb8530-7071-402e-ba9b-1d15ecbe275c.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-RM/4f1fc265-f8b7-47e6-a9e6-cfa61b89ad4a.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-SFT/1420df5c-690e-4b01-b99c-c21c793689ae.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/aa9d0b0e-cb3f-452e-bc85-f7cf172d2b8b.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/dfabd777-8620-40e3-b19c-a9227f57b638.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMo-1.7-7B-hf/08fe3877-ab04-426a-9e27-72ec4ff8ffc3.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMo-1B-hf/4b264bb0-bd7e-4b15-9591-50b5a521f100.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMo-2-1124-7B-Instruct/a8cfe336-0c3e-401c-a1e9-d951e64918ec.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMo-7B-Instruct-hf/5e66c653-41b1-46de-b677-ffd8426ba5ec.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMo-7B-hf/9f0f0914-1f7a-468e-8a2e-7ae122fd064d.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMoE-1B-7B-0125-Instruct/cc64a143-4f1e-42ee-ade1-fafc4b316336.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924-Instruct/cf322e64-2682-4a9a-a48f-c4ec47b852f2.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924/30b32261-b24a-49e3-ba57-172dc1d03ba0.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Chocolatine-24B/0681c01d-23f3-4b8b-9516-a5cc41761fc4.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-2.6B/7693ed8a-f76d-482b-92c1-f11810e522ca.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-27B/f8dc0128-c606-490a-b965-59d5377dd778.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-2.6B/844547f7-658f-41dd-ab4c-dc0569030e59.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-27B/75c291b5-6d60-4bde-8621-f865196a6ecc.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Gemma2Slerp3-27B/36d54b12-594f-47fe-9637-a9b740416c5c.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Gemma2Slerp4-27B/57733383-9573-463d-a467-068d2685014c.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/GemmaSlerp-9B/eda1ac9a-98e1-496f-bdeb-1e256b52c14a.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/GemmaSlerp2-9B/00b8bfda-c6b1-4e1f-b68c-bff7335e2dff.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/GemmaSlerp4-10B/0a3b9ad6-b853-471d-a292-413b30273034.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/GemmaSlerp5-10B/d61c3ace-e353-4c0b-9472-c9a1928809cc.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/GemmaStock1-27B/2293a19a-b650-436d-9448-1b641e63d407.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/HomerSlerp1-7B/c15b977c-c781-4b17-ac9f-25c77602c875.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/HomerSlerp2-7B/42c191be-c0ae-4170-8b6f-565053ae7d9c.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/HomerSlerp3-7B/f5cb910d-6e5b-404a-a751-d5cb90668150.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/HomerSlerp4-7B/de806e4c-dbf8-48cc-a0d8-033a61dfc777.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/LimyQstar-7B-slerp/59150b73-b05a-451e-ba3f-696d04effe05.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Llama3.1-60B/84926b81-360a-480c-b240-f154ec7fe0ba.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Marco-01-slerp1-7B/8e6edb04-302b-4dfc-b38f-94b437c921a8.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Meme-7B-slerp/db92c564-1cf9-43db-9e25-1f450c7b1e7f.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ministral-8B-slerp/e3796243-cbba-4ec2-ad7c-89547ad24342.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MistralPhi3-11B/1479be90-df8f-4e1d-b9db-03e84000187a.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Mistralmash1-7B-s/d2e6c48c-1c18-45a6-ba1a-b335325c980c.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Mistralmash2-7B-s/f843e45a-f66b-4091-a964-75583c2d7fc5.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MixTAO-19B-pass/cbc3cd41-e187-4c4f-b207-37bceab423a4.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MixTaoTruthful-13B-slerp/0f124566-5e94-4233-9a3f-5ff9cfdf160c.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiCalm-7B-slerp/98fabba8-7d70-4a1f-b03c-37e1a9ac94e8.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash-12B-slerp/91522dad-529b-477c-8372-793f631e14b7.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash10-13B-slerp/cec22734-493c-4d11-ba86-6c7ae2005124.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash11-13B-slerp/704a6e19-0d86-42a5-b8f5-05a5856e9c29.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash2-12B-slerp/bc54349d-59e0-4ae4-94f9-3f5ae98261f4.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash5-12B-slerp/d20d533a-758b-477c-b4eb-073adaed640e.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash6-12B-slerp/f7c9ad0d-3fea-4bec-8ac3-46f01a3449fb.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash7-12B-slerp/9db1f823-e068-4a39-a5cc-b9c588099427.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash8-13B-slerp/23818b45-bf5f-48a2-982f-1e2a0d35aac8.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash9-13B-slerp/de6eda66-b8f5-4b23-89e1-44bbac600953.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMerge-7B-slerp/632974c2-57e2-41f9-8c00-671e07e7594b.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Multimash3-12B-slerp/e86dcf4f-6282-4aa6-b645-00f93a2e9077.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Multimerge-19B-pass/b20be5c9-9720-4076-b587-728549dd19af.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiverseEx26-7B-slerp/5e193803-39d1-4f12-8726-ebbe5f71563c.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/NeuralWestSeverus-7B-slerp/61131a6c-f412-42bf-814b-7d711a840d44.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Neuralcoven-7B-slerp/535e72b1-17e0-40e3-9d66-d31f8ec70413.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Neuralmultiverse-7B-slerp/ea15479e-24a8-4924-a754-a8567c511e61.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3della5-14B/5799f285-c61f-43a8-a6a6-053808cf4e8f.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3merge-14B/36feef44-3d3b-4102-8606-ee6420bddcff.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3merge2-14B/fd55f19a-2c22-4f29-82e0-15b02f25b9a9.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3merge3-14B/18e5decd-c95e-43d2-9ba2-007ba32e216f.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3task1-14B/85a4996e-8c44-4e4f-9478-19a8c5513617.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3task2-14B/db6d57c8-df0b-407e-b937-67c55b513a5f.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3task3-14B/89ac933d-0a7c-40e6-8fa7-35bb6205e44b.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3unsloth-3B-slerp/c79e690f-3e09-4fac-9412-937a3b7ef352.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Phi3mash1-17B-pass/ce74b7e3-8505-4c79-a7de-12d1e6b47155.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Quen2-65B/3c562d8a-2df9-4d3f-9699-bfaee4a1ce2b.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwen2.5-42B-AGI/152b0cbe-e27b-4438-8326-e67f4e70e600.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task2/c733c91f-79a9-49e5-9398-3a424ee1940a.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task3/32d7b6c6-de5c-4864-a446-97dccce378c5.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task4/7b22d02b-5bfd-4243-9ad9-c858d0af55a6.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task7/99650529-55d9-42b0-b812-761a30277e5e.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task8/81abbc2a-791b-4a39-bb46-97edfa14b9c0.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwen2.5-slerp-14B/c658e535-7098-40fc-bea0-f5734d8f4ca9.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/QwenSlerp12-7B/9e0656e9-9b82-4f6d-b00a-c09cf9cbc105.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/QwenSlerp4-14B/07c36058-e0e8-48ea-85f3-0a2cb2fe3443.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/QwenSlerp5-14B/c41d8925-b56b-458e-b1a9-27dbbcaee149.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/QwenSlerp6-14B/9136feb4-5c3e-48b3-bc70-c7816b8b189b.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/QwenStock1-14B/c395ef02-9a50-4696-aad2-bcb32ba05f67.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/QwenStock2-14B/93f47969-556a-4fd4-b7bb-4d1c861a8d71.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/QwenStock3-14B/349ae559-6c1f-4b2f-954c-e83cba1e603a.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwenslerp2-14B/3e43c3f6-645b-4ab3-b684-b23eb67bc5d9.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwenslerp2-7B/500c8cd4-fe4e-44f3-86b7-b0efd387ab92.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwenslerp3-14B/340a3ebb-bc06-404f-84e7-aeccc016fd32.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwenslerp3-7B/a6426f88-d7cc-4e6a-a2b5-76e59a52a6de.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/ROGERphi-7B-slerp/bdd05c8f-b895-4c91-9a9f-a608a4259cbd.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/RogerMerge-7B-slerp/0e1e45d4-2747-480d-9b1f-2b200e250271.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Rombos-LLM-V2.5-Qwen-42b/00f3f9ca-ae7d-4e62-9e7e-6bd202dbed59.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Strangecoven-7B-slerp/c9e57ab2-c2a4-4935-b976-4bf24647b777.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Weirdslerp2-25B/c22436a2-ec60-4220-82b3-123618165eb2.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/WestlakeMaziyar-7B-slerp/1f990438-dd84-44d2-99f9-a10035ecd652.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/YamMaths-7B-slerp/f4564f5e-3595-466e-8201-0e2a4c50ff0d.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Yi-1.5-34B/040def3a-702d-4868-b429-39697ca36207.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Yi-blossom-40B/9e24fd65-56ec-4160-b299-b34d702a3231.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Yibuddy-35B/216bf9f8-9521-4311-a40b-8a847271265c.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Yillama-40B/45f8c4fb-3591-44df-a4f0-57093b9bae23.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Yislerp-34B/d17275ef-8a32-4fcb-94f4-fb24299ba50e.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Yislerp2-34B/61b79e7d-0f50-4cfe-825c-ed5b23d943f3.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Yunconglong-13B-slerp/113c3507-b738-4b06-ada8-da93b19c6ae2.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/limyClown-7B-slerp/8835d5c1-8350-4d42-a753-82b94dffda3b.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/llama3-Jallabi-40B-s/dc3bbda7-5007-44c7-b1ba-af0c82d100ee.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/llama3AnFeng-40B/0d24ee06-a6b4-4be7-b3ef-c4f53b4fc414.json
delete mode 100644 data/hfopenllm_v2/allura-org/L3.1-8b-RP-Ink/f2415b7a-2cd7-4a05-834b-7da992e1da1a.json
delete mode 100644 data/hfopenllm_v2/allura-org/MN-12b-RP-Ink/01af237f-40d8-4841-a90d-13dce6db8634.json
delete mode 100644 data/hfopenllm_v2/allura-org/MS-Meadowlark-22B/d69bb392-fd38-4f57-b567-24566896167b.json
delete mode 100644 data/hfopenllm_v2/allura-org/Mistral-Small-24b-Sertraline-0304/63503943-1c1e-4dac-9c41-4933fbb44b70.json
delete mode 100644 data/hfopenllm_v2/allura-org/Mistral-Small-Sisyphus-24b-2503/80c5d343-41e6-45d7-8921-62586a3cd270.json
delete mode 100644 data/hfopenllm_v2/allura-org/MoE-Girl-1BA-7BT/2c27d7f6-60fd-49f3-8666-784f2a16031b.json
delete mode 100644 data/hfopenllm_v2/allura-org/TQ2.5-14B-Aletheia-v1/cbcc1e64-8455-4382-8999-654d1757bbd6.json
delete mode 100644 data/hfopenllm_v2/allura-org/TQ2.5-14B-Neon-v1/1bea4f6b-7a41-4907-baca-430c7ea179e9.json
delete mode 100644 data/hfopenllm_v2/allura-org/Teleut-7b/298ce89b-966c-4f4e-9da5-3803a395188f.json
delete mode 100644 data/hfopenllm_v2/aloobun/Meta-Llama-3-7B-28Layers/ea27a4d6-8c32-4b36-873d-1046ae6240e5.json
delete mode 100644 data/hfopenllm_v2/aloobun/d-SmolLM2-360M/73d5905d-7825-43ba-8051-7e1f5639b857.json
delete mode 100644 data/hfopenllm_v2/alpindale/WizardLM-2-8x22B/956b8589-a048-43be-9cfd-05658d3c57ca.json
delete mode 100644 data/hfopenllm_v2/alpindale/magnum-72b-v1/36f597b4-8f53-4b40-9c0e-c9284743e456.json
delete mode 100644 data/hfopenllm_v2/altomek/YiSM-34B-0rn/7b67e526-7588-4c62-9293-55e77851c4c7.json
delete mode 100644 data/hfopenllm_v2/amazon/MegaBeam-Mistral-7B-300k/8bc96d6d-0cd7-49c4-8112-7d8fb1c45199.json
delete mode 100644 data/hfopenllm_v2/amd/AMD-Llama-135m/6751a200-0bd9-498e-a991-ebe22375633d.json
delete mode 100644 data/hfopenllm_v2/amd/AMD-Llama-135m/f41442e3-5aa7-4ca4-9e61-a5e13965a3e4.json
delete mode 100644 data/hfopenllm_v2/anakin87/gemma-2b-orpo/b105b62a-ce77-4387-b679-1adf2782b2f4.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v1-72b/72180fd7-bf34-4758-b02f-7d11859700c7.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v2-12b/ac5aaa9c-79ab-4082-b8c5-084fba3e122a.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v2-72b/2d266d7f-8edd-40fd-adfc-597a7742167b.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v2.5-12b-kto/484ccbf2-87e2-423f-9de4-a4bd54291b54.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v3-27b-kto/4de79504-f9e8-4235-9aad-d38f0799e081.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v3-34b/b4bde9d8-f50c-448c-ada4-5bc05f302c04.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v3-9b-chatml/5da3240b-b5e3-4333-ba61-925343b56043.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v3-9b-customgemma2/d6727b7d-cdf3-48d5-8e30-484e86ad60b6.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v4-12b/15b86bbf-8d3b-474b-98f0-abb3972a7271.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v4-22b/c0b339f6-4a46-46eb-b2d0-945176afe676.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v4-27b/79367289-6245-4bf0-99e9-42bc3ff7649c.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v4-9b/c3ec5505-1086-446a-9739-523810e93d13.json
delete mode 100644 data/hfopenllm_v2/apple/DCLM-7B/c6c5e462-d373-4536-afc3-b740fb7e300f.json
delete mode 100644 data/hfopenllm_v2/appvoid/arco-2-instruct/b7537abe-8177-4206-999f-5bb7e95c72c8.json
delete mode 100644 data/hfopenllm_v2/appvoid/arco-2/eb2f6159-e37e-46db-9419-6a66cb7e539e.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Arcee-Blitz/0b2d0a06-2907-4258-be33-1591e18ac6a2.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Arcee-Maestro-7B-Preview/0284d867-45c4-4fe4-883c-8e3ea169d66c.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Arcee-Nova/1a2da513-104e-4074-b3b7-601ab11bf6d8.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Arcee-Spark/189db16b-5e78-439f-9f79-6eec979c3a79.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Arcee-Spark/d751f1c5-5505-4c12-8d51-091538b49949.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Llama-3.1-SuperNova-Lite/b6f9144f-57a0-4c18-9e52-ffccf2d8ca9c.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Llama-Spark/67dc7fb2-1455-4f60-9dcb-59a8197741d7.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/SuperNova-Medius/7f4ab590-29fa-473a-b617-00135dd1d6ee.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Virtuoso-Lite/d67db62e-e21d-43c8-8b4c-bfa353e47636.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Virtuoso-Small-v2/85abff46-8ae5-4a75-9522-721793224363.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Virtuoso-Small/1736bbd8-4457-4d55-8c0b-0ae6e001ee62.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/raspberry-3B/4777e427-8d17-4e06-8cbf-0883c95bbfd8.json
delete mode 100644 data/hfopenllm_v2/argilla-warehouse/Llama-3.1-8B-MagPie-Ultra/4df0b890-d4c5-408e-8994-88f7383e9235.json
delete mode 100644 data/hfopenllm_v2/argilla/notus-7b-v1/76a5a59d-f5fd-4fb0-849e-7db7772b555a.json
delete mode 100644 data/hfopenllm_v2/argilla/notux-8x7b-v1/6c8399d0-01ce-45cb-a20f-a49e4e760a1e.json
delete mode 100644 data/hfopenllm_v2/arisin/orca-platypus-13B-slerp/92c2c5ee-dfa2-4db3-8401-887d02cc21dd.json
delete mode 100644 data/hfopenllm_v2/arshiaafshani/Arsh-V1/b40ef568-f277-4d5c-87cd-53feaa71598b.json
delete mode 100644 data/hfopenllm_v2/asharsha30/LLAMA_Harsha_8_B_ORDP_10k/893d5149-c535-41c7-8a1a-26bb6b33e407.json
delete mode 100644 data/hfopenllm_v2/ashercn97/a1-v0.0.1/0b649ed5-5af4-4910-b853-2408e3b58f1f.json
delete mode 100644 data/hfopenllm_v2/ashercn97/a1-v002/5c8edeba-5c65-4168-b67e-02143acbcafb.json
delete mode 100644 data/hfopenllm_v2/assskelad/smollm2-360M-sft_SmallThoughts/67e657ef-d602-4f58-b898-874a22f4a009.json
delete mode 100644 data/hfopenllm_v2/athirdpath/Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit/53d2bf07-689a-4e69-a534-b288313c8481.json
delete mode 100644 data/hfopenllm_v2/automerger/YamshadowExperiment28-7B/34d6a184-d4d5-4609-8305-c0e2ee1c585b.json
delete mode 100644 data/hfopenllm_v2/avemio/GRAG-NEMO-12B-ORPO-HESSIAN-AI/39b627ab-3e64-42f7-a88d-abe5764fcf4d.json
delete mode 100644 data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-2/d8467b15-8a03-4cde-9fc5-5c08bdabb6c6.json
delete mode 100644 data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-4/85bc5976-0d40-4416-bbf8-9b1dbf372343.json
delete mode 100644 data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-3-over-8/8c7e8e64-672e-4c7e-a808-a49f1792d3a8.json
delete mode 100644 data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-5-over-16/de8651eb-16d1-46ee-a1df-b8c72caaf205.json
delete mode 100644 data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-7-over-16/6a744db8-814f-4e8e-b6e5-0d096267dfa5.json
delete mode 100644 data/hfopenllm_v2/aws-prototyping/MegaBeam-Mistral-7B-512k/028b7c37-770e-4356-a7c6-0cc74650d5fd.json
delete mode 100644 data/hfopenllm_v2/axolotl-ai-co/romulus-mistral-nemo-12b-simpo/3b399c64-922a-48ba-9a25-862102749647.json
delete mode 100644 data/hfopenllm_v2/baconnier/Napoleon_24B_V0.0/d5e46a11-3e81-457d-9d26-9fd17f96f076.json
delete mode 100644 data/hfopenllm_v2/baconnier/Napoleon_24B_V0.2/b3abfbc1-911a-43b7-a338-efb25f746f9d.json
delete mode 100644 data/hfopenllm_v2/baebee/7B-Cetacea/6b471ee0-9444-45ff-92cf-da624aa59bf6.json
delete mode 100644 data/hfopenllm_v2/baebee/mergekit-model_stock-nzjnheg/b56bd924-0a63-4ca2-8f2f-97b581e47a36.json
delete mode 100644 data/hfopenllm_v2/baebee/mergekit-ties-fnjenli/bfe9098d-7207-4f8c-9a3f-549a29303b5f.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.1v/7856172d-ec3e-4e71-befe-54952478e330.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.2v/a68aada5-61bd-4a4c-a8e1-b9a2ace349df.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.3v/9d19c44f-4912-4c95-ab3f-2dddb055d932.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.4v/6cef3550-27d7-4073-b4bb-0f19a2c5f553.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.5v/08ab8f6a-9aaf-4ab4-ada3-eb4a75f46995.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.6v/622f9379-6a30-43ba-a7a8-fbd08c484fa5.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_IV_V/24f728e6-de5e-44cc-8b6d-51e0065c1475.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_ex_V/c3b2bf18-d355-40fc-a862-376c1b988305.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_Neo/79474be5-2587-4087-a2cc-1337e3b696dd.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B/22ff2700-70c0-459e-96a2-0ce1710947bc.json
delete mode 100644 data/hfopenllm_v2/bamec66557/Mistral-Nemo-VICIOUS_MESH-12B-2407/7d3a47a3-83d3-4f51-ab72-6a2fa5b5ef80.json
delete mode 100644 data/hfopenllm_v2/bamec66557/NameLess-12B-prob/69dc0f8e-16d7-4907-9741-484eafa62b8c.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.1v/e516abc1-9c3c-4921-a385-e2533d45fed3.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.X.ver/8baa5832-cc07-4a31-a815-0e8151426ea6.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-ALPHA/509fbca4-f405-4c27-85a9-1eea59025070.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-BETA/6f45ed56-6bec-4439-9adb-e79fcd74667c.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DELTA/512ff924-c1d3-4d75-a468-2bcdcda25cf6.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DIGAMMA/86b561ae-c4d3-4293-a884-bcab26df026d.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-EPSILON/516d1972-9731-4234-a4b3-b96423ebba5c.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-GAMMA/274f6e02-c81f-4f2e-9747-e5de5cee1933.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-NEMO/61638b55-296b-40fd-a39f-cc2276d9f94a.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-OMEGA/11c1b6fe-4815-415b-a4a8-d14073df6ee1.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-UNION/88e2cb24-288e-4f37-8753-f0daa825051c.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B/8a1a6c44-17fd-402e-a22e-e795a1f612e3.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B_Razor/1121af0b-61fe-424a-bc66-3164bcb1d833.json
delete mode 100644 data/hfopenllm_v2/bamec66557/mergekit-model_stock-zdaysvi/35300d67-7ee1-4874-b351-87f46267cec9.json
delete mode 100644 data/hfopenllm_v2/bamec66557/mergekit-ties-sinbkow/6180b7b3-4b21-42aa-a62d-084a91568b43.json
delete mode 100644 data/hfopenllm_v2/belztjti/dffghgjh/7414d344-0e67-424a-9e16-00de0487ce02.json
delete mode 100644 data/hfopenllm_v2/belztjti/dtfgv/f5fcd407-080c-4cb7-a299-7a7f919c734d.json
delete mode 100644 data/hfopenllm_v2/benhaotang/phi4-qwq-sky-t1/efe03731-6021-4dcf-b7fe-24cbf2d60fac.json
delete mode 100644 data/hfopenllm_v2/beomi/gemma-mling-7b/6ffed624-cc22-4b62-a447-3c02b0e43ded.json
delete mode 100644 data/hfopenllm_v2/beowolx/CodeNinja-1.0-OpenChat-7B/ed867fa8-be8a-49b0-8c94-38085808b58b.json
delete mode 100644 data/hfopenllm_v2/berkeley-nest/Starling-LM-7B-alpha/c8b9a56b-0933-4085-8d5f-a1d8294699db.json
delete mode 100644 data/hfopenllm_v2/bfuzzy1/Gunny/9b178661-ed9a-427d-b93c-b905b8089ad8.json
delete mode 100644 data/hfopenllm_v2/bfuzzy1/acheron-c/69588e07-7559-49c2-9423-19fd143e42f7.json
delete mode 100644 data/hfopenllm_v2/bfuzzy1/acheron-d/317589da-d673-4f90-93e9-59983f2ef54b.json
delete mode 100644 data/hfopenllm_v2/bfuzzy1/acheron-m/efab322e-ea15-4fe7-9bfc-15246003e59c.json
delete mode 100644 data/hfopenllm_v2/bfuzzy1/acheron-m1a-llama/b1eac68e-b292-414b-9594-c921f8e10818.json
delete mode 100644 data/hfopenllm_v2/bfuzzy1/acheron/b7d08c65-8219-4067-9504-99e438a86038.json
delete mode 100644 data/hfopenllm_v2/bfuzzy1/llambses-1/e9c5b479-0dce-4de3-84d6-90c7515337f1.json
delete mode 100644 data/hfopenllm_v2/bhuvneshsaini/merged_model/3c766465-29db-4b3d-b42f-a3222b38a096.json
delete mode 100644 data/hfopenllm_v2/bigcode/starcoder2-15b/e6c85677-61ed-475b-85a5-48b91ec76bcf.json
delete mode 100644 data/hfopenllm_v2/bigcode/starcoder2-3b/7b68fa5e-dbbf-4542-8767-6874aabf8f40.json
delete mode 100644 data/hfopenllm_v2/bigcode/starcoder2-7b/c103b7f4-a432-42d6-86ef-cb369e0c16ff.json
delete mode 100644 data/hfopenllm_v2/bigscience/bloom-1b1/643dda41-37d0-4c1e-b856-58b774612886.json
delete mode 100644 data/hfopenllm_v2/bigscience/bloom-1b7/ba2f284b-d7c6-4748-a8dc-4f80caa30c6c.json
delete mode 100644 data/hfopenllm_v2/bigscience/bloom-3b/16e30aa0-736a-4ef8-8ba6-78285b84546f.json
delete mode 100644 data/hfopenllm_v2/bigscience/bloom-560m/73eb729d-adfd-4dee-9bde-04a31f5528f6.json
delete mode 100644 data/hfopenllm_v2/bigscience/bloom-7b1/0daad2ae-92d0-4522-a067-20332f72c96f.json
delete mode 100644 data/hfopenllm_v2/bluuwhale/L3-SthenoMaid-8B-V1/a3e3849f-a289-4132-b4a8-f67d67ad46a1.json
delete mode 100644 data/hfopenllm_v2/bond005/meno-tiny-0.1/59a9ed26-a67a-4e76-8858-520400c90766.json
delete mode 100644 data/hfopenllm_v2/bosonai/Higgs-Llama-3-70B/6c5c61b4-8037-4b28-8616-1aefa7963eb8.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Blunt/e9f9b836-fbdf-4996-9b35-2c8145a7f01b.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Reflective/5b3dae43-5d5c-4d19-bd47-5c0f68ecbb81.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-ABUB-ST/d5b31b1f-ace0-457f-bf8a-9041398b8344.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective/b34702cf-ffb8-4e75-9c9b-f5c52623d4c8.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt/c701f1fd-166d-416b-8f78-edf17f2fecd4.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective/4217b403-e924-4f67-9b0e-ad1d4ed293a1.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored/03816e41-5fb8-4815-ab9c-4108ab19a3bc.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt/a763b10e-350a-4342-ade3-b782437ca3e2.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Reflective/9e806fd2-edbf-40e2-a008-834cee537bb6.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B/fbcf861c-62db-4079-bba6-becd4e231216.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Blunt/22b591c0-3386-4bd5-860c-20c0c6001986.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored/dfb9a9c4-114e-4188-9940-4d6df7e4815f.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Reflective/38fd5f4d-0f3c-4dc2-b250-a9ee7090aac2.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B/e53cbc94-fc9f-4d53-ae28-26bc8c2caef8.json
delete mode 100644 data/hfopenllm_v2/braindao/Qwen2.5-14B-Instruct/2165e69a-c50c-419a-932e-909f53b73b71.json
delete mode 100644 data/hfopenllm_v2/braindao/Qwen2.5-14B/46430a07-15c8-4727-9102-2f471d4f1d3c.json
delete mode 100644 data/hfopenllm_v2/braindao/iq-code-evmind-0.5b/3c7f540a-c850-4e20-ad93-60e021d17133.json
delete mode 100644 data/hfopenllm_v2/brgx53/3Bgeneral-ECE-PRYMMAL-Martial/c3ab4f38-6f7b-4589-ae4f-21ace05b8c44.json
delete mode 100644 data/hfopenllm_v2/brgx53/3Bgeneralv2-ECE-PRYMMAL-Martial/2708c0d6-03e7-4a17-b6b9-e16f3ddcf5bb.json
delete mode 100644 data/hfopenllm_v2/brgx53/3Blareneg-ECE-PRYMMAL-Martial/6427a5ef-8508-430d-970d-054fc485e754.json
delete mode 100644 data/hfopenllm_v2/brgx53/3Blarenegv2-ECE-PRYMMAL-Martial/08984ad9-1e9b-4916-b214-af26dadfcc0b.json
delete mode 100644 data/hfopenllm_v2/brgx53/Barracuda-PRYMMAL-ECE-TW3/1dbb5d03-fdfa-4059-9d50-d037ada6b1ac.json
delete mode 100644 data/hfopenllm_v2/brgx53/LaConfiance-PRYMMAL-ECE-TW3/6bf42faa-c3e9-4069-bf93-ffd626062f0f.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Best-Mix-Llama-3.1-8B/9feccbdc-18eb-4077-b50b-986db0047fc8.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Blabbertron-1.0/a074c33f-782a-409c-987b-7dd62c65ccc7.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Blabbertron-1.1/2f2c0dea-dcd4-4e54-9f40-9fda4b91bd40.json
delete mode 100644 data/hfopenllm_v2/bunnycore/CyberCore-Qwen-2.1-7B/84481fee-3727-427b-912a-30e2744df28a.json
delete mode 100644 data/hfopenllm_v2/bunnycore/DeepQwen-3B-LCoT-SCE/aaa801dc-1a47-4009-9ad4-7129a8d4e651.json
delete mode 100644 data/hfopenllm_v2/bunnycore/DeepSeek-R1-Distill-Qwen-7B-RRP-Ex/3ac92cbf-c85b-4e00-9ef9-4322f961591a.json
delete mode 100644 data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v1/162b511b-4684-4595-9261-a33f3a4117f9.json
delete mode 100644 data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v2/20d5d59a-028d-4e34-9414-d9edaf2e59b8.json
delete mode 100644 data/hfopenllm_v2/bunnycore/FuseCyberMix-Qwen-2.5-7B-Instruct/a21b53fb-783b-440b-9f3d-d8ada3bd18ea.json
delete mode 100644 data/hfopenllm_v2/bunnycore/FuseQwQen-7B/0d2ab1e8-a2d7-45cf-b123-67bcab2d9dff.json
delete mode 100644 data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.1/6b4a37c8-c7e6-4156-9d6d-8cba51b74d82.json
delete mode 100644 data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.2/78582fec-2f69-4b37-8497-12ceb097b44b.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Gemma-2-2B-Smart/949bf65e-c2ae-4701-82f0-39d0c62a0e87.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Gemma2-9B-TitanFusion/8812151c-4301-4131-a414-d64d025e476e.json
delete mode 100644 data/hfopenllm_v2/bunnycore/HyperLlama-3.1-8B/2db1542f-a8da-4fb8-91a5-6dd1a942b55e.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-Mix/9feeffb2-3763-4e43-933e-89100b76f7fa.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-v3/721102b5-ed5e-4631-8600-a6adfff0c784.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-All-Mix/18c185f7-5ca4-46ff-81c2-6c538f096409.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Bespoke-Thought/7ab5911c-e229-43e5-a798-095287d0a597.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Booval/f800c4e5-e918-45bb-8a12-3ca2a64c6b23.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/5fcf41bc-30dc-46a7-9cf2-4ce2c7a5850c.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/d4b20ef4-734e-40a7-818e-f77e170d7437.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Della/e0996c96-c9e5-4d39-8e6d-1455ef1f9544.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Long-Think/3ad2b31e-ce2a-4cb4-9b85-79cdebd5d364.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Mix-Skill/9aff874c-1953-4b97-9bff-9e6120b0bfa7.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlus/45ae7f45-8c36-46c6-989d-bc672cdf8eff.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlusPlus/7d36e44e-a329-4b96-a891-365ad900f718.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RP-DeepThink/a8c26325-1eec-43a6-a8ad-3bcb2e378924.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RRStock/bde1a879-6852-42ce-9217-f427af85a46a.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ToxicKod/dd7a0377-f4d6-4390-b9f2-bf50b05ec0f7.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3b-RP-Toxic-Fuse/12cbf241-d6d4-4d25-ad3d-13a42d7adc74.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Maestro-S1k-7B-Sce/1f66fd7c-40ee-4249-8963-5c7bb93a3eaf.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-3.5-mini-TitanFusion-0.1/7076406b-7e0a-49c7-8150-2e6a243aa23b.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v2/96c3fd80-a601-4629-a1ab-bf7f366a909a.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v3/1302c9a5-d35c-400c-b9f3-d990243e5d59.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v4/c7f48bbf-6583-4ddd-ae4d-671c43218dae.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock/5f07e092-2eb0-44c2-b2ce-5f1b31a9ea99.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-RP-v0/15701682-97ce-46cf-8010-a6bdeaf8c7aa.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-RR-Shoup/c6eecf0b-fa16-484a-8eeb-d196203b3c3e.json
data/hfopenllm_v2/bunnycore/Phi-4-RStock-v0.1/4337b1c1-cc00-4a15-8148-e8d0739561b9.json delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-ReasoningRP/1151ee14-8fe9-4f97-808d-8103b353c2ec.json delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Sce-exp-v0.1/a2c18179-aca3-422c-b9f5-8345109cea13.json delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Stock-Ex/07495d34-1505-45a9-bb48-887af0da8a0c.json delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Stock-RP/567baf6d-99f9-46a5-8c40-c6899986f1ff.json delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Trim-Exp1/a337df3a-28ff-46c9-adae-4bc029937101.json delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-Seek-4-Sce-V1/b201a849-44e9-4598-918b-ffa27c894ee9.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qandora-2.5-7B-Creative/dd87ebf3-3088-43b1-851c-a97d12a68ea8.json delete mode 100644 data/hfopenllm_v2/bunnycore/QandoraExp-7B-Persona/1b3ef805-8b0c-44bf-b048-773a0dd94d0d.json delete mode 100644 data/hfopenllm_v2/bunnycore/QandoraExp-7B-v2/220cb478-58c0-4028-b51a-ec5fe1050746.json delete mode 100644 data/hfopenllm_v2/bunnycore/QandoraExp-7B/17cb8ab1-e7ba-4daf-95d4-2cdbd2777434.json delete mode 100644 data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT-R1/2b55023b-b8bc-42a2-aca8-dcaf39890232.json delete mode 100644 data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT/31736569-5992-4b1d-9d66-27a6c1620506.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Sky-T1/630b37b5-351c-403c-ac76-ccb68ffc5d53.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v1/69cdef01-30dc-4f75-97fa-9daeebcec72f.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v4/9aa1acb0-c791-4dea-aa1e-c912cea69466.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v5/0c1d66f3-8fd7-47f2-8538-a1aa8985aebf.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Exp-Sce/2872dcd9-421b-4346-812c-b27bb32c6e86.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-R1-Stock/2f3e2fc0-f1e0-43cb-8a8c-6aadcc538646.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Stock-Deep-Bespoke/d0a76497-84b0-45b9-b748-04ffe9bc13a3.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7b-S1k/185b6560-6790-417f-aeba-f7405fee808a.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-1.5B-Model-Stock/30a8074e-df03-4866-9b8d-a5a7eece3c71.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v2/ac8874ae-d6d6-45d3-aabc-06a3852f68d0.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.1/bc98b048-18d4-438e-80c4-0cd851798da5.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.2/c88c011f-0a24-4e78-a104-035d25af2430.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v4.1/f9e3c31c-02c0-4f5e-ad4f-3be0801a0f41.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock/5484405a-2ec8-4515-af75-76a5dd348d3d.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Mix/7dc117b9-c2a2-44c1-8471-f3bc8a116e3e.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker-V2/e2d314dd-b5b3-49b5-8e64-1e3464f4b963.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker/7ecb453b-1ba7-44ec-abfd-1f8be4c817fd.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-CyberRombos/d0a70e95-fc72-41c6-ac42-09b8f379b566.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Fuse-Exp/e2ef8ea6-b464-445e-81df-ef0779c1d0d4.json delete mode 100644 
data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Fusion/f3d7cca2-141c-4b84-abc4-396ad2d59e3c.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Merge-Stock-v0.1/e3f48d7a-c8a3-4e75-99d6-7f2946696b12.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-Sce-V0.3/3feb9449-49a2-427f-a317-c21e6d1ca66c.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-V0.1/6359e37e-0405-436b-903c-8f0e740dd6c7.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Stock/f5daed76-f6e5-4a7d-84d7-80537a046b83.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Task/03af2b1d-989f-4afc-ab13-8793093b9c50.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M-Thinker/5db7ec54-7feb-4c11-b2e0-042226ba1f94.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M/f1f5615d-8a78-43c9-b5c6-edc180252381.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-ID/9c89bf8f-4b8a-4c01-8685-fafc687c673e.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Sky-R1-Mini/58b69c0f-826d-414f-915e-dd0b78d9298c.json delete mode 100644 data/hfopenllm_v2/bunnycore/QwenMosaic-7B/101ea548-2ffe-4f47-b3b5-5fbe9a3854b4.json delete mode 100644 data/hfopenllm_v2/bunnycore/Smol-Llama-3.2-3B/259c4798-ff03-4f58-8fb4-59150710212b.json delete mode 100644 data/hfopenllm_v2/bunnycore/SmolLM2-1.7-Persona/f731caa1-f777-494a-8490-da0c815f0708.json delete mode 100644 data/hfopenllm_v2/bunnycore/SmolLM2-1.7B-roleplay-lora/d4d25d38-b21a-490e-9ca9-556504ec00ea.json delete mode 100644 data/hfopenllm_v2/bunnycore/Tulu-3.1-8B-SuperNova/75bb85a3-40bb-4630-95a0-50e40b008412.json delete mode 100644 data/hfopenllm_v2/byroneverson/Mistral-Small-Instruct-2409-abliterated/bb44f3ef-eefa-48ef-a257-2eb345c89a00.json delete mode 100644 data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-16K-abliterated/2dcf1771-3dbe-43ad-974c-54e2e2860bcc.json delete mode 100644 data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-abliterated/caa0c8df-5488-4bf9-a5b8-0fff831e6732.json delete mode 100644 data/hfopenllm_v2/c10x/Q-Pluse/c6f8e581-e849-4e28-b3a6-1838ee522770.json delete mode 100644 data/hfopenllm_v2/c10x/longthinker/f0c361a1-a3ac-4415-ab5d-069bdf27e7a3.json delete mode 100644 data/hfopenllm_v2/carsenk/flippa-v6/44129be7-f73d-4580-8375-e8ef324e73a8.json delete mode 100644 data/hfopenllm_v2/carsenk/phi3.5_mini_exp_825_uncensored/2925ecde-a9a5-4369-b391-d23a8605d35c.json delete mode 100644 data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1-evol-1/8409e464-fd16-4b41-b533-2f6cae4fe894.json delete mode 100644 data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1/86f6c6eb-8b08-4e6c-a1bc-0d941a00f10b.json delete mode 100644 data/hfopenllm_v2/cckm/tinymistral_950m/aa2e6df7-a0b0-42f7-8057-e2763fc34834.json delete mode 100644 data/hfopenllm_v2/cgato/TheSalt-L3-8b-v0.3.2/2bf9a06e-f3bf-4b55-804b-e553a722e0de.json delete mode 100644 data/hfopenllm_v2/chargoddard/prometheus-2-llama-3-8b/b380a675-39ea-4950-ad0a-d9771f09ddde.json delete mode 100644 data/hfopenllm_v2/chujiezheng/Llama-3-Instruct-8B-SimPO-ExPO/482358eb-7d3b-4de0-b5d9-451308f104e2.json delete mode 100644 data/hfopenllm_v2/chujiezheng/Mistral7B-PairRM-SPPO-ExPO/ef04a83d-7b89-43ec-ba33-30e1006422dc.json delete mode 100644 data/hfopenllm_v2/cjvt/GaMS-1B/7b64cf2e-c7c6-4b48-8e51-ea2aa0914145.json delete mode 100644 data/hfopenllm_v2/cloudyu/Llama-3-70Bx2-MOE/52c8e3f4-1063-4d9c-80d9-fdd0a72fc98e.json delete mode 100644 
data/hfopenllm_v2/cloudyu/Llama-3.2-3Bx4/1f4a827d-31cd-42e6-871d-7c0cad010f58.json delete mode 100644 data/hfopenllm_v2/cloudyu/Mixtral_11Bx2_MoE_19B/56d6d99c-fba1-42e7-aad4-631370b44da3.json delete mode 100644 data/hfopenllm_v2/cloudyu/Mixtral_34Bx2_MoE_60B/006a0ac7-d6c3-42c1-b0cc-6a0bfe74f884.json delete mode 100644 data/hfopenllm_v2/cloudyu/Mixtral_7Bx2_MoE/33a82686-6202-4a4d-ba34-bd4537105e5f.json delete mode 100644 data/hfopenllm_v2/cloudyu/S1-Llama-3.2-3Bx4-MoE/38d45554-44bd-4b40-b7c9-c0b7ba44b862.json delete mode 100644 data/hfopenllm_v2/cloudyu/Yi-34Bx2-MoE-60B-DPO/37d7e3ab-db9c-4ad7-81d1-933c030a6250.json delete mode 100644 data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-ipo/9cc49b3c-4e51-4f67-92ea-4ac8a3cbed43.json delete mode 100644 data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid/b6bd8515-4c95-40ce-b2d5-af8873d261ab.json delete mode 100644 data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-etpc/d102e75d-3e20-482b-a243-bae3ec44e2bb.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.1-8B/68920da1-af71-4ccd-88b9-554e3c72c4dc.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.2-1B/c0eb144f-c726-4a80-bce9-384fb7a641a7.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Qwen2.5-0.5B/0b26f82d-36f6-4fd0-a0fd-05e4a1368a6e.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-R1-Mistral-24B/8fe4360a-0924-4386-b4cd-89069f7ff55f.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9-llama3-8b/eeeb082b-7112-4a08-a87a-b2c9ae37efff.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-llama-3-70b/b8f933e9-867f-4934-9648-371d1e632116.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-34b/8d225023-4b7e-48cd-ae67-6d00b541f17d.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-9b/ee3b45e7-a5d6-4fa8-8abd-f6a77d5a6d5b.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/177ef040-da5c-4a65-adac-efdc555bd110.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/e9dc8337-eb35-4eb9-bca7-30ec1cd44092.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium/f4549a39-0b28-4e06-998a-774f5f02cfba.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-72b/a79af78a-adab-406f-995a-adb3893e1510.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-7b/4e8e457a-85eb-4afb-a9fe-8f8ce6eaf4d7.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-Yi-1.5-34B-32k/eeb3a10a-d584-414a-90de-e018c47615c2.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-7B-32k/e83dadb0-5092-48b8-b408-e6bb1ac8a0ba.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-nemo-12b/cebc7767-fbc9-45a2-808b-51e1a4f0f35c.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-gemma2-2b/b64b6416-b18b-47cc-a516-c613cd670b37.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-llama3.1-8b/64e96d56-72a9-413f-8903-45821b98f71e.json delete mode 100644 data/hfopenllm_v2/collaiborateorg/Collaiborator-MEDLLM-Llama-3-8B-v2/a3f44cfd-d1fc-4a3c-aa5b-a0f37fc4a192.json delete mode 100644 
data/hfopenllm_v2/cpayne1303/cp2024-instruct/79314f48-d92b-4992-b3c6-d31278c0867a.json delete mode 100644 data/hfopenllm_v2/cpayne1303/cp2024/5a007612-c8e7-4f6b-baa9-a21af7e908c6.json delete mode 100644 data/hfopenllm_v2/cpayne1303/llama-43m-beta/fdefdd3e-2d83-4430-bd95-e16a1935dff1.json delete mode 100644 data/hfopenllm_v2/cpayne1303/llama-43m-beta/ffdd45bf-3409-4b92-909a-25a32ba27f82.json delete mode 100644 data/hfopenllm_v2/cpayne1303/smallcp2024/a78ab8ac-2c2e-405a-95ee-0d1d27cf533b.json delete mode 100644 data/hfopenllm_v2/crestf411/MN-Slush/d9d49bf7-f6f0-4c25-9182-d815454940e3.json delete mode 100644 data/hfopenllm_v2/cstr/llama3.1-8b-spaetzle-v90/deb48e93-0378-482f-8a5d-7ec350497e0b.json delete mode 100644 data/hfopenllm_v2/cyberagent/calm3-22b-chat/302a9a47-8603-42d9-85fb-64c60e7c6f44.json delete mode 100644 data/hfopenllm_v2/darkc0de/BuddyGlassNeverSleeps/28d52801-3998-421f-a37a-2b7b677d0eaa.json delete mode 100644 data/hfopenllm_v2/darkc0de/BuddyGlassUncensored2025.2/32b4e23b-9430-45a8-bfa2-eea2e89792c4.json delete mode 100644 data/hfopenllm_v2/darkc0de/BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp/0336e168-e313-44cb-a030-42e6d20e92df.json delete mode 100644 data/hfopenllm_v2/databricks/dbrx-base/11bd8b5b-2ea4-4ec5-8fe6-654aedb40fc9.json delete mode 100644 data/hfopenllm_v2/databricks/dbrx-instruct/6d97749c-3bfa-4c32-b581-a5e2b73303f3.json delete mode 100644 data/hfopenllm_v2/databricks/dolly-v1-6b/ec58907d-b67c-467e-a3dd-b9f9c10138f0.json delete mode 100644 data/hfopenllm_v2/databricks/dolly-v2-12b/a7f09a3d-025c-48fa-9358-863b9ae382b1.json delete mode 100644 data/hfopenllm_v2/databricks/dolly-v2-3b/bf2be2d5-58de-4550-b733-a5910bded48d.json delete mode 100644 data/hfopenllm_v2/databricks/dolly-v2-7b/52b32c1f-6189-4850-b3f4-de442eb2ccb5.json delete mode 100644 data/hfopenllm_v2/davidkim205/Rhea-72b-v0.5/87b44160-c3dd-452d-8c15-c4f758f8db7b.json delete mode 100644 data/hfopenllm_v2/davidkim205/nox-solar-10.7b-v4/3e6814d3-54ea-493f-a9fc-85ae9eed1b05.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/35b7ff42-3825-4240-97bf-f8af7e8c23ff.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/c108173e-1582-4c99-9291-46986d7ba1cf.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/6feb08b0-1c67-4fe2-a001-0b3b84529687.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/d4ab3df2-109a-4eec-9742-dc3bb79d5a58.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/53ec995e-bcfd-4a72-bd9a-45d14da3f219.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/299a0397-89c7-4329-9599-9fc29a52db87.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/deepseek-llm-67b-chat/41adbc32-6cdf-49ba-980c-6eb6f722b40b.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-base/4236ece5-f2b2-44e7-9503-9731bff20155.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-chat/b33d672c-4a96-4093-bc13-25c42303b918.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-base/2b4f42fc-8b25-481c-98f7-911c52fdd242.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-chat/634b7a64-2bd3-48b8-b2f4-a93189801850.json delete mode 100644 data/hfopenllm_v2/dfurman/CalmeRys-78B-Orpo-v0.1/72a4bcc3-9dfc-4268-be4e-cda5837a3da2.json delete mode 100644 data/hfopenllm_v2/dfurman/Llama-3-70B-Orpo-v0.1/78fa85f6-baff-4d95-ad3a-a0663f51b0a0.json delete mode 100644 
data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/359231a5-6eb9-4f73-a6f1-d7fd7f35c7ed.json delete mode 100644 data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/79b81e37-f75e-4b18-b145-73c42625ced5.json delete mode 100644 data/hfopenllm_v2/dfurman/Qwen2-72B-Orpo-v0.1/2d99af7a-f67c-4e74-9ba2-f1401dfdf9fb.json delete mode 100644 data/hfopenllm_v2/dicta-il/dictalm2.0-instruct/315fa815-fab0-47c9-8185-00bc597c0176.json delete mode 100644 data/hfopenllm_v2/dicta-il/dictalm2.0/0c1686db-b396-4ecf-86f1-e4e092491acd.json delete mode 100644 data/hfopenllm_v2/distilbert/distilgpt2/57455fbc-b5a9-4a3b-9a30-7da0593fd778.json delete mode 100644 data/hfopenllm_v2/divyanshukunwar/SASTRI_1_9B/a8f9d0e6-5a1a-4d09-ac78-47fd586384df.json delete mode 100644 data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B-ties-w-base/9d0d4eee-0b87-485c-843f-e32d08aa601b.json delete mode 100644 data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B/e47c83ff-9a16-488b-8ccf-4a2fad2b14fc.json delete mode 100644 data/hfopenllm_v2/djuna/G2-BigGSHT-27B-2/8c7e25df-884d-4940-8185-4c1b82fac8c5.json delete mode 100644 data/hfopenllm_v2/djuna/G2-GSHT/83611d50-01d0-4642-a104-daf77f1a0fe8.json delete mode 100644 data/hfopenllm_v2/djuna/Gemma-2-gemmama-9b/5cbdafba-6071-4da1-8b19-3de612e9ff18.json delete mode 100644 data/hfopenllm_v2/djuna/L3.1-ForStHS/1c934cba-c94a-4aad-9645-84658e0b5588.json delete mode 100644 data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-1.5-calc/7aad3f6b-89d9-4c9e-9339-cf4111fc37c6.json delete mode 100644 data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-calc/38d4a8ca-4273-4e6a-8a39-3b5ff20ec461.json delete mode 100644 data/hfopenllm_v2/djuna/L3.1-Purosani-2-8B/3d65fbc2-bf91-479c-a687-e9ef702794fb.json delete mode 100644 data/hfopenllm_v2/djuna/L3.1-Suze-Vume-calc/650cdbbb-e066-4581-8d61-77aa6a4c402c.json delete mode 100644 data/hfopenllm_v2/djuna/MN-Chinofun-12B-2/05d566c5-1810-483c-8ce0-84635b9457dc.json delete mode 100644 data/hfopenllm_v2/djuna/MN-Chinofun-12B-3/37e3456a-92ff-4122-a697-ffbdc1c79555.json delete mode 100644 data/hfopenllm_v2/djuna/MN-Chinofun-12B-4/70c908d4-f1bf-4553-9bf7-95eb593b4853.json delete mode 100644 data/hfopenllm_v2/djuna/MN-Chinofun/2ccc9c20-5414-4286-abcd-ad2b20f8652d.json delete mode 100644 data/hfopenllm_v2/djuna/Q2.5-Partron-7B/50f4560a-e172-42b9-b552-437aff158a38.json delete mode 100644 data/hfopenllm_v2/djuna/Q2.5-Veltha-14B-0.5/c6a3abac-8a34-4725-915b-c27c3d0bc484.json delete mode 100644 data/hfopenllm_v2/djuna/Q2.5-Veltha-14B/a8ed68ea-6463-4ff9-9dcd-034080272dec.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-Llama-3-8B-Instruct/5799ce8b-c00d-49f6-96dc-f7dd057a268c.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-Llama-3-Huge-Instruct/0d261023-3e35-4160-98ca-241bbaee927e.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-Llama-3-Large-Instruct/f0454d3b-18b4-488a-94dd-fb24729996c7.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-Llama-3.1-8B-Instruct/6bafa7a7-3a2a-4141-9564-a762d1cdb1d0.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-Llama3.1-Large/37f20f86-40ba-4f63-b29d-efff6cb0e09b.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-Medium/bf0e7ce4-09e9-4879-993a-eb50b2a421d7.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-Phi-3-medium-4k-instruct/bcbc29f7-ea03-4dbe-a83e-d4940b2c6bea.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-XLarge-base/cbea8d66-0370-4998-8e3a-06fef0a60f0c.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-XLarge/ca48b670-b82e-46cc-beb9-2fd0f11d3585.json delete mode 100644 
data/hfopenllm_v2/dnhkng/RYS-XLarge2/d37f99f7-f9c3-48b6-84d3-7da5d77f5030.json delete mode 100644 data/hfopenllm_v2/dreamgen/WizardLM-2-7B/503c8a24-4ced-4dca-b9df-5733ce89c2ca.json delete mode 100644 data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v1/5c5283a0-819f-4112-bb90-5277423d9c00.json delete mode 100644 data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v2/b636bc82-1625-49b1-beec-cadaf4e1b1a9.json delete mode 100644 data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v3/00f481c1-0ef0-40bd-bd95-81dc9443a62c.json delete mode 100644 data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v4/7ea22fef-2d79-49ae-bf72-9153a4e239c5.json delete mode 100644 data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v5/64f441df-1781-4d01-b73b-2156413ad403.json delete mode 100644 data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v6/4e3676eb-8607-416e-986a-7098bc192820.json delete mode 100644 data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v7/2101369c-5042-48f3-a8f2-f9f56e7b6ae7.json delete mode 100644 data/hfopenllm_v2/duyhv1411/Llama-3.2-1B-en-vi/c4b86264-3725-4742-91f0-3e01f8d965a4.json delete mode 100644 data/hfopenllm_v2/duyhv1411/Llama-3.2-3B-en-vi/0308147c-dabb-46bb-8add-d332fcd5a800.json delete mode 100644 data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-inst/a9977a0d-e199-488a-a26e-6269806fdb2b.json delete mode 100644 data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-instruct/56b89ec8-90c5-4e1e-a458-1bb8b5b92be8.json delete mode 100644 data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id/4185c376-91c6-435d-ae3b-47cd85151049.json delete mode 100644 data/hfopenllm_v2/dwikitheduck/gen-inst-1/26e45f5d-1e3d-425f-ba4d-b444dcda7f74.json delete mode 100644 data/hfopenllm_v2/dwikitheduck/gen-try1-notemp/09be48ce-61f8-4ba9-b082-b9c475fa714d.json delete mode 100644 data/hfopenllm_v2/dwikitheduck/gen-try1/27417bcb-fb2f-41d2-9dfa-9865a36f38d5.json delete mode 100644 data/hfopenllm_v2/dzakwan/dzakwan-MoE-4x7b-Beta/7b6fc3c2-a67d-450e-858c-fa87be122376.json delete mode 100644 data/hfopenllm_v2/ehristoforu/Falcon3-8B-Franken-Basestruct/76b86418-5450-48c6-ae56-58a19016d055.json delete mode 100644 data/hfopenllm_v2/ehristoforu/Falcon3-MoE-2x7B-Insruct/e06594e4-899a-4285-b130-f7b605e5a6b9.json delete mode 100644 data/hfopenllm_v2/ehristoforu/Gemma2-9B-it-psy10k-mental_health/9efdc773-a5c7-4709-88c8-96a67d84a742.json delete mode 100644 data/hfopenllm_v2/ehristoforu/Gemma2-9b-it-train6/1fcc2f96-afc9-403f-b82e-8e1804506582.json delete mode 100644 data/hfopenllm_v2/ehristoforu/HappyLlama1/bee1e134-9a43-441a-b977-522c510dd1ce.json delete mode 100644 data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT-Dare/b70e1089-d136-4b2f-a253-f361bcf8cdcc.json delete mode 100644 data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT/8b7e9c34-a982-4f4d-b5dc-66a12578601f.json delete mode 100644 data/hfopenllm_v2/ehristoforu/RQwen-v0.1/0ccc36d0-f546-46d1-91d3-15a40c7bf6c1.json delete mode 100644 data/hfopenllm_v2/ehristoforu/RQwen-v0.2/066abe97-2c6c-4f3b-9e5e-e144f130258a.json delete mode 100644 data/hfopenllm_v2/ehristoforu/SoRu-0009/a3af8f77-d915-4482-a2b6-c99744aada4b.json delete mode 100644 data/hfopenllm_v2/ehristoforu/coolqwen-3b-it/82cc8b37-e242-441e-ac74-1662bcc0a0e2.json delete mode 100644 data/hfopenllm_v2/ehristoforu/della-70b-test-v1/1527c8bc-c1ec-45f4-9663-4cffbb808f94.json delete mode 100644 data/hfopenllm_v2/ehristoforu/falcon3-ultraset/337b8ce8-d697-47f6-94ac-7a420dd7d91b.json delete mode 100644 data/hfopenllm_v2/ehristoforu/fd-lora-merged-16x32/3d6ed2bb-5be7-4838-abb7-49754f9c3bfe.json delete mode 100644 
data/hfopenllm_v2/ehristoforu/fd-lora-merged-64x128/0a6c7056-1bce-479e-84b0-f4eeea0bd3cc.json delete mode 100644 data/hfopenllm_v2/ehristoforu/fp4-14b-it-v1/3e236ad8-3828-407f-9076-743b465b8d15.json delete mode 100644 data/hfopenllm_v2/ehristoforu/fp4-14b-v1-fix/9e90dcdf-ce2a-4a7c-8b89-6af8b7c2bcfe.json delete mode 100644 data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_false/940d88e9-085b-4065-b8c8-92ebe685deb0.json delete mode 100644 data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_true/7fdcd616-2c72-4c44-9646-9c32344bfa0b.json delete mode 100644 data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-duable4layers-it/9d358f55-810c-4ac1-adc7-83f95bd74c11.json delete mode 100644 data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-it/9ba3fe31-772a-4cf7-aa13-3680b6ad51ba.json delete mode 100644 data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-instruct/651a32b1-77fb-4acf-89bf-2d45b684944d.json delete mode 100644 data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-it/192c4037-753a-4790-80d0-33c4d277102d.json delete mode 100644 data/hfopenllm_v2/ehristoforu/moremerge-upscaled/679d66bf-244e-4080-9a42-0a0c6cfdc965.json delete mode 100644 data/hfopenllm_v2/ehristoforu/moremerge/73b0ca8a-fb16-43eb-a9af-a01219cf6196.json delete mode 100644 data/hfopenllm_v2/ehristoforu/phi-4-25b/7f00ecbc-fcc8-43ae-867b-cb160e63a80c.json delete mode 100644 data/hfopenllm_v2/ehristoforu/qwen2.5-test-32b-it/a8238bd4-3982-4e45-92e4-bab77e528e29.json delete mode 100644 data/hfopenllm_v2/ehristoforu/qwen2.5-with-lora-think-3b-it/f87f9f08-e989-4e99-a254-a3650e7ab1b6.json delete mode 100644 data/hfopenllm_v2/ehristoforu/rmoe-v1/f40496a9-fb14-4b2d-8070-84f55e6417f6.json delete mode 100644 data/hfopenllm_v2/ehristoforu/rufalcon3-3b-it/cc52f59d-5669-44b0-b1af-e6fd0836e284.json delete mode 100644 data/hfopenllm_v2/ehristoforu/ruphi-4b/67525a37-f658-40e8-89a1-de8bf6275a00.json delete mode 100644 data/hfopenllm_v2/ehristoforu/testq-32b/3cb34886-7a93-42b9-a8fa-fab5f4bd8624.json delete mode 100644 data/hfopenllm_v2/ehristoforu/tmoe-v2/0dd1f9fc-cf54-47ff-8ccd-148b45f3c921.json delete mode 100644 data/hfopenllm_v2/ehristoforu/tmoe/7a05616e-7335-419a-914d-00fb287fe663.json delete mode 100644 data/hfopenllm_v2/ehristoforu/trd-7b-it/070a21b5-4cd3-41b7-9653-0d2d2e4f273d.json delete mode 100644 data/hfopenllm_v2/ehristoforu/ud-14b/5afc044a-3138-443f-89cf-74f1272cc632.json delete mode 100644 data/hfopenllm_v2/elinas/Chronos-Gold-12B-1.0/a6c1d914-647c-46b7-b0e1-712b8d506780.json delete mode 100644 data/hfopenllm_v2/ell44ot/gemma-2b-def/43f35eac-0946-42f9-a128-eb8011c29588.json delete mode 100644 data/hfopenllm_v2/euclaise/ReMask-3B/04c22be7-2cf4-4774-b479-863199c7c3a4.json delete mode 100644 data/hfopenllm_v2/eworojoshua/vas-01/fc3d436b-ec61-4458-a3c6-1df41057ea70.json delete mode 100644 data/hfopenllm_v2/ewre324/Thinker-Llama-3.2-3B-Instruct-Reasoning/e3ed157f-f306-40fb-b3a1-d3434236759e.json delete mode 100644 data/hfopenllm_v2/ewre324/Thinker-Qwen2.5-0.5B-Instruct-Reasoning/8793b3e3-f409-499a-81f8-c250c8092841.json delete mode 100644 data/hfopenllm_v2/ewre324/Thinker-SmolLM2-135M-Instruct-Reasoning/33572f63-15ba-4fbc-b1cf-56b978384d02.json delete mode 100644 data/hfopenllm_v2/ewre324/ewre324-R1-SmolLM2-135M-Distill/44c636ba-8303-4d75-bcb5-46e3c07a991a.json delete mode 100644 data/hfopenllm_v2/experiment-llm/exp-3-q-r/0a002444-3e5a-4fc8-acc6-72210a4181a9.json delete mode 100644 data/hfopenllm_v2/facebook/opt-1.3b/bbf936a5-3594-4d0a-b5af-7a01740d0c81.json delete mode 100644 data/hfopenllm_v2/facebook/opt-30b/1164abea-4cc2-46a7-a44b-f024a2ce40b4.json delete 
mode 100644 data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-MopeyMule/bfd88bec-fcc2-4580-a5c7-4792a0300a5b.json delete mode 100644 data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-abliterated/7f49e582-a01f-481f-8345-1c384fc8b567.json delete mode 100644 data/hfopenllm_v2/failspy/Meta-Llama-3-70B-Instruct-abliterated-v3.5/10937ed1-56e2-4aad-b717-5125bc8ac72a.json delete mode 100644 data/hfopenllm_v2/failspy/Meta-Llama-3-8B-Instruct-abliterated-v3/f4622539-c0ac-4e9f-86d4-00e3c826d03b.json delete mode 100644 data/hfopenllm_v2/failspy/Phi-3-medium-4k-instruct-abliterated-v3/6b13b2b1-68cd-4aae-8f2b-2400f40760d7.json delete mode 100644 data/hfopenllm_v2/failspy/llama-3-70B-Instruct-abliterated/5b02726c-ba3f-482b-9f10-87b8d69ffeb4.json delete mode 100644 data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/21d6f2dd-7bd6-42a9-b14e-c25777497890.json delete mode 100644 data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/d0bc11cb-56ff-4c77-9446-e76e550e0919.json delete mode 100644 data/hfopenllm_v2/fblgit/UNA-SimpleSmaug-34b-v1beta/ff78dc97-e9cf-4215-a607-3e80892af82c.json delete mode 100644 data/hfopenllm_v2/fblgit/UNA-TheBeagle-7b-v1/0ff1c6ff-5404-4d61-b6c6-f6ef7ae9ca8b.json delete mode 100644 data/hfopenllm_v2/fblgit/UNA-ThePitbull-21.4B-v2/48837141-2556-4658-87e0-bb88cfcd562a.json delete mode 100644 data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-MGS/f2d6da5d-3685-43de-8ceb-5b798f88e24c.json delete mode 100644 data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-UNAMGS/9ec02ccd-329a-4d62-9f04-87de6fda5011.json delete mode 100644 data/hfopenllm_v2/fblgit/juanako-7b-UNA/781d0332-e332-4ff7-8585-9c2d8395a147.json delete mode 100644 data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS-GRPO/d6dd460e-c352-4d31-8941-183c6eabd0a7.json delete mode 100644 data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS/66bf6442-04ea-437b-88c4-e61afc6f7139.json delete mode 100644 data/hfopenllm_v2/fblgit/pancho-v1-qw25-3B-UNAMGS/0d1911f5-a2e7-4511-a8d8-098cbf9207df.json delete mode 100644 data/hfopenllm_v2/fblgit/una-cybertron-7b-v2-bf16/abc18648-ef96-4695-94d5-fa14be277431.json delete mode 100644 data/hfopenllm_v2/fhai50032/RolePlayLake-7B/ff1e7aaa-3f29-4192-a0e0-80fcd11ba055.json delete mode 100644 data/hfopenllm_v2/fhai50032/Unaligned-Thinker-PHI-4/cc8ef5bd-957f-4308-9539-00a696182056.json delete mode 100644 data/hfopenllm_v2/flammenai/Llama3.1-Flammades-70B/abc7652f-b88e-40ba-847c-c99dce9f2719.json delete mode 100644 data/hfopenllm_v2/flammenai/Mahou-1.2a-llama3-8B/56e36294-e616-45a1-8dc9-2c14cf3ee8d0.json delete mode 100644 data/hfopenllm_v2/flammenai/Mahou-1.2a-mistral-7B/4b81caad-92ed-4bd5-98bd-58582854b5d8.json delete mode 100644 data/hfopenllm_v2/flammenai/Mahou-1.5-llama3.1-70B/2cef0040-6d4c-4c38-be40-5477911f3063.json delete mode 100644 data/hfopenllm_v2/flammenai/Mahou-1.5-mistral-nemo-12B/4aeef94f-823e-4be5-b4f1-37463e052748.json delete mode 100644 data/hfopenllm_v2/flammenai/flammen15-gutenberg-DPO-v1-7B/3d367147-373f-4543-be19-55a6429558a2.json delete mode 100644 data/hfopenllm_v2/fluently-lm/FluentlyLM-Prinum/cb93091a-6c46-438a-b111-cbf7e2fac420.json delete mode 100644 data/hfopenllm_v2/fluently-lm/Llama-TI-8B-Instruct/ea6048f1-8be4-4ec8-a5d5-35ff1523d74a.json delete mode 100644 data/hfopenllm_v2/fluently-lm/Llama-TI-8B/f4dc1659-800f-49d2-a290-48e9d4b15581.json delete mode 100644 data/hfopenllm_v2/fluently-sets/FalconThink3-10B-IT/d4d8a784-5bd5-4437-8e0d-75dcb967ae33.json delete mode 100644 data/hfopenllm_v2/fluently-sets/reasoning-1-1k-demo/91017e73-f33a-49f5-ac87-f6e6a178d885.json delete mode 100644 
data/hfopenllm_v2/formulae/mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp/b7a75bca-6afe-448a-8e5c-53ebd577c964.json delete mode 100644 data/hfopenllm_v2/formulae/mita-elite-v1.1-7b-2-25-2025/8cdced5c-23bc-4426-a0c9-b9bf82913683.json delete mode 100644 data/hfopenllm_v2/formulae/mita-elite-v1.1-gen2-7b-2-25-2025/368784c8-6fc2-4340-8277-a6a9a9800a99.json delete mode 100644 data/hfopenllm_v2/formulae/mita-elite-v1.2-7b-2-26-2025/f7ddf26b-4b4c-404b-b9d3-6ceaf78d39aa.json delete mode 100644 data/hfopenllm_v2/formulae/mita-gen3-7b-2-26-2025/f423b0d1-3536-4865-9615-f89b9d15b14c.json delete mode 100644 data/hfopenllm_v2/formulae/mita-gen3-v1.2-7b-2-26-2025/c7e8333d-1d79-4cfa-9833-fa42f9fcbb4b.json delete mode 100644 data/hfopenllm_v2/formulae/mita-math-v2.3-2-25-2025/b6149d15-3e0f-43d2-ae90-eca290a94edb.json delete mode 100644 data/hfopenllm_v2/formulae/mita-v1-7b/e21f5d83-6b71-488d-ad55-d23268fbd611.json delete mode 100644 data/hfopenllm_v2/formulae/mita-v1.1-7b-2-24-2025/68e1a42e-4318-4b5a-a45b-2607b7c2fe05.json delete mode 100644 data/hfopenllm_v2/formulae/mita-v1.2-7b-2-24-2025/12a03ffb-d66b-4d00-a43b-fd5be80e1b07.json delete mode 100644 data/hfopenllm_v2/frameai/Loxa-4B/adbad8dc-7d13-44cc-a5c6-e8da1de27c37.json delete mode 100644 data/hfopenllm_v2/freewheelin/free-evo-qwen72b-v0.8-re/7fb595e5-abbc-43ff-8135-c4bb4a2ea593.json delete mode 100644 data/hfopenllm_v2/freewheelin/free-solar-evo-v0.1/1bb09da7-1675-4e57-b46a-9791c888ce6f.json delete mode 100644 data/hfopenllm_v2/freewheelin/free-solar-evo-v0.11/3ed7dd5a-e431-480a-91a7-5ccd915057e4.json delete mode 100644 data/hfopenllm_v2/freewheelin/free-solar-evo-v0.13/9cab35b6-d6a7-475e-b715-e4493d07cd92.json delete mode 100644 data/hfopenllm_v2/fulim/FineLlama-3.1-8B/ef7149ae-8d50-4890-89ae-fb561a86d130.json delete mode 100644 data/hfopenllm_v2/gabrielmbmb/SmolLM-1.7B-Instruct-IFEval/3fa14e1f-82a5-4c04-9c76-2a3f6d56aa81.json delete mode 100644 data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA/4418c7d1-72da-4ed3-9d5c-9d8520f6641c.json delete mode 100644 data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES/8fe13380-a045-4d63-96f8-ec977540478c.json delete mode 100644 data/hfopenllm_v2/gbueno86/Brinebreath-Llama-3.1-70B/6da42427-c7de-4830-b368-ca7757ee1d51.json delete mode 100644 data/hfopenllm_v2/gbueno86/Meta-LLama-3-Cat-Smaug-LLama-70b/5faf24b3-38af-4f3f-8377-bba70d75f8df.json delete mode 100644 data/hfopenllm_v2/ghost-x/ghost-8b-beta-1608/9a26214c-2601-49be-b1b1-03796b704059.json delete mode 100644 data/hfopenllm_v2/glaiveai/Reflection-Llama-3.1-70B/fa71ed09-45d4-4a5b-bfb1-a61a359a8f0c.json delete mode 100644 data/hfopenllm_v2/gmonsoon/SahabatAI-Llama-11B-Test/25c5b304-46d3-4df3-9ac3-75ffa972849a.json delete mode 100644 data/hfopenllm_v2/gmonsoon/SahabatAI-MediChatIndo-8B-v1/88ed0272-39f8-4676-970a-525aee058991.json delete mode 100644 data/hfopenllm_v2/gmonsoon/SahabatAI-Rebase-8B-Test/d8eff5d0-061b-4b83-b96a-04f9ba47ea6c.json delete mode 100644 data/hfopenllm_v2/gmonsoon/StockSeaLLMs-7B-v1/dcb90e75-8709-4729-8c00-e756e6a9a49d.json delete mode 100644 data/hfopenllm_v2/gmonsoon/gemma2-9b-sahabatai-v1-instruct-BaseTIES/81dcf3ca-f5c2-40a1-8871-b0188d5e9ceb.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_full_2/0a0a4d32-c7a9-49c9-bba4-dae6b464a5b6.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_full_3B/82a3a8ef-7e5f-48d0-a48e-41ea2c5b6452.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600/e635e798-fa85-4430-bf1e-9d5ad7fe9f22.json delete mode 
100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600_3B/7ccaa29a-4f73-4794-83a2-b925d755d91e.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_me_max_5200/ba8de8f6-c118-4bc3-ae8d-851e964684ed.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_min_2600/4011975a-e2a0-466a-9b34-923e1b4f8733.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ins_ans_max_5200/8a172205-39c6-4dd1-86b2-11b234b37e3c.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ins_max_5200/495b2e8e-e2d8-4158-bc6e-7568604d44e9.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_2600/e6a97d0d-9dc3-43a5-a69f-8132e19f9c77.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_5200/4aecfd45-f47b-4f02-a0ed-288cbef46a6f.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_5200/a6f7bc45-c2b5-47d8-a062-60f20c3d7ea4.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_new_5200/c85c79d6-28e0-4deb-ad84-901b725aeca8.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.1_2600/73271472-d06f-405b-af9d-2da7c17e1eb0.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.3_2600/4e40bb43-c33d-4324-aa02-5bb7f88a5d1f.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.7_2600/9b36e4c0-0d13-4988-8145-b9254da2e76e.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2500/6a464798-0111-4c71-b156-72a5aba1da63.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2600_3B/78252135-f15b-427d-86de-c32cd3dbcd0f.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_5200/c3b7bd57-9bc3-4d83-aad9-7d6315748c0a.json delete mode 100644 data/hfopenllm_v2/godlikehhd/ifd_2500_qwen/bce17582-e807-4b91-b0e7-0a890bf5eb24.json delete mode 100644 data/hfopenllm_v2/godlikehhd/ifd_new_correct_all_sample_2500_qwen/f8371e81-f6d4-4441-bc6c-5d4a18da7d08.json delete mode 100644 data/hfopenllm_v2/godlikehhd/ifd_new_correct_sample_2500_qwen/78407b2e-1f44-46f0-bc21-76bdc68f8d9c.json delete mode 100644 data/hfopenllm_v2/godlikehhd/ifd_new_qwen_2500/bdb9e2d2-8d09-4994-a320-2f968bcb4898.json delete mode 100644 data/hfopenllm_v2/godlikehhd/qwen-2.5-1.5b-cherry/c57d15c8-9581-4bb5-89e4-2fea1e3c584e.json delete mode 100644 data/hfopenllm_v2/godlikehhd/qwen_2.5-1.5b-cherry_new/550d5665-7a8a-437e-b318-000690dd250f.json delete mode 100644 data/hfopenllm_v2/godlikehhd/qwen_full_data_alpaca/a1922f33-32f5-4f99-8df6-e2080808d292.json delete mode 100644 data/hfopenllm_v2/godlikehhd/qwen_ins_ans_2500/6ccc376b-24a4-42cc-8ea0-823ef14336db.json delete mode 100644 data/hfopenllm_v2/google/codegemma-1.1-2b/6547b6f3-63dd-4516-b294-62c4246c3dc7.json delete mode 100644 data/hfopenllm_v2/google/flan-t5-base/a58bf2d3-d209-41b8-a795-ba7a16e4a28f.json delete mode 100644 data/hfopenllm_v2/google/flan-t5-large/b15ad3b5-7ef2-439e-9acd-a85eab520d31.json delete mode 100644 data/hfopenllm_v2/google/flan-t5-small/64da2654-9fdb-4a08-ad16-cf8793a30ed8.json delete mode 100644 data/hfopenllm_v2/google/flan-t5-xl/37080215-ee30-4e59-a407-b14695ac2a38.json delete mode 100644 data/hfopenllm_v2/google/flan-t5-xl/b83a0ce7-bf13-4a98-81f3-04e5a44105f7.json delete mode 100644 data/hfopenllm_v2/google/flan-t5-xxl/bb7bea21-5bc6-460d-98ff-b3ed02d5b215.json delete mode 100644 data/hfopenllm_v2/google/flan-ul2/da9ddecc-43cf-4055-a19e-795b1ee98826.json delete mode 100644 
data/hfopenllm_v2/google/gemma-1.1-2b-it/a93ccb3f-f2d9-415d-8397-0c7fb765fada.json delete mode 100644 data/hfopenllm_v2/google/gemma-1.1-7b-it/d0f86765-bdb4-4367-986b-28303bbe1844.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-27b-it/693bb191-ae83-49dc-9df1-2f68b1b5fe4a.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-27b/7b2c0b72-6421-4f33-8593-a4bbfd0c6d6b.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-2b-it/c4ee822f-fc8b-4523-95b6-7c3f12a334b3.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-2b-jpn-it/1810033a-185b-4c91-91d3-43b8f6c61443.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-2b-jpn-it/beb721ae-a35c-4f6b-a80f-aac4835d5f8d.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-2b/cf20e77a-340f-4d8d-b593-9645bdfc5877.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-2b/eec73e49-ac2b-42ed-a115-76e45007cd5d.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-9b-it/aa06d058-87f9-4fde-ad53-139b29a71448.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-9b/3f1d571a-fc42-411b-88ab-4700d5861367.json delete mode 100644 data/hfopenllm_v2/google/gemma-2b-it/74a56080-aeb2-4cc6-a825-bbe4d9a5900a.json delete mode 100644 data/hfopenllm_v2/google/gemma-2b/2eb433ba-5c93-4355-99dd-edcb65721603.json delete mode 100644 data/hfopenllm_v2/google/gemma-7b-it/826fc3ab-6ff8-44fa-a745-a0b80bcb2db4.json delete mode 100644 data/hfopenllm_v2/google/gemma-7b/6da54964-e3b5-4567-8ce4-7e0f279af84f.json delete mode 100644 data/hfopenllm_v2/google/mt5-base/a7dde688-a0ae-4731-909f-0bef0c6eeba9.json delete mode 100644 data/hfopenllm_v2/google/mt5-small/eb2a8a60-2240-4b08-9dc3-be0215aa7bfc.json delete mode 100644 data/hfopenllm_v2/google/mt5-xl/9b05919f-d7c1-4e04-9dd8-9ae70e0005e6.json delete mode 100644 data/hfopenllm_v2/google/mt5-xxl/6cd98538-74b6-4ac6-a3ac-9a311cfe47f6.json delete mode 100644 data/hfopenllm_v2/google/recurrentgemma-2b-it/b0ca2dec-387f-4b27-9adb-772af1899832.json delete mode 100644 data/hfopenllm_v2/google/recurrentgemma-2b/53c4b397-b78e-4699-a01e-3535aa072225.json delete mode 100644 data/hfopenllm_v2/google/recurrentgemma-9b-it/f5b251f0-741c-4ad5-ab04-19c5202854ea.json delete mode 100644 data/hfopenllm_v2/google/recurrentgemma-9b/7b2ba13a-e01d-4442-9abe-d16df1a1668a.json delete mode 100644 data/hfopenllm_v2/google/switch-base-8/bf79f87c-3f14-49e8-acba-725e709d5f11.json delete mode 100644 data/hfopenllm_v2/google/umt5-base/3fbac7d4-cbbb-4b77-9db4-fd7e122cc90e.json delete mode 100644 data/hfopenllm_v2/goulue5/merging_LLM/6efd0dbd-b8c1-4c66-bdf7-19055c16ca22.json delete mode 100644 data/hfopenllm_v2/gradientai/Llama-3-8B-Instruct-Gradient-1048k/1388b8d4-c711-480c-8a06-a8b7bd8aa79c.json delete mode 100644 data/hfopenllm_v2/grimjim/DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B/03393ffd-1923-4767-ba14-d0e3e6751842.json delete mode 100644 data/hfopenllm_v2/grimjim/Gigantes-v1-gemma2-9b-it/b7d049dc-127d-4075-8067-22adac9a58c3.json delete mode 100644 data/hfopenllm_v2/grimjim/Gigantes-v2-gemma2-9b-it/89d79024-f4b8-4165-bd88-47f2b0010800.json delete mode 100644 data/hfopenllm_v2/grimjim/Gigantes-v3-gemma2-9b-it/d2c0fb0d-6c0c-464a-b09f-6382a57b6afb.json delete mode 100644 data/hfopenllm_v2/grimjim/HuatuoSkywork-o1-Llama-3.1-8B/a891b28a-2dcc-4b8e-ad20-1f23d663b44b.json delete mode 100644 data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge/55e274bb-1e2c-4402-b7ae-09ff7b1f9738.json delete mode 100644 data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge/fe7a6940-fc4c-4345-84be-609c8155be57.json delete mode 100644 
data/hfopenllm_v2/grimjim/Llama-3.1-8B-Instruct-abliterated_via_adapter/77eb2b0f-e3e3-474c-bb02-dabde2998ef0.json delete mode 100644 data/hfopenllm_v2/grimjim/Llama-3.1-Bonsaikraft-8B-Instruct/94d744be-5d28-490a-ba9a-8440cb97dce9.json delete mode 100644 data/hfopenllm_v2/grimjim/Llama-Nephilim-Metamorphosis-v2-8B/2765061e-7506-4eb6-b63f-312f6290665a.json delete mode 100644 data/hfopenllm_v2/grimjim/Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B/167c937c-66c7-45a8-bbd9-97d98531bf7d.json delete mode 100644 data/hfopenllm_v2/grimjim/Magnolia-v1-Gemma2-8k-9B/9587c35c-1def-46e7-8642-7acb0340be5e.json delete mode 100644 data/hfopenllm_v2/grimjim/Magnolia-v2-12B/1c9594fe-03d6-4ec1-9da5-99960da0dcd4.json delete mode 100644 data/hfopenllm_v2/grimjim/Magnolia-v2-Gemma2-8k-9B/8ed2c4eb-bc72-4dde-a559-1afd1698d37d.json delete mode 100644 data/hfopenllm_v2/grimjim/Magnolia-v3-12B/a2f9536a-9266-4aee-be90-d04f4dcbe53c.json delete mode 100644 data/hfopenllm_v2/grimjim/Magnolia-v3-Gemma2-8k-9B/7f116aaa-3880-4e53-948a-4b06e0d26cff.json delete mode 100644 data/hfopenllm_v2/grimjim/Magnolia-v4-12B/7cbe4516-2be2-421b-95f4-c9500ad64ca5.json delete mode 100644 data/hfopenllm_v2/grimjim/Magnolia-v5a-12B/07df565a-bc30-4a9d-b472-7a85f35938be.json delete mode 100644 data/hfopenllm_v2/grimjim/Magot-v1-Gemma2-8k-9B/7545f7db-10bb-4d97-9b3f-4346f4f26bad.json delete mode 100644 data/hfopenllm_v2/grimjim/Magot-v2-Gemma2-8k-9B/47384f10-ac6a-4629-92db-86f01a441f7f.json delete mode 100644 data/hfopenllm_v2/grimjim/SauerHuatuoSkywork-o1-Llama-3.1-8B/3c9f022f-3e2b-48d6-acb9-07f066cfceb6.json delete mode 100644 data/hfopenllm_v2/grimjim/llama-3-Nephilim-v1-8B/1d851cfb-8624-4516-8204-85569c60dc67.json delete mode 100644 data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2-8B/a7990990-7498-4b74-a0aa-9c266910698e.json delete mode 100644 data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2.1-8B/0b41d37e-0728-4575-9662-c150e2e29bd0.json delete mode 100644 data/hfopenllm_v2/grimjim/llama-3-Nephilim-v3-8B/c565a7e9-bd1b-41a5-bff3-3a349553f4e8.json delete mode 100644 data/hfopenllm_v2/gupta-tanish/llama-7b-dpo-baseline/680a4507-755e-4014-877b-6032f0220270.json delete mode 100644 data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.1/5ace8dc6-e348-4267-bb4a-f71a335d074e.json delete mode 100644 data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.2/07549821-db51-4b77-980a-056131b5dd29.json delete mode 100644 data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.3/ff12a0a1-a913-441b-955c-bcbd50056acf.json delete mode 100644 data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.4/947cfc2b-b73c-40eb-9e57-be5278776711.json delete mode 100644 data/hfopenllm_v2/h2oai/h2o-danube-1.8b-chat/53639078-c50a-4147-bab0-16993f1790b6.json delete mode 100644 data/hfopenllm_v2/h2oai/h2o-danube3-4b-base/b2cf96e0-382e-4200-a4a4-d66e8a188878.json delete mode 100644 data/hfopenllm_v2/h2oai/h2o-danube3-4b-chat/d4ed3eb6-f569-4d4b-8da5-50eaaf824128.json delete mode 100644 data/hfopenllm_v2/h2oai/h2o-danube3-500m-chat/210f7063-e0d9-424d-94f4-3645e4e1b401.json delete mode 100644 data/hfopenllm_v2/h2oai/h2o-danube3.1-4b-chat/4ecd26d8-8416-4dba-8d53-96f4013cfef0.json delete mode 100644 data/hfopenllm_v2/haoranxu/ALMA-13B-R/15712b7d-e69f-4a4f-b13c-4e79ce859399.json delete mode 100644 data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-CPO-SimPO/9148c375-7c08-4c1c-82ed-5f935b2a4f04.json delete mode 100644 data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-SimPO/fb93274b-b7d8-483a-a95d-96340535febc.json delete mode 100644 data/hfopenllm_v2/hatemmahmoud/qwen2.5-1.5b-sft-raft-grpo-hra-doc/0818b755-ec49-457c-8635-73f01816f30b.json delete 
mode 100644 data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v0.5/77962326-0160-49bd-9ef1-59b403b2bfce.json delete mode 100644 data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v1.0-7B/272abbe5-8b61-442f-9860-d7411e7fec99.json delete mode 100644 data/hfopenllm_v2/hongbai12/li-0.4-pre/14d617a8-18c6-40a7-a4ba-19cf5fc5f4e3.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Deepseek-qwen-modelstock-2B/ef7b5e6d-b5b7-4c7b-9781-6f90eb1ff5dd.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Falcon3Slerp1-10B/1970e257-7c93-4342-9ff4-a96af21acc67.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Falcon3Slerp2-10B/15d71696-4b21-41ff-a4c6-0aea92fb844a.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Falcon3Slerp4-10B/ccb85394-5252-48d4-8980-8b3a6c67ab1a.json delete mode 100644 data/hfopenllm_v2/hotmailuser/FalconSlerp-3B/ea9837ff-f4c7-4bb0-b2af-7ae26371baf0.json delete mode 100644 data/hfopenllm_v2/hotmailuser/FalconSlerp1-7B/fe9012a7-d07f-48d4-b460-eca256078d8b.json delete mode 100644 data/hfopenllm_v2/hotmailuser/FalconSlerp2-7B/8e8d2071-8e7d-4dad-8536-4698b2d00316.json delete mode 100644 data/hfopenllm_v2/hotmailuser/FalconSlerp3-10B/dbcb41be-9ed6-4244-ada8-77f363c3487e.json delete mode 100644 data/hfopenllm_v2/hotmailuser/FalconSlerp3-7B/e48e2d7e-6c14-4bb1-bd12-74d93a145ca3.json delete mode 100644 data/hfopenllm_v2/hotmailuser/FalconSlerp4-7B/30c2d908-3eaf-408a-a2b5-301e0cd9e052.json delete mode 100644 data/hfopenllm_v2/hotmailuser/FalconSlerp6-7B/f7624d04-66d1-4c05-8c01-d015ecf8412c.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Gemma2Crono-27B/511e4aad-1e5a-4515-9433-46989fc3945b.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Gemma2SimPO-27B/863e71ec-03a4-47ed-8bc9-b064d5571162.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Gemma2atlas-27B/6a6dfcb4-192b-44ff-a34f-76b31bbf5ad3.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Gemma2magnum-27b/e0dbec0b-a154-448a-be23-ef9b764469ea.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp-8B/ecd91300-b0cf-48ce-9e5c-253a7991f90e.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp2-8B/e3df71f1-63e1-40f1-918d-07cb3ec939cf.json delete mode 100644 data/hfopenllm_v2/hotmailuser/LlamaStock-8B/52066a23-9847-490e-90e3-57eee3c63276.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Mistral-modelstock-24B/91f15ba3-a062-4b01-8a61-6e51fdf5f8d4.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Mistral-modelstock2-24B/323630ee-fbe0-49a7-aa11-816fde38ba2d.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Phi4-Slerp4-14B/e5c8f97d-1873-4c9d-8bed-50dc592543db.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Qwen2.5-HomerSlerp-7B/7ee2803c-b8f8-4156-8472-bab4baab8863.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenModelStock-1.8B/78573f63-3073-4be4-93a7-0ea00b1383fd.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenSlerp-14B/42da7295-d78d-49a4-9279-8406063240c4.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenSlerp-3B/b61c5735-53ca-4dda-a223-79921eee7f3e.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenSlerp-7B/310124ef-e33f-49de-83eb-e665a5143aaa.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenSlerp2-14B/c9b056df-8bbe-4959-ab44-85813157c95c.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenSlerp2-3B/7a60385f-48dd-4926-8b66-3d42a1631db3.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenSlerp3-14B/da365c7b-74d0-4a9f-a8fd-cf4049ec4de6.json delete mode 100644 
data/hfopenllm_v2/hotmailuser/QwenSparse-7B/e2930715-b616-49a4-83bc-53e92fc3580f.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenStock-0.5B/543f45e0-a158-4fdb-bbb1-8deb38f4515b.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenStock-1.7B/b96a20e0-d044-4a66-8909-437aeaef569c.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenStock1-14B/408742ff-4b21-46dc-b4d6-4c78d652d228.json delete mode 100644 data/hfopenllm_v2/hotmailuser/RombosBeagle-v2beta-MGS-32B/496a9fbe-376c-4546-bd90-b42f583924ce.json delete mode 100644 data/hfopenllm_v2/huggyllama/llama-13b/f32c07b4-21a8-4cd2-91f8-f0f26d0b1b38.json delete mode 100644 data/hfopenllm_v2/huggyllama/llama-65b/cc36cc37-0f41-42aa-8051-54cc135820ef.json delete mode 100644 data/hfopenllm_v2/huggyllama/llama-7b/20d3dac4-9f8c-431c-b20f-364dd860e37f.json delete mode 100644 data/hfopenllm_v2/huihui-ai/DeepSeek-R1-Distill-Qwen-14B-abliterated-v2/89022ea8-2a5b-4eba-8d7a-320ba13d30a4.json delete mode 100644 data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-7030/97bfd152-79c6-4c96-8d3e-588275339e41.json delete mode 100644 data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-8020/93061947-2bcf-482e-ab22-38ef8ee33bcf.json delete mode 100644 data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-9010/8f65748b-1251-49f8-bfed-d1e4a937d5ba.json delete mode 100644 data/hfopenllm_v2/huihui-ai/Qwen2.5-14B-Instruct-abliterated-v2/4f278881-69d3-42b5-b72c-ff8627a6ef44.json delete mode 100644 data/hfopenllm_v2/huihui-ai/Qwen2.5-72B-Instruct-abliterated/d88e85c5-73df-46cc-9234-f0556592ad5a.json delete mode 100644 data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated-v2/44d2a20d-e867-4fa5-af3d-087f9c1b4067.json delete mode 100644 data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated/e83b3e7e-dc34-4b06-bcfe-95b3ba28aab4.json delete mode 100644 data/hfopenllm_v2/huu-ontocord/wide_3b_orpo_stage1.1-ss1-orpo3/44f2948c-4564-44cc-98d8-4f82a30e1f09.json delete mode 100644 data/hfopenllm_v2/iFaz/llama31_8B_en_emo_v4/846cf1ff-62c3-44e7-b6dd-0135ec77451a.json delete mode 100644 data/hfopenllm_v2/iFaz/llama32_1B_en_emo_v1/d2054469-b38b-4b1d-bd40-7324319f8eca.json delete mode 100644 data/hfopenllm_v2/iFaz/llama32_3B_en_emo_1000_stp/ce60608d-5b52-49d4-bbce-4b20e8272cef.json delete mode 100644 data/hfopenllm_v2/iFaz/llama32_3B_en_emo_2000_stp/f177bb70-fb7c-4b57-965d-acbcb4936bfa.json delete mode 100644 data/hfopenllm_v2/iFaz/llama32_3B_en_emo_300_stp/a5b2ab3d-1f12-4a5a-a110-2514185568b6.json delete mode 100644 data/hfopenllm_v2/iFaz/llama32_3B_en_emo_5000_stp/63b887a1-a0b9-46db-a563-b9bd67a0805a.json delete mode 100644 data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v2/92d122f7-f29d-49e3-99da-bf20edf377a2.json delete mode 100644 data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v3/a0b71344-f3a8-4ad0-87c5-6393148488b1.json delete mode 100644 data/hfopenllm_v2/iRyanBell/ARC1-II/821ff784-c48a-4623-9fb5-b77b7114b625.json delete mode 100644 data/hfopenllm_v2/iRyanBell/ARC1/ed251513-4807-4e31-bc8e-3ab0217ae4f3.json delete mode 100644 data/hfopenllm_v2/ibivibiv/colossus_120b/e7fa3baa-07b4-4f10-aa9c-8424d8fea303.json delete mode 100644 data/hfopenllm_v2/ibivibiv/multimaster-7b-v6/11dfd131-00bf-4561-a913-f1c0cb15bf9c.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-base/3ba34f38-2340-407f-a7b5-82749f8a0ee6.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-instruct/91b9649b-bdf6-4b15-a038-47edc2e79ef6.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.0-2b-base/24670e63-32e1-4c5d-82fe-0d0c45a4e165.json delete mode 100644 
data/hfopenllm_v2/ibm-granite/granite-3.0-2b-instruct/198d1441-1d13-468a-a998-c8cf9f1e7a57.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-base/e9eb1499-835c-4a70-b531-4be5a9718c34.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-instruct/b1fd95ad-767d-4c13-a936-00b08c74ca3d.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.0-8b-base/f87bd357-535e-4450-b01d-b41e1b7571e0.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.0-8b-instruct/300fd27e-4dce-441f-91da-f38bd14ffe5e.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-base/1fd9a2e5-856f-4303-8ac1-611311f3e7b5.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-instruct/4c34d5c6-af1b-4519-8d08-67bd837e9b97.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-2b-base/ddc27df7-1c4c-4563-92b2-5a39380423a8.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-2b-instruct/3e606ef8-9caa-43d4-81d6-8eae9936ab4c.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-base/b9053559-3b90-4de0-981a-dbb49db38eb5.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-instruct/cea89bc6-b1a1-4b67-a136-45e097563a5b.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-8b-base/5eb16113-7d0d-47a0-91d8-ec7dab35efdd.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-8b-instruct/45aa6545-d20a-4dfb-a8a6-01f2fd34c9f5.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.2-2b-instruct/c94079d1-d8b1-4198-8129-8c5a11c310ca.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.2-8b-instruct/cb45306a-096c-4ed5-a028-6d720b26afe9.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-7b-base/f301908e-474b-4ba2-a873-610ca1b6c2bd.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-7b-instruct/06f5865d-a62a-48da-b33f-486fe29e3685.json delete mode 100644 data/hfopenllm_v2/ibm/PowerLM-3b/4f952c51-91dc-446e-bda1-43ed66e1ca3e.json delete mode 100644 data/hfopenllm_v2/ibm/merlinite-7b/dcba3a6f-8f4f-49f6-af74-541de16be435.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.15-02.10-RP/b5d39bcb-dab4-4880-9cb1-68dbd20a3ce5.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.16-02.10-RP/1e597e9b-4e75-4981-842b-dad6f1c15ed7.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.17-03.10-RP/18752dc4-76d1-40dc-9f43-62b8087b7a88.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.27-06.11-RP/fa30c36e-20f1-41ee-a59d-0044f2b76dfb.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.29-06.11-RP/5391ae8f-41b0-41cb-9365-b5cb7649c8b7.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.31-08.11-RP/a95ab4cf-456f-4b3d-9bab-2b755649758d.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.32-10.11-RP/9840baa9-2ddf-4dd9-b3b0-3ec3075089bc.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.34b-14.11-RP/26ff113c-95ca-4716-83f7-4792b46be246.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.34n-14.11-RP/285e1d08-15a0-4d8b-a844-e4cad923ea9b.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.37-18.11-RP/0462269d-94a3-4991-9af5-e55592f344e5.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.38-19.11-RP/c47c4cd6-90b6-42df-a3b9-4fc8f1b3c980.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.39-19.11-RP/0fecafe4-f8f0-4f97-ab2d-589a3856e1af.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.40-20.11-RP/4b5529b9-0800-4cd6-b720-a905ab5e6c9a.json delete mode 100644 
data/hfopenllm_v2/icefog72/Ice0.41-22.11-RP/84783e4d-5eed-474d-9463-a01a0890850e.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.50-16.01-RP/d9fe39c5-24a5-4240-bfc9-59860fcb3911.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.50.1-16.01-RP/2ddf850e-36dc-41b2-92da-e2b45d1544c6.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.51-16.01-RP/b10a9284-fa5e-4a4e-8240-edc98cea6d9c.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.51.1-16.01-RP/2c51bd1d-ebe8-4de9-9749-5f42f7ba3d5a.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.52-16.01-RP/425e6f1e-50dd-444f-b0da-5a0c47d5bf06.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.52.1-16.01-RP/7e1fcf4e-9f64-4112-934c-4808f07d32b2.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.53-16.01-RP/d3666566-09dc-4d53-9996-2301c6fb2721.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.54-17.01-RP/36e5efb9-e3f0-4903-a9f1-3d51453bfdc4.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.55-17.01-RP/a6dba337-81d2-40c6-89c2-aee6de82282e.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.57-17.01-RP/e44b8d9a-f270-45c8-b126-6a8911c35436.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.60-18.01-RP/44d5e1ac-45d5-42aa-b9fa-f18112cf6676.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.60.1-18.01-RP/4246401d-9049-4c83-83d4-e2d9efa4dded.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.61-18.01-RP/26c4785a-0caf-4b01-be5d-1e421bfeb698.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.62-18.01-RP/cc9b9a25-18f9-4cc3-a756-3975a3a3be7d.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.62.1-24.01-RP/b4edb7f5-a675-4627-af96-7ed0909da1e5.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.64-24.01-RP/461b6f40-6f19-48b1-857e-f0fb37f929f9.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.64.1-24.01-RP/e924270d-a655-4093-91b2-f73b7f12eefd.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.65-25.01-RP/af8905e0-e969-45bd-8e09-e7316fff0914.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.66-25.01-RP/e92a6d31-2277-4093-8fae-b3dfaa2d47dd.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.67-25.01-RP/47472cd9-36d3-4074-83d4-af53b9c23758.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.68-25.01-RP/b922f4e1-1fd9-4a32-94ce-4784430cef51.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.69-25.01-RP/5bb2e77f-7709-4eb8-bd08-3c8da4a56310.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.7-29.09-RP/35937213-bb16-4935-9d92-9fa8fd61aac3.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.70-25.01-RP/04122d1b-929d-439c-bb8d-f08508f7a00e.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.70.1-01.02-RP/03beb242-2628-4ea0-a2f3-c3ec43d379de.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.73-01.02-RP/46d55b7b-1972-4cb0-97ca-e04d306282a7.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.74-02.02-RP/32730d82-cfac-481f-9a22-9cbe40646218.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.76-02.02-RP/a290a75f-753b-489d-87a2-ce0637c09f41.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.77-02.02-RP/54032eb0-c4cd-4c76-be2e-f0c81bd26365.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.78-02.02-RP/73b59506-cc1d-413c-a28b-d25e0e6bf413.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.80-03.02-RP/bea2dcd6-4772-4aac-bcbc-4802cfb33495.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceCocoaRP-7b/66275215-28e6-42bc-bc22-5d152682ce53.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceCoffeeRP-7b/9015365c-400b-4fa3-85f2-a1033b030cf7.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceDrinkByFrankensteinV3RP/55d52914-0904-4e6e-8b37-c22b06f5f2bf.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceDrinkNameGoesHereRP-7b-Model_Stock/3677260a-2fd5-41bf-9010-f1b31cedacbc.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceDrinkNameNotFoundRP-7b-Model_Stock/fc54f87a-2e4a-4f3f-b407-e268c4487d16.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceDrunkCherryRP-7b/8d893736-1707-4c0b-860d-16c62ec26d78.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceDrunkenCherryRP-7b/d3d2728f-74bf-4196-a909-43797d8b628a.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceEspressoRPv2-7b/ed241e67-8718-48be-a6e8-19e295a2b5cd.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceLemonTeaRP-32k-7b/05aafad3-e07a-453b-a70b-f18fbd4eb218.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceMartiniRP-7b/f79ac32e-ab83-40c3-9c18-35623f5ae1d4.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceNalyvkaRP-7b/cec76b15-1069-4d37-b8bc-74dde28101f6.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceSakeRP-7b/e4ac0d0c-65ea-4b43-bb4b-7371c6cd5d61.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceSakeV4RP-7b/f8d629bf-df0b-4c6a-8c18-17dda002b089.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceSakeV6RP-7b/6739d8e3-f4bd-4fd5-98f3-887f5ed3f9c0.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceSakeV8RP-7b/a51722f4-29f4-47a5-acba-4c8b5355551b.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3.5/06d0a21f-f6e4-4ca9-a679-8c4502aaaad1.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3/04a4dcc9-3784-4aea-9faf-9db49c2e4c43.json
 delete mode 100644 data/hfopenllm_v2/ifable/gemma-2-Ifable-9B/e4668365-d3dd-4996-9bb1-5b4e6f510264.json
 delete mode 100644 data/hfopenllm_v2/ilsp/Llama-Krikri-8B-Instruct/4d743678-e14d-4866-b1bf-0d660787847b.json
 delete mode 100644 data/hfopenllm_v2/inflatebot/MN-12B-Mag-Mell-R1/720b1476-876c-47d1-bf46-d037389b4b2f.json
 delete mode 100644 data/hfopenllm_v2/informatiker/Qwen2-7B-Instruct-abliterated/4e4f3b2d-5b17-486a-a2ab-c2e89194c765.json
 delete mode 100644 data/hfopenllm_v2/insightfactory/Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model/b738668e-3ac1-4a36-ad71-ad7d2a5256ae.json
 delete mode 100644 data/hfopenllm_v2/instruction-pretrain/InstructLM-500M/623f1b73-1505-4527-b41c-dcb2b711226d.json
 delete mode 100644 data/hfopenllm_v2/internlm/internlm2-1_8b/53f03454-9587-4208-bc01-21de62f59195.json
 delete mode 100644 data/hfopenllm_v2/internlm/internlm2-7b/fb38d8b4-6320-4b8d-bf3d-e3d22bb0ed83.json
 delete mode 100644 data/hfopenllm_v2/internlm/internlm2-chat-1_8b/b127a923-3bf2-4cad-9225-d738efe800e3.json
 delete mode 100644 data/hfopenllm_v2/internlm/internlm2_5-1_8b-chat/a94ae52a-7936-4750-83f5-4740f23adf15.json
 delete mode 100644 data/hfopenllm_v2/internlm/internlm2_5-20b-chat/95e689c6-cd19-4114-b3b5-1672ab849214.json
 delete mode 100644 data/hfopenllm_v2/internlm/internlm2_5-7b-chat/890a8414-bccf-4a66-8013-6c270d017965.json
 delete mode 100644 data/hfopenllm_v2/intervitens/mini-magnum-12b-v1.1/0f8ce410-cf3b-4f78-81b9-a0a1fe91b963.json
 delete mode 100644 data/hfopenllm_v2/inumulaisk/eval_model/121096cf-356b-4069-a0a3-8cf6aad52b81.json
 delete mode 100644 data/hfopenllm_v2/invalid-coder/Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp/fb0bcadf-32a0-4320-909f-2c38ba7d9372.json
 delete mode 100644 data/hfopenllm_v2/invisietch/EtherealRainbow-v0.2-8B/ab941c52-cf33-4b8e-87af-4a73930cf72a.json
 delete mode 100644 data/hfopenllm_v2/invisietch/EtherealRainbow-v0.3-8B/08c242fd-0258-4817-970a-668584ed9385.json
 delete mode 100644 data/hfopenllm_v2/invisietch/MiS-Firefly-v0.2-22B/2171af9a-be5e-4daf-8e67-a5239ccec7bd.json
 delete mode 100644 data/hfopenllm_v2/invisietch/Nimbus-Miqu-v0.1-70B/706f75a1-2f6b-47dd-809e-a830e739b574.json
 delete mode 100644 data/hfopenllm_v2/irahulpandey/mistralai-7B-slerp-v0.1/a9cd0399-4670-4f5c-8c64-c82dac97cd8c.json
 delete mode 100644 data/hfopenllm_v2/jaredjoss/pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model/67cfd12d-0551-406d-bd1d-8ced75c69478.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2-8B/0a31d2f0-196b-4508-861a-1ba7bd28ea23.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.1-8B/57576999-2749-441a-91d6-5a976e83a658.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.2-8B/e44792e6-0329-4784-832b-3043478e70a4.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.3-8B/8b3789d6-51be-472a-95d3-2ae7c34ad140.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-Aurora_faustus-8B/3f4765f2-551b-485f-9020-0cf17a36a887.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-8B/6375a845-5d86-4dcf-bfd2-e836daa4ca11.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-Immersive-v39-8B/65a74446-6964-4f5f-8ea6-aeb1b09595ae.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-v38-8B/dcba5998-3b84-4753-a4fa-2558ffe3e69b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/0af6b3c0-6638-4bd8-bdd9-349e2b9ca71c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/4e332594-d0b9-4913-9950-208abe4faab7.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-8B/5ad2ad73-47ed-465d-b4c0-b358e6b6435f.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-light-8B/c9f716ef-0aa6-445f-8fc9-b102f3a0ea2a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v23-8B/a2e32a77-867c-4921-ada4-c7b169efbebe.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v24-8B/f76f759f-d05d-4eb6-a2b9-3b1dfbe840f0.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v25-8B/ece0bd6b-4eec-485c-942b-e23f3295c2f8.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v26-8B/ada110bb-0988-4c19-9798-74577dde5ce9.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v27-8B/ed4f994d-d196-40bd-8f8f-f6a7f07c3c90.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v28-8B/57395f9a-0534-453e-80fc-96e9dc5cd9c3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v29-8B/f8f70702-9ab4-4e1a-a11d-090627d58f02.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v30-8B/3cab8bda-bdf6-4345-b89e-18d34a8f6361.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v31-8B/0955fc17-8878-401a-9ec3-149528ee51e1.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v32-8B/c63bf49a-e7d4-4853-8684-9cc03eaa7840.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v33-8B/65e6a3b6-4291-4591-bc0b-576930061c68.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v34-8B/1ddf9e02-4066-440e-a777-fcd3f96bc4b3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-8B/f9f96bb2-edbc-4112-97aa-a7420dea32a1.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-light-8B/3a24b30f-7698-4ecb-ac26-3537a0b38616.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v19-8B/d4030df6-2be6-4f46-9c9b-ce3037b9a004.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v20-8B/ec234403-f43d-46a0-84a4-ab47673226b3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v21-8B/805379f4-784f-4602-92e8-180df4da9fc3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v22-8B/9f3920aa-9400-46f1-bcfa-969f69b3335c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-8B/26cbf444-ab93-409a-b85d-e2bd267eae5e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-alt-8B/7c2b17a8-1de2-4441-a281-fe3fd043f831.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-8B/94c5756c-cbde-46e2-90d2-207678373061.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-alt-8B/e0048124-89bf-4327-88a8-00aa51ee29af.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-ultra-light-8B/9d776307-43af-43bb-ab64-52fb7f331cfe.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v13-8B/d8d41981-a7c8-48e9-a63c-86520a0f23d5.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v14-8B/1355985c-fbcb-4eac-8435-417d6034f2f0.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v15-8B/44486b02-7bdd-4f59-8d4e-5c8deeb1fd60.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v16-8B/45ae3dc3-6dc0-4d10-99cb-a7f330110906.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v17-8B/6b54763a-6329-47fb-bf50-296604251b47.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v18-8B/96a26bf3-b4b2-465f-8ce6-a2ef943c001a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-immersive-sof-v44-8B/655b047f-c3a8-4c9c-b864-81d318b2f506.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v10-8B/f62fed77-e166-422d-b5ce-c50b7bccbf4c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v11-8B/7ffdabf3-0a8e-4316-b6bd-85b10a81db53.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v12-8B/2c93c987-b32d-4a02-8df4-949cc45b8eb2.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v2-8B/02e7c1d6-9db1-4de8-b13e-afd752b3669a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v3-8B/580a3045-338a-47b2-8ed7-54c993d5aa90.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v4-8B/e71d3be5-ea9d-4426-aa58-5806b7541aa6.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v5-8B/1174683a-9488-4c6b-be6b-e5a96328a96f.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v6-8B/3789b37f-daf0-4c21-82b8-309cbf00312e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v7-8B/8586cdc1-dd4e-4112-a59c-f6bc2766701b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v8-8B/946a7b16-dfa6-42ad-97c1-955bf8a40dae.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-8B/d9a6cc31-57c4-4480-a019-25a34b31fcc8.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-TitanFusion-Mix-8B/279bd5fa-0ab1-411b-871b-bd9ff23853f6.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-8b/c26fae10-e65a-49ac-a2da-2dbf024fd10d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-8B/6d37b2b4-630e-4471-b7a8-50f8a58902fe.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Asymmetric-8B/de687865-4297-4130-bcfe-0c5116c9b0d1.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Aurora_faustus-8B/ee1acad1-5dc4-4d8b-8aca-544af5dc2392.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-VENN-8B/52e3f1b1-5a1c-4cca-a36f-9f60284e1883.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-8B/2d54c67e-fad5-4a61-b3ae-0393f16dc1ba.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-light-8B/5120e433-f5c7-45fa-be56-566101556271.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-8B/7f4b4668-c3a0-4575-957d-ba321d55f420.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-v2-8B/9245b74d-4b9d-4158-a402-0c3742097eba.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bbb-1/29a5fcd3-9c22-424c-ab17-70cfe187aea1.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bbb-2/af71bfa0-1077-4c96-a4c1-0aa28dc789bf.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bbb-3/258ebe6d-191d-4804-b5e1-5cd6ce93ba88.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bbb-4/4765f197-82ed-44b3-9a7c-7cbabc6ecd8e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bbb-5/a5d66f97-1f4b-43da-a83a-4a262e297fd9.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bbb-6/5d29cf73-65d6-4965-a504-4caf07108cc8.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bbb-7/15ec04ae-30d3-4ffb-9b0c-54ba63410e3d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-1/2ed96c70-390b-44de-aa08-9883a2f33ff3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-10/67c95889-8a67-40fd-99e2-62e767c16416.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-11/a518f39d-e073-493d-9a4f-9af53fc71abf.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-12/24f0d9bc-d743-4f46-b5a6-e855e39a1daf.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-13/3d27f6d9-05a0-44bd-a225-6e6a0bf4a35b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-15/ad28e7b8-69e6-4fb9-bec4-62c67fae6d58.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-16/0da639d4-181c-4ee1-808c-3de8003c2471.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-17/480bd62c-bc67-4379-bce0-b28a5d6bdf4f.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-18/dd94c18e-b2c3-4135-aa2d-5eb0248315d0.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-19/a2ae2953-e341-49be-8469-32bd41d780d7.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-2/23bdd694-f250-46dd-9b8b-526fda47bc9e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-20/d600a69d-1952-4e30-abe8-1769ab63ac29.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-21/afc031d4-852e-4ead-9098-6ce30112b459.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-22/cb33e29f-e5e1-4bf5-9e20-86d9c3486d2d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-23/a4b93124-1151-4f69-8a5e-6b916e8cf11f.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-24/efe11d8f-65e6-4ba6-8148-fdd43c9346be.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-25/923da7be-2ec8-46b2-8187-fe08eb86d5a0.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-26/1652b9fe-640a-48f9-b7a5-20ae28fb5985.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-27/572463ed-f6b9-460d-9c38-0e0ee5327511.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-28/5f6bbbfd-16a8-4ea8-b9d9-b436a882700a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-29/32322361-f18d-480d-9475-cd11a45bc4bc.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-3/f62d1aee-2d9e-466e-85e2-002fae5d2504.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-30/af389bf1-da63-49a9-9e49-32613d8d05b8.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-31/ea13ae62-d050-4cc4-9cbe-99eedfc206e2.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-32/1e697620-36a7-459c-b88c-405febb57c3a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-33/532723e8-a9b7-4f72-a015-c2bd9363b5d8.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-34/be096a57-7d81-4999-919a-ed8a243012b2.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-35/cadeb016-e158-4a49-921c-efe0e4eb0cb2.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-36/c606d7b9-3ea3-49d4-9ecc-9610ed4b4eac.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-37/04a5eed3-7eea-4d9f-acc6-5a96ec987e2b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-38/a1c60d74-dabe-423d-9e40-3dd8112d7d8e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-39/29c7bc9b-6833-497b-a553-2941026efea5.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-4/09a60955-978e-4136-bdde-d5459e37ad2c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-40/501744a2-070a-4378-9232-f7ccd9b2a67e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-41/369efdc6-6529-477c-b5f0-d229c8102491.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-42/906645f3-2041-4380-8118-ac26b92297ba.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-43/57fe8deb-02dc-43a8-8a92-14bdaf61dd67.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-44/95f2fa22-3da9-4876-ace3-50763f2b2453.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-46/b2f9e38f-c2a1-4e5f-a7ce-4e33a05b503b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-47/b3173a2a-8309-498d-961b-0167d5d5dea6.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-48/0d59dd75-c999-4a7e-919a-fd084202fc9c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-49/639e91d9-ebbf-4ba2-bce3-6953e7c91e32.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-5/56a5fb9b-a4b7-4290-9ec9-6864b3efaa82.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-50/d03fb481-be0b-4dfb-bb4d-54067e058e99.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-51/d8fc3475-83e9-4790-a472-72b442087562.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-52/57efd335-4873-4e01-bfc3-0d704b3d482a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-53/25fdcc8a-0e7d-4148-8508-2631ea6deb05.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-54/f5f63d06-7e51-4b91-8814-ecbda604fe6b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-55/5326c33b-6b8a-472a-9058-a9e9fe83b599.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-56/28674053-e1b6-4f0a-a90e-5dd5082ec164.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-57/fd27bfa7-11b3-46d3-915c-373ddf5a9865.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-58/91f190ba-39c8-47af-8351-73d1f382dd99.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-59/b637b55c-dd05-4060-bf33-e63e9de7fac9.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-6/bcacef79-d7c0-46e7-9194-43541c2f01fc.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-60/77a358c7-59fa-4b22-a190-dfca86c5166b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-61/ad4c8922-7079-4383-8f42-d3de6326a1e1.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-62/7f89eded-e5fc-4b3b-9afd-dcd71b7b44d5.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-63/07cb94ab-0aea-4ce2-89b0-4378cb892c7e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-64/5fb04756-c7bb-4772-b209-0d9a300bbf7d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-7/0c02d1b6-2d31-4c54-b881-588cbfb0c686.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-8/a32e4d22-8096-4537-a68a-98ff9171ac8c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-9/4e45b666-fa7e-4a38-8b6b-65846876c8d9.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/dp-6-8b/d9cb1d13-2af5-4385-aa78-5c053e00e6c6.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/dp-7-8b/6afaec07-ebb8-4f3f-af48-c679f38f4917.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/ek-6/bf8370c9-baed-4034-ac38-c6f796baca15.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/ek-7/d397c078-6fe3-44a8-859c-a0f7c551dc3a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-1-8b/ed61cd6a-bbf0-45f2-9536-a7a262d5d6fb.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-2-8b/6be795f4-0784-44bf-8926-e3060ec37dcf.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-3-8b/d4d808f5-3b79-43b5-8076-d3f785083789.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-4-8b/370f5923-91d7-40d2-bd06-bf2b657b8ef2.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-5-8b/5334e5e4-d243-4c20-912c-d0ded74d6ea5.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-6-8b/7306f2cd-4fd2-4dd4-b06b-8c9aa558388b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-7-8b/68cc19eb-423b-4d6d-a3bf-eac6f666bc4b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-8-8b/59aa26a8-93b3-43fc-8c38-ef67cd8efd80.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-9-8b/220cd306-0613-4c8f-9848-4af812a1d37f.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/fct-14-8b/39a6a40c-3fa0-41ba-9d13-da9381263d4a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/fct-9-8b/4d037b71-5d03-41a1-bf23-c0aea0cdcbbb.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/fr-1-8b/16baf620-7dcc-49f3-a787-b431e11ad4f6.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/fr-10-8b/4745add2-7bcb-4c05-8b12-6bd30856890b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/fr-3-8b/f68b122d-4dec-4d5c-ac22-198da3d3e96b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-8B/2e20f780-ceab-4d1d-a1ab-35f4f0ac44aa.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v2-8B/f21bcd75-fc9f-4266-8976-3227b18b6b32.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v3-8B/7c1a81ec-1cb7-4858-8f1f-23b3ee49b73f.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/knf-2-8b/1cbfd1ad-237d-4cd3-8b5d-3135c194fcc0.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/knfp-2-8b/ef5c1813-a74d-4b3d-9911-c27a46c1c84e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/knfp-3-8b/df50857d-c90e-4ec8-a9b6-96a6d2f894b1.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/kstc-1-8b/774d54fb-a445-4ed9-b79a-9c1346537e98.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/kstc-11-8b/420b8be3-3560-48e8-8ab3-bb55338a9069.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/kstc-4-8b/c118b75c-597f-48a7-a4eb-675af72c9930.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/kstc-5-8b/e75534d3-b994-4e88-9274-7b62f61916cf.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/kstc-6-8b/770a1ff1-057f-49a7-9402-c6dd881ac03d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/kstc-8-8b/6cc9790d-9b02-437e-8ac7-be4152f5b17d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/kstc-9-8b/264f5b42-a3ac-4af1-8145-c5763b8e7fa6.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-10/549db368-437a-4982-ba5b-5c4d7bf203ae.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-11/0d098a19-7e8f-4a52-8466-729be91388d8.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-13/83335f65-25a4-4bec-a901-587567ed0e99.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-14/02fb24c3-927f-4c21-bd47-b883521162a3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-17/2a6507c7-44c1-4416-9ff1-36abd6af3b73.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-2/327a146a-8cfd-4480-8342-46afde530677.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-20/0700fb7a-e722-432f-a64d-c040bba4deee.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-22/131d3a7e-43dd-4189-8466-6562703b3bdd.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-23/8f6d7008-b8de-4a76-94aa-bbecc93ef3f7.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-25/aadb0ce5-a1aa-4b0d-bec4-8bb0e8e54a1d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-29/a73250f1-399a-4afa-bf83-4036dce78ef3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-32/f68bf680-9626-4952-b95e-12a18fd60820.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-33/d6a78a5c-4a2e-4370-88f2-d8627a94f1ea.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-34/7b5eab2e-fba3-47d5-9839-02249c2568c5.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-35/2acee2c3-4322-4152-8151-c1d571475b7c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-36/67ffb2de-0410-44a2-aad7-4a32e2c49c7d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-37/2923aeb3-982f-400d-9588-707583c75a1d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-6/b6a622da-5ce8-4ea5-a82a-f3a2a299ddf2.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-mix-1/7b06ac17-bfc6-43d5-99e6-d2b7a31290fb.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/sof-1/fd481b93-55b2-4831-9be9-1b1b2886fda3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/sof-10/f159748f-234e-4962-b582-cd5805448f33.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/sof-3/044d53dd-d134-4959-a70c-46f11cc0b300.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/sof-6/f05501fd-7c06-46d5-bc20-a9d0cc5c2e0f.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-10/5c44a2f2-23e3-4c9f-9b7c-9012ca8b15e9.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-11/80e5134b-0733-41cc-8b4f-ef32fbe57066.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-12/61123e41-7b2a-40da-9f7f-b830c27d7f12.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-13/b93c31d7-54c3-47b9-a267-3f8fdb796805.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-14/b3eaa4c5-7abc-4e2d-9c11-c70ecb8a843b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-15/3b06f75e-3d22-4428-8d4f-2e704b96961e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-16/dfda4aab-f8d4-49ee-b141-78539b69007c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-17/690f3c19-c148-458d-b4c5-87761d72b851.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-18/b6a18246-776d-463f-80d5-140df74e9704.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-19/9831abdc-ad08-48c0-8384-86240e7350b5.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-20/96a572e5-4751-46ce-9202-deb223ef4dfe.json
 delete mode 100644 data/hfopenllm_v2/jayasuryajsk/Qwen2.5-3B-reasoner/f4320b1e-ea4f-4aea-8dab-cdb221ce53e5.json
 delete mode 100644 data/hfopenllm_v2/jeanmichela/o-distil-qwen/8376c0bf-f9c3-4529-b13c-c57106182d15.json
 delete mode 100644 data/hfopenllm_v2/jebcarter/psyonic-cetacean-20B/97a80145-e621-4603-8ff8-2cc4bd74190a.json
 delete mode 100644 data/hfopenllm_v2/jebish7/Llama-3-Nanda-10B-Chat/99a7881c-cca0-43d6-96f5-ce5292ed60a0.json
 delete mode 100644 data/hfopenllm_v2/jebish7/Llama-3.1-8B-Instruct/60ca8f7e-1c20-4adb-bb84-892bad3c0d63.json
 delete mode 100644 data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Base/4a0f8dc7-9446-4dda-bf49-8cca4851746c.json
 delete mode 100644 data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Instruct/6eb3a040-8234-4d31-8274-6987b0e4e3b4.json
 delete mode 100644 data/hfopenllm_v2/jebish7/Nemotron-Mini-4B-Instruct/16053077-38fd-4136-81a5-fea0d4cd927a.json
 delete mode 100644 data/hfopenllm_v2/jebish7/aya-expanse-8b/25abb99f-536e-4638-8611-a1db5dee931d.json
 delete mode 100644 data/hfopenllm_v2/jebish7/gemma-2-2b-it/aaf0e5bd-b033-455e-bb23-b12b6f7c4520.json
 delete mode 100644 data/hfopenllm_v2/jebish7/gemma-2-9b-it/b3a46478-c5f4-4c74-9bf0-d1ba616ae24c.json
 delete mode 100644 data/hfopenllm_v2/jebish7/qwen2.5-0.5B-IHA-Hin/169fb05f-5201-47b8-a06e-7d01e574c689.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen-7B-nerd-uncensored-v1.0/db076309-32e5-4d46-9786-ff14f8daf5d2.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-minperplexity-2/cde914dc-7d57-425f-9787-e4b8d36d61cf.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v0.9/5d793ce3-a7fd-4ee3-b32c-c9da63ec0566.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.0/8c645c9f-02f6-44a5-b295-d6364ed49464.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.1/97bb5519-e2d3-44d5-abf4-b5263c2b3245.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.2/bd3d78d3-3ff1-4a92-a316-e4e30787a331.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.3/d8951ed7-f4ef-49ce-891e-8d8509e9cf93.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.4/e1772d6c-fd26-43a7-82b3-7997d8a6809f.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.5/febaf893-6aaf-4c87-89fc-cc865ebf2859.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.7/0ad591f4-c846-4fd1-8536-a169e0a7e4ab.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.8/0a318ebd-7bbb-456b-a6e4-9b480a858b5e.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.0/e1cfdc32-3c5e-4f4b-a205-f416c96cf5e6.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.1/85426280-8138-46d0-a111-b59b0d7c86c8.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.2/32bbd26e-05e7-4a0f-a491-8f54cea9f3d3.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.3/86ed6833-ae85-4a8e-b840-b0c9540083ce.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.4/2f751ac3-5ca5-4d0d-9ad4-48155e51468a.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.5/9677e68d-afda-4917-825c-83318219ff59.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/jeffmeloy_Qwen2.5-7B-minperplexity-1/23cd57c2-bf7f-440a-ab3e-edfdede5e8cd.json
 delete mode 100644 data/hfopenllm_v2/jeonsworld/CarbonVillain-en-10.7B-v4/bec23315-f98a-4211-81a0-c49f395e66c9.json
 delete mode 100644 data/hfopenllm_v2/jiangxinyang-shanda/Homer-LLama3-8B/1ac5faef-7fa0-4b58-a6ba-0c444a2023a8.json
 delete mode 100644 data/hfopenllm_v2/jieliu/Storm-7B/39327803-11e7-4b28-8750-81feb027e8f3.json
 delete mode 100644 data/hfopenllm_v2/jiviai/medX_v2/ce2b6874-0fc8-4364-a526-7b25b101e1e3.json
 delete mode 100644 data/hfopenllm_v2/jlzhou/Qwen2.5-3B-Infinity-Instruct-0625/9f9ebc90-31f9-45c1-b9c2-07b727b12f3d.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01/d189a2fc-71f5-4bc9-a0b1-7e744a19921f.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1/1eb697fe-9dd4-4a41-aa47-33456df39e2d.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01/5f10df7b-cd2c-44ca-b13a-2852483c71f8.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1/3abbb4b6-8050-44fd-b066-0f061ce2f4d7.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01/5f47e65d-293f-469e-a18f-5627ca1adf44.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1/b753c1aa-8a0c-4600-99ec-8eb51ab50da7.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01/15c21655-9af8-4bee-9884-b047683e9adf.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1/f642de95-218a-4db0-807f-1bb97618b4f6.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01/01443b06-9ad3-41f5-ae0d-bc84086e0a0d.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1/1ee8c377-2236-4225-942f-ef8ce5770741.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01/4ee9aa78-d9eb-4a1c-91c4-f29f093b95d3.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1/419c6631-805f-43ba-9db8-5296f8d221ec.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01/3fc1822f-4a43-4a3b-90d7-fc163491c90a.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1/76b4037b-c5d0-435f-966a-bd88b1665dad.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01/757b85e7-84c8-429f-aeb4-870852fa8959.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1/acab4982-1205-4362-803e-306b1e2371bf.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01/0e549b5d-c1d9-443d-9a80-8dd34dadd22e.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1/d3d4eccc-8792-40e5-91cf-22885f4cbaf5.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01/708aded5-6252-44e3-bf0d-08bf3e7f32e0.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1/ce6d31f2-f38e-4af3-85a3-d2f6c80f71f1.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_linear/5efcc291-ca9a-4ca9-b2ed-dab37dce5f5a.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.1/47320824-8064-40d4-a08c-810faafbba77.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.3/8baeef58-0ba6-4723-8f23-7a4c386f2cad.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.7/0387ca63-1e31-4eaa-ac7c-35d417548c54.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.9/733983fe-4b9c-47e6-963d-c57829b6f1af.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_linear/80c4859d-8016-4650-939f-100ba2e6d808.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.1/21724d3a-cc6c-43eb-9d69-46d8d91c97f8.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.3/d781945e-e9df-4136-90cd-632f0bed6246.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.5/8f146bb5-dd4d-49ce-ac60-76f66321feb8.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.7/89bfba6d-c622-445e-b0b9-512aadcea7cf.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.9/9c27f2e6-ebbe-4fac-bc51-74455d3a6512.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-4k-DPO/455ef1e0-bdf2-49bf-a53d-2c9e3d00d5f3.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.2/e04a76a6-ac22-43b2-bbf9-196a08de2949.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.3/2fcb74f0-add1-4d46-8a0f-8578a616dbed.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-DPO-v2.0b1/51530638-ef76-43ce-9396-8a0d07988712.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.1/74d99e4d-0e6f-4804-aa52-0dc76d37fac3.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.3/80e8b9f0-b507-4927-9d24-1c793e3783cc.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0/7b037520-a5e9-4b58-80f3-f0ecc5957c67.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b2/10b88d05-62d2-4603-9d04-b0854e39ed40.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b3/4b693f41-d811-4b64-892c-d840eee5ace4.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-Revised/90d86c8c-3aa6-42ba-a94f-75c961e65c41.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.0/8318ae52-6ae3-45ce-82db-73f8cb5ad7c7.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.2/b20a1d13-2f14-42e4-bdde-49f053cef325.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Distilucie-7B-Math-Instruct-DPO-v0.1/51521dfb-d4b5-45df-ac2a-54190aed0b9f.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1.3/997a1ceb-185a-4e6c-8383-eb5a6f976771.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1/22101998-c3d3-414f-9ed1-99330cdbe3b2.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.0/a2408953-a7eb-449c-b80c-3620915d44d0.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.1/d65e5b08-7d3c-4c0d-85fa-496db65a235c.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Lucie-Boosted-7B-Instruct/ce2c9614-46d2-481d-ac25-3cc71a93bd5e.json
 delete mode 100644 data/hfopenllm_v2/jsfs11/L3-8B-Stheno-slerp/e9ba998d-8147-4046-afae-9ee7d544e98d.json
 delete mode 100644 data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v4/c44f1012-1123-42c8-b110-5735dc756fd5.json
 delete mode 100644 data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v5/5088f6a6-2acf-4d10-8b78-0d5bd4126ab5.json
 delete mode 100644 data/hfopenllm_v2/kaist-ai/janus-7b/b4d96088-5cc0-4ebc-8b8b-8c7e9f90420b.json
 delete mode 100644 data/hfopenllm_v2/kaist-ai/janus-dpo-7b/529dba11-53af-4045-ae46-04e1b9838d4a.json
 delete mode 100644 data/hfopenllm_v2/kaist-ai/janus-rm-7b/391f6d6c-418f-44be-910a-fb90b5712649.json
 delete mode 100644 data/hfopenllm_v2/kaist-ai/mistral-orpo-capybara-7k/2ccccb4b-7260-4a1a-9426-117e359c7c5c.json
 delete mode 100644 data/hfopenllm_v2/kavonalds/BunderMaxx-0710/84afecec-453d-491c-9f5a-de31d8fba43e.json
 delete mode 100644 data/hfopenllm_v2/kavonalds/BunderMaxx-0710/dba3a3a4-cd23-44c9-823f-0bd88cf6465b.json
 delete mode 100644 data/hfopenllm_v2/kavonalds/BunderMaxx-1010/1179bcce-558e-40ad-8537-c74c59557975.json
 delete mode 100644 data/hfopenllm_v2/kavonalds/Lancer-1-1b-Instruct/fe0a5c17-6c8d-4f06-a58e-47648ef9ecec.json
 delete mode 100644 data/hfopenllm_v2/kayfour/T3Q-Qwen2.5-7B-it-KOR-Safe/81cf8cbd-33bc-44ab-930a-65242e1ae7b2.json
 delete mode 100644 data/hfopenllm_v2/keeeeenw/MicroLlama/173bb053-e817-4551-b169-c3f71163650a.json
 delete mode 100644 data/hfopenllm_v2/kekmodel/StopCarbon-10.7B-v5/b7e6a86f-340c-48ed-a828-2e80a13aa515.json
 delete mode 100644 data/hfopenllm_v2/kevin009/llamaRAGdrama/bd221eee-7aa8-4d6f-a6be-89ee5568e729.json
 delete mode 100644 data/hfopenllm_v2/khoantap/cheap-moe-merge/8727a325-a515-4456-ba34-65c30f84644a.json
 delete mode 100644 data/hfopenllm_v2/khoantap/llama-3-8b-stock-merge/3e4011fa-d480-4c16-9371-2025bc834358.json
 delete mode 100644 data/hfopenllm_v2/khoantap/llama-breadcrumbs-ties-merge/867499a7-589b-4564-b04d-a004b7c0abb4.json
 delete mode 100644 data/hfopenllm_v2/khoantap/llama-evolve-ties-best-merge/52f1fb51-fc7e-4cc2-918a-7c7226ae2ce5.json
 delete mode 100644 data/hfopenllm_v2/khoantap/llama-linear-0.5-0.5-1-merge/5f4a8fb6-b22d-4eb2-aaef-da05ca45fbeb.json
 delete mode 100644 data/hfopenllm_v2/khoantap/llama-linear-0.5-1-0.5-merge/3278855d-7bd1-4e7e-b27b-b1393006e7e7.json
 delete mode 100644 data/hfopenllm_v2/khoantap/llama-linear-1-0.5-0.5-merge/5193ab4d-1627-43b5-bfb7-89e08ea1f810.json
 delete mode 100644 data/hfopenllm_v2/khoantap/llama-slerp-merge/598faeda-48fb-43a8-aaa9-849d5dfcea79.json
 delete mode 100644 data/hfopenllm_v2/khoantap/moe-out-merge/d1afa2fb-1256-4dd3-b13b-802917bf481b.json
 delete mode 100644 data/hfopenllm_v2/khulaifi95/Llama-3.1-8B-Reason-Blend-888k/397c9bc3-0af5-453c-9b68-5360783dfbf7.json
 delete mode 100644 data/hfopenllm_v2/kms7530/chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1/9bb39652-c79a-42bf-b6d8-c4ed6174a4c7.json
 delete mode 100644 data/hfopenllm_v2/kms7530/chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath/7e793244-b746-4aa4-a401-dcf5884f61a4.json
 delete mode 100644 data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1/26a8da03-debd-41e3-8ee1-2827d76b26ca.json
 delete mode 100644 data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1_nonmath/e214c326-dd84-4915-bba1-faaafbb026b2.json
 delete mode 100644 data/hfopenllm_v2/kno10/ende-chat-0.0.5/98a5ea0a-6e45-48f8-8219-32099b9fa9d0.json
 delete mode 100644 data/hfopenllm_v2/kno10/ende-chat-0.0.7/40d7d17d-2d41-4d23-83c1-ab5f3320e36e.json
 delete mode 100644 data/hfopenllm_v2/kyutai/helium-1-preview-2b/d881a83a-9ba8-4919-8b89-45f5a7220621.json
 delete mode 100644 data/hfopenllm_v2/kz919/QwQ-0.5B-Distilled-SFT/d6c966a1-7927-424a-9886-b98688d27e6f.json
 delete mode 100644 data/hfopenllm_v2/ladydaina/ECE-FDF/c09fe163-a7f7-4b6b-b407-ee8d698b2ee8.json
 delete mode 100644 data/hfopenllm_v2/laislemke/LLaMA-2-vicuna-7b-slerp/b3979c7f-0596-4a24-b264-73a17ba19821.json
 delete mode 100644 data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-FT-V5-MUSR/f6156893-92e7-4c4f-bff4-8b6d774ecbd8.json
 delete mode 100644 data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-SLERP-V4/8b1c19e0-8b47-46ae-8bf3-f84c7d3a9c0e.json
 delete mode 100644 data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1/6221102e-4e8c-46dd-8c03-fa9e92b7e4ea.json
 delete mode 100644 data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V3/329e5e91-10ba-4795-ae86-dda95e698b4f.json
 delete mode 100644 data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V4/3fe89b13-135d-4790-871d-74e7a28ea2e9.json
 delete mode 100644 data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V1/4b807741-f1b9-4964-9bc9-bb93f9b34217.json
 delete mode 100644 data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V2/c52a8a4d-be91-4a0d-8cd5-8473a42f0978.json
 delete mode 100644 data/hfopenllm_v2/langgptai/Qwen-las-v0.1/f6e157c4-0ce9-41c9-b885-9222d894ff0c.json
 delete mode 100644 data/hfopenllm_v2/langgptai/qwen1.5-7b-chat-sa-v0.1/fe52a94a-5324-4b59-accc-dfd1f9d4aead.json
 delete mode 100644 data/hfopenllm_v2/lars1234/Mistral-Small-24B-Instruct-2501-writer/1241f5e3-54eb-429e-b109-a5e163e39eda.json
 delete mode 100644 data/hfopenllm_v2/leafspark/Llama-3.1-8B-MultiReflection-Instruct/8ccc7c8c-1d14-45bb-9a6b-f8f69e506139.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-9B/5531b59e-24c0-41af-ab6b-d6a5e38b0a98.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Advanced-9B/63e82cb3-2f6f-4617-abb7-ae093bc27830.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Remix-9B/0feb74e6-40d4-472d-9233-27faa2d3f802.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2-9B/e74dd005-c9b5-45c9-b7f5-455c3110e09b.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2a-9B/d094bf6f-9952-45c7-995e-d7eda07f4668.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2f-9B/0e5f3393-8a6a-4f2f-948a-a37ae4d8fdeb.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3-Advanced-9B/f91982ac-0cab-415a-8503-e090d195bd05.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3b-9B/fb1af66e-7828-495b-8277-5cff77c3070e.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3i-9B/ac84c157-4d11-43c1-8731-b1e5cfa91668.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3j-9B/bbc812dd-9a9c-4f99-b813-50361025eea3.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4-Advanced-9B/fc818799-49d5-4fca-b131-ebe8d5d831f1.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4a-Advanced-9B/33349989-8573-4d71-ae0f-99691fdaffc3.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4b-9B/91551de5-d8ac-4c0d-b9b4-3627db947f0e.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4c-9B/c2d2c1f4-aaab-45f1-b3f6-5b4ea56b696e.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4d-9B/36821a8b-af18-4631-b4b0-7e4b37bb194b.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Llama-3-RedMagic4-8B/e402d129-f4f1-4b95-b079-4f30936119aa.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/llama-3-NeuralMahou-8b/814e1ea7-a639-4b05-9208-0bf537ea5479.json
 delete mode 100644 data/hfopenllm_v2/lesubra/ECE-EIFFEL-3B/35a50d36-31d0-454b-a13c-80ca26945f94.json
 delete mode 100644 data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv2/87347017-4ff1-4bd3-a1d7-8f3999061209.json
 delete mode 100644 data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv3/976184ed-c4ed-4898-83c7-521a8a8309ac.json
 delete mode 100644 data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V1/fa52f072-7725-4a4e-b728-042e5897a1bd.json
 delete mode 100644 data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V2/6374dcee-301c-4f28-9316-82ed8e693089.json
 delete mode 100644 data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V1/b7c95cb4-f32f-466e-a28c-32afd9ec5578.json
 delete mode 100644 data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V2/bddd742b-f7c9-44aa-ad2f-83f51a4625be.json
 delete mode 100644 data/hfopenllm_v2/lesubra/merge-test/099af0ee-c06b-4435-8f97-27681f3eddff.json
 delete mode 100644 data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-full/fa826f3a-8688-4518-8d44-68189abb47ba.json
 delete mode 100644 data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-half/10d29dc0-3486-40df-9933-1ce8f0fabaa2.json
 delete mode 100644 data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top25/741ff375-3392-461e-a9b0-e0dab4e6e9f8.json
 delete mode 100644 data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top75/c3d709de-118d-40c2-ab89-040efedd7fdb.json
 delete mode 100644 data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual/9be3dd27-93fa-49e9-a628-5a77a8a3bb9a.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_145_/be850d1b-bf75-4c34-830f-8881792ac842.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_200_Gemma/6b644b97-4fc3-4826-9ea9-68be1dc8e947.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_212_QwenLawLo/861d41f1-6d33-4e07-96ea-2c39a36c4b63.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_212_Qwencore/7501b038-4847-45bc-8b92-6800d7a58c1e.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_230_Xiaqwen/db48206d-700b-45f3-b597-8752110113b5.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_375_QwenDyancabs/b52b76e4-9dec-4336-88b1-d98b95b95d2a.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_456_QwenKoen/ba9ec2ea-2bce-4999-9e48-e1d0795b31d0.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_7B_KoenQwenDyan/724221ce-d7b2-43cb-8e16-72ac529a7b60.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_7B_Qwen2.5koen/552f3814-d071-4d00-a895-b739dffdcb2d.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyanKoenLo/d3819133-bae8-493d-9a86-aee67da5d115.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyancabsLAW/5c3a022f-7221-4b4f-ab67-d5b69c558434.json
 delete mode 100644 data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/c161b868-746f-4d88-9f41-eb8283a7b87a.json
 delete mode 100644 data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/f79a76fc-09ff-48c8-b0e7-5f18e0750e6d.json
 delete mode 100644 data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V5/39f4d1ab-fd42-4746-b949-9666ce32f9d1.json
 delete mode 100644 data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V6/8348f316-9109-4229-9fee-edc02431befa.json
 delete mode 100644 data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V1/6b2346c6-5fbf-4195-b3bb-66bbd446ca53.json
 delete mode 100644 data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V2/8645ffc1-6487-4205-b8b0-e980e094ac6c.json
 delete mode 100644 data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V3/2c6d1e57-7673-4a86-808e-6ff6a7146a11.json
 delete mode 100644 data/hfopenllm_v2/lmsys/vicuna-13b-v1.3/64ab8b1a-62be-4561-8f0c-e42f1fe37178.json
 delete mode 100644 data/hfopenllm_v2/lmsys/vicuna-7b-v1.3/3eb22885-eb7c-4c85-b79f-cd47ffacd551.json
 delete mode 100644 data/hfopenllm_v2/lmsys/vicuna-7b-v1.5/8956d608-c627-469b-943d-bfad6c7382af.json
 delete mode 100644 data/hfopenllm_v2/lodrick-the-lafted/llama-3.1-8b-instruct-ortho-v7/9ff060c8-d4fa-4880-a0cd-9581f5c2f574.json
 delete mode 100644 data/hfopenllm_v2/lordjia/Llama-3-Cantonese-8B-Instruct/e3d6b3d7-a231-40c1-bac9-0b7fcb478bca.json
 delete mode 100644 data/hfopenllm_v2/lordjia/Qwen2-Cantonese-7B-Instruct/20acb302-3a74-4425-af4c-a1d719b90a88.json
 delete mode 100644 data/hfopenllm_v2/lt-asset/nova-1.3b/a8613588-687d-4291-ae5a-57688501cffd.json
 delete mode 100644 data/hfopenllm_v2/lunahr/thea-3b-50r-u1/83dd67cb-5508-4aa5-9435-d5585b7f3d52.json
 delete mode 100644 data/hfopenllm_v2/lunahr/thea-v2-3b-50r/26d981bb-f2e5-4195-8d6f-594bb0b26f4a.json
 delete mode 100644 data/hfopenllm_v2/m42-health/Llama3-Med42-70B/df06c977-b54c-4668-837f-eb583ef24d29.json
 delete mode 100644 data/hfopenllm_v2/macadeliccc/Samantha-Qwen-2-7B/31a8ac03-f58b-46e3-9f17-53311b1fd506.json
 delete mode 100644 data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-base/3e4a7141-7a82-421a-a107-bbac3cbafc9b.json
 delete mode 100644 data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-it/9a3069f2-81ed-484a-b6e6-a45a259e9a43.json
 delete mode 100644 data/hfopenllm_v2/magnifi/Phi3_intent_v56_3_w_unknown_5_lr_0.002/c0a3d0c3-c541-4606-a925-4100b062284f.json
 delete mode 100644 data/hfopenllm_v2/maldv/Awqward2.5-32B-Instruct/20685a4b-686f-4cd4-b49d-3067a005256d.json
 delete mode 100644 data/hfopenllm_v2/maldv/Lytta2.5-32B-Instruct/85a91293-cd51-4f79-8b98-2f4bc67d78c1.json
 delete mode 100644 data/hfopenllm_v2/maldv/Qwentile2.5-32B-Instruct/d2e3a6c2-4e67-4150-b9a8-fec979fb1658.json
 delete mode 100644 data/hfopenllm_v2/maldv/badger-kappa-llama-3-8b/c4d686f2-2af1-4271-9556-09380f07ba5f.json
 delete mode 100644 data/hfopenllm_v2/maldv/badger-lambda-llama-3-8b/93167303-b38e-43f0-a552-72c26ccb4339.json
 delete mode 100644 data/hfopenllm_v2/maldv/badger-mu-llama-3-8b/b52a176f-f369-4791-a7e3-88a72709c868.json
 delete mode 100644 data/hfopenllm_v2/maldv/badger-writer-llama-3-8b/b6310012-17f1-4ee0-abd0-0079a9299350.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Cheng-1/f581e832-0f77-496e-bcd3-6cfec51ef594.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Cheng-2-v1.1/47b47c89-b13b-4099-98b2-854feae05f63.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Cheng-2/8d51ae58-7b20-4fa4-b234-2abb9cdeaad4.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.1/4d4d5679-8ec6-49b8-a5d7-2a76497b44b7.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.3/0bdb6574-69e2-4858-b7aa-a90a5fadf741.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST/fa1a92bb-ad25-4be2-a35f-7fdebbeeeba8.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-Preview/d62ea0a1-cc9d-41b7-8d60-479b8e2262b5.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-RP-v1.4-1M/912446e3-efdf-4ed0-80bd-261c6c87a3d0.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.1/5e86dc31-ae3e-4ef7-858e-41e29b3a8031.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.2/80680e5e-ab83-4a59-aeec-9d4166509c47.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.3/c5bc9c92-8469-4174-aafd-67bb61aaccf2.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.4/1d67b792-178b-4baa-a108-2362f658bd4e.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Qwen2.5-7B-Preview/eb0c87b0-4795-4029-82c1-57ce37ba8259.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview-v1.1/dc9b2300-7ab0-4e92-9d23-15fe9ca52994.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview/e005624d-c822-4be1-9477-873642aae228.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/absolute-o1-7b/e9756d91-b9e2-4dd0-bf08-c6154c7d1f2e.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-2-28-2025/704598c3-c5d6-4ce0-bab3-0fa98118e16a.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.1/fafc9463-d725-4827-8bc1-5cd9e83814b6.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.2-normalize-false/109820e0-ee00-449c-9ae5-58a7bf1da5f8.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b/37f29d5b-d803-4195-9ce0-75e45e32c160.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/cursor-o1-7b/43546f48-8c46-4481-b1e5-f4b1ad2535be.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/cursorr-o1.2-7b/ec81e0ff-9cb4-4d43-9f78-1d5f4edc9103.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.1/9290c86f-40b0-4520-b8aa-3460de62c396.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.2/a4bf576e-9556-4956-8dcb-4d8906d45db0.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/etr1o-v1.1/320a5c00-3307-4bc3-9f47-9befb88e461c.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/etr1o-v1.2/844d1556-6bc6-467e-a145-f92646770727.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/fan-o1-7b/78923f4b-c2e7-4472-8398-10a0a8453ec5.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/olmner-7b/17abe1bf-2e97-409e-88e3-4f661861a195.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/olmner-della-7b/756978e5-1dfe-433e-ba88-339004a50ea7.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/olmner-o1-7b/a889ae3a-5d86-4454-bfb9-332c4b61b836.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/olmner-sbr-7b/2c5e1086-03b7-4cdd-801e-03fb26183076.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/post-cursa-o1/d9578847-b732-4c75-b246-9cdf03674fe0.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.2/4c6f83fe-7896-4cf3-9434-b5f8d499f5ba.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.3/619037af-d528-4579-b7e3-58628468d8fb.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.4/5113b737-8d9f-4321-9a67-91f1aabb40a1.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.6/641ac372-2e5a-4b44-b22e-a17600a6a868.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1/7cbb0b08-871d-48fc-bf3e-86267f5ef19d.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/r1o-et/c82e887c-c8ab-4221-aa0b-e8b7a86e7c46.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/sbr-o1-7b/50c65a83-9d08-4155-ad2c-5a2f8ffc8743.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/stray-r1o-et/99d97aef-bb6b-471b-8ed7-f6f92f75842c.json
 delete mode 100644 data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3/b98504a0-f1d6-4872-b748-2ca8199c5328.json
 delete mode 100644 data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis/5a159667-7460-4a97-884e-6a96df59873b.json
 delete mode 100644 data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis/16a2eceb-073d-4dc3-87a7-a15c641c5ebb.json
 delete mode 100644 data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis/e8e2d04b-21db-43dc-8b8f-7fa3bba87abc.json
 delete mode 100644 data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/acbb93b3-f8fc-479d-9610-392efd7d4ecc.json
 delete mode 100644 data/hfopenllm_v2/mattshumer/Reflection-Llama-3.1-70B/6d0589bd-1f05-44ee-afa5-3657b960d7c9.json
 delete mode 100644 data/hfopenllm_v2/mattshumer/ref_70_e3/134663d8-05a8-4336-90e2-68e7cba5f1df.json
 delete mode 100644 data/hfopenllm_v2/maywell/Qwen2-7B-Multilingual-RP/3bfced28-b06e-46ab-a6aa-171b0c424337.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.1-MedIT-SUN-8B/b6a83b82-6b05-4437-a076-e2a3982f6169.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-Instruct/f621201b-f571-4487-9f1e-b767675c659d.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-chat/710fdb79-fba4-42da-8e26-45b4caf75207.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-26000/35fa7a5e-8866-4ce3-9899-8737e908f34f.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-34800/2b24b69b-15dc-4666-83f3-c77db545bdbd.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-v1.0.0/0d00d849-2147-4fc1-9e5f-d42a95be6ca5.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.5B-chat/f45135b0-3c26-44b5-9922-a6c0817a172d.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-HDIC-1B-Instruct/67eb0d6c-9086-4c80-8506-c3e1489f2673.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune/79d3dc85-08f6-475c-ac2c-1ff32f5a089f.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/MSH-v1-Bielik-v2.3-Instruct-MedIT-merge/4e9b3fa2-d3d2-4e4c-a1fa-c812f481f64a.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/MedIT-Mesh-3B-Instruct/6e62a8a0-0bdf-4b6c-93de-593423dadd3a.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/SmolLM2-MedIT-Upscale-2B/871131c1-295d-40a0-a396-09d24b880064.json
 delete mode 100644 data/hfopenllm_v2/meetkai/functionary-small-v3.1/44eefbb2-22d4-4dff-889d-a87fc40b2eea.json
 delete mode 100644 data/hfopenllm_v2/meraGPT/mera-mix-4x7B/cd1de470-a174-4c08-9efe-a06d493dc4b2.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/JAJUKA-WEWILLNEVERFORGETYOU-3B/fdb55a14-0697-4775-8358-fed202498b4f.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/SuperQwen-2.5-1.5B/c069a224-638a-4cad-a9ad-e4f8579e8c15.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/VirtuosoSmall-InstructModelStock/10e5c103-f25f-45bb-bfe6-a22876cffe87.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/diabolic6045_ELN-AOC-CAIN/a9ecca9a-c5d4-45b2-a403-e74a98a46322.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/mergekit-dare_ties-ajgjgea/630d8a60-03b7-4550-82f4-e879b2e01c6c.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/mergekit-della-zgowfmf/206b5a96-ae07-41fd-822f-436d49c57dcb.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/mergekit-model_stock-azgztvm/702d2120-5301-4e03-bb0f-1f8ab19e522a.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/mergekit-slerp-fmrazcr/61e39700-c237-49fc-baef-3fa573b3b0c6.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/mergekit-ties-rraxdhv/8892ab84-750d-494f-9f87-ad28e73cf364.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/mergekit-ties-ykqemwr/538a2eb7-34e4-4e78-a382-60a13710096e.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/sexeh_time_testing/a041629e-8ed8-4a6c-95ee-98e759501e19.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-2-13b-chat-hf/09f05984-5815-4b3d-bc73-83ea1e5ecc27.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-2-13b-hf/6535524e-f8cf-4f2f-9d89-9ba70aedac91.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-2-70b-chat-hf/08ea4f9d-0e3c-4a8b-85e6-075290d30ba4.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-2-70b-hf/631f0a1f-a6f5-46f6-9aa0-31ac9764c086.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-2-7b-chat-hf/b771f6db-7516-4423-9010-3467db0e26e3.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-2-7b-hf/cf580dfb-2924-4c4b-9352-394275b959bd.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.1-70B-Instruct/ba549fe6-7718-4abf-a610-7e0f48611483.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.1-70B/b92440b1-78a9-4288-a432-f057f2b04a2f.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.1-8B-Instruct/838f3932-edf2-4f72-9238-981d1aadc771.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.1-8B/61e933b2-5cd1-4f08-8a9e-5b06ef54b6d5.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.2-1B-Instruct/0b307c78-94c7-418f-bc47-5106b81c30de.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.2-1B/18783694-3e7b-4d06-9378-5a3fa4a7a0a2.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.2-3B-Instruct/dab922e5-1b46-4a90-b75c-1b26cd6cc6d3.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.2-3B/8cfa1f00-3b26-4d75-9b0a-0dea65e2e352.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.3-70B-Instruct/f74d26e6-9dfb-4e81-8522-8309b27760cf.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B-Instruct/2022bcf3-a057-4b0a-aa33-6cf074ffc714.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B/a6e79d12-42f6-47ad-95fa-ba03fa4d3a06.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/24d850fe-1817-4041-8767-085f4bd2bac3.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/610a3be1-1032-4079-ba37-d6c2c5f9fd55.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B/857bb10e-1b43-4714-a758-0cef5816ba02.json
 delete mode 100644 data/hfopenllm_v2/mhl1/Qwen2.5-0.5B-cinstruct-stage1/cdabdd54-6101-471c-9bd8-446953be986b.json
 delete mode 100644 data/hfopenllm_v2/microsoft/DialoGPT-medium/8029cb75-8d3b-411d-b0eb-74539b8ecb2f.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Orca-2-13b/65d10996-2c5b-4e11-9a07-319c2446a237.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Orca-2-7b/ef21d739-b122-4ab8-a8ff-a7cfecad5c8e.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3-medium-128k-instruct/45f3b963-497b-4d89-ac66-9ff0ba8dadf8.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3-medium-4k-instruct/4173435b-d907-4ac5-a8bd-dfa2759f3fb6.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3-mini-128k-instruct/b4a79f30-3a04-4f78-861e-1571316a0642.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/53426038-df38-45ba-b621-34231c9cad7f.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/fa758fe5-21ec-45cc-941f-5cb5ca0612b1.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3-small-128k-instruct/d2a92a62-3bd0-4cb2-897b-742ea0d5203f.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3-small-8k-instruct/8b752519-63d4-4638-b56e-1c45c7f4694e.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3.5-MoE-instruct/8da71b7c-7b73-453f-998b-84e70b54e471.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3.5-mini-instruct/2b7b1216-3ea7-48f1-89f6-e5d84fef2b32.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-4-mini-instruct/37e19712-3197-42da-a8f2-ae1f36c2b06c.json
 delete mode 100644 data/hfopenllm_v2/microsoft/phi-1/c6ae6691-64ec-443d-8d76-af614c8cc7f9.json
 delete mode 100644 data/hfopenllm_v2/microsoft/phi-1_5/80567722-8c6b-41b9-8103-3bdaedfdb8ee.json
 delete mode 100644 data/hfopenllm_v2/microsoft/phi-2/20192dc4-ea3a-4413-8457-18a592fa0c64.json
 delete mode 100644 data/hfopenllm_v2/microsoft/phi-4/8c878c05-86f7-4d61-81d7-9bb286516581.json
 delete mode 100644 data/hfopenllm_v2/microsoft/phi-4/fa753be0-4a98-4ec3-9cc9-3bf7b380ad17.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Llama-3-70B-Synthia-v3.5/0516b46b-a957-413f-aadc-58f4339dc60a.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Llama-3-8B-Synthia-v3.5/97200dd7-7ed0-4a7b-ace9-31c173f017f1.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Tess-3-7B-SFT/758f8332-ffa8-4059-ac6f-400f9367bb23.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Tess-3-Mistral-Nemo-12B/b1103662-055c-471e-ace8-dd75f607491d.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Tess-v2.5-Phi-3-medium-128k-14B/27b0d675-498f-4351-b92f-7c0d1a3c83bd.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Tess-v2.5.2-Qwen2-72B/3f1f88d4-2908-4f28-b8d3-4f9ded18ba0e.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/3883b0d3-e442-42d3-adc6-ed959c902dd3.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/da172cdb-1388-42f5-97b1-ae8e15291631.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B/7c94dbfa-4b3a-43fd-9f2c-b3d63d8ef700.json
 delete mode 100644 data/hfopenllm_v2/mindw96/DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3/7cdd1de0-767d-4527-a024-c67166bb8b20.json
 delete mode 100644 data/hfopenllm_v2/minghaowu/Qwen1.5-1.8B-OpenHermes-2.5/d4702278-54c4-42e8-a901-dfe5c7f2004a.json
 delete mode 100644 data/hfopenllm_v2/ministral/Ministral-3b-instruct/149f8ee5-4376-4fcc-8f87-7412a3083570.json
 delete mode 100644 data/hfopenllm_v2/mistral-community/Mistral-7B-v0.2/de82b746-c5d7-450a-bc2b-1b2859d91d6b.json
 delete mode 100644 data/hfopenllm_v2/mistral-community/Mixtral-8x22B-v0.1/d2a916a6-288a-4761-a3fd-ca674edb67c1.json
 delete mode 100644 data/hfopenllm_v2/mistral-community/mixtral-8x22B-v0.3/cda497f9-c7f9-48d6-944b-0167476e5e5c.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Codestral-22B-v0.1/b56c6c01-a226-4090-9332-330535d79e24.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Ministral-8B-Instruct-2410/0ddc8e10-9cc5-48eb-b5b0-a2c2f071862b.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.1/2917c469-7e22-497e-8d62-9b9972266658.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.2/2424d85c-e092-4e7c-bf4f-ae014d08a159.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.3/90278363-1d8f-47ca-a7dc-c51c6b511dc9.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-7B-v0.1/3c3197ee-675d-4bb7-874d-28104d2a3cae.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-7B-v0.3/eb5a8679-bfdd-40f2-9a32-55c04a65ae7e.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-Large-Instruct-2411/d770f88d-b110-4f27-85e9-e52217c11798.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-Nemo-Base-2407/364328ce-5de7-401f-ad84-0c76e3c1dc91.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-Nemo-Instruct-2407/f7dcfdbb-ff12-4692-9702-712de3d0b7ba.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-Small-24B-Base-2501/d641aa88-9981-4a25-90d5-fcc4564ede52.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/8915e742-df2e-41bc-b83f-3e111edfd257.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/e29a5e35-8677-4e53-83fd-85e919b4366a.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mixtral-8x22B-Instruct-v0.1/e5c55d38-dc04-42b4-9aca-ae7be436ebe0.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mixtral-8x22B-v0.1/504baceb-6684-430d-a532-b7b5b0b061fe.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mixtral-8x7B-Instruct-v0.1/31fcd34a-af1e-4eab-bd9a-5ec17eb572d2.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/01ab0a3e-393a-497a-9b32-8af790b7581a.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/541967a6-b856-4dc9-958a-9335197fba99.json
 delete mode 100644 data/hfopenllm_v2/mixtao/MixTAO-7Bx2-MoE-v8.1/ee31c801-67cb-46a3-9e39-02e842c0473f.json
 delete mode 100644 data/hfopenllm_v2/mkurman/llama-3.2-MEDIT-3B-o1/65fabe8b-05af-461e-b804-fcff3492da34.json
 delete mode 100644 data/hfopenllm_v2/mkurman/phi-4-MedIT-11B-exp-1/7e1a7121-2c9f-4196-bbdd-48aea257f384.json
 delete mode 100644 data/hfopenllm_v2/mkurman/phi4-MedIT-10B-o1/dd32609c-316e-4511-8791-fcae33a1a506.json
 delete mode 100644 data/hfopenllm_v2/mkxu/llama-3-8b-instruct-fpo/d95d7058-49eb-47d7-b790-3a253291d22b.json
 delete mode 100644 data/hfopenllm_v2/mkxu/llama-3-8b-po1/37cbc3d6-1198-4e23-b86c-1fd979eacd9a.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/AlphaMonarch-7B/76d0d338-e502-4638-adad-c4c4df00c26f.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/Beyonder-4x7B-v3/f47375bd-547a-4d0b-8c96-bbe2bc1ac445.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/BigQwen2.5-52B-Instruct/6b1ed68c-3099-4bd7-892b-cdc36c90ccfe.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/BigQwen2.5-Echo-47B-Instruct/0e59c8ca-cde0-4482-ab03-3309bcb8737c.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v2/d7e900e2-0574-44cd-a68a-0dd2715cf48c.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v3/fd626c3f-566d-4193-9a85-e7c9a89e671c.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/Daredevil-8B-abliterated/196b04ae-fd53-400f-9f08-19edd4959f6e.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/Daredevil-8B/57177299-076a-4506-89a7-ce54af08df4f.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/Hermes-3-Llama-3.1-70B-lorablated/d3bdf36f-7f89-4b5a-b6cb-847b49200b5b.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated/92619b9e-dacf-4d0a-9f8b-6e131af74fa4.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/NeuralBeagle14-7B/cbb408ea-ced6-4f47-9066-d4ff6d604b1e.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/6999bb02-29fd-4c59-886f-184362afa06e.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/913d1d8e-0b02-4ce5-9b7c-403143a8c880.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/OrpoLlama-3-8B/82c87bc0-29cf-4150-92f5-c80fb0028ea6.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/phixtral-2x2_8/a18834ad-6143-4ce2-9842-471817a60a39.json
 delete mode 100644 data/hfopenllm_v2/mlx-community/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32/be900bcf-8ec9-484f-81db-0e83975c1ecd.json
 delete mode 100644 data/hfopenllm_v2/mlx-community/Mistral-Small-24B-Instruct-2501-bf16/d226ccf6-674b-44c6-8b11-d782b59a961a.json
 delete mode 100644 data/hfopenllm_v2/mmnga/Llama-3-70B-japanese-suzume-vector-v0.1/d8839a1a-8d07-4e0b-bd44-2668c84f750c.json
 delete mode 100644 data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Llama3-8B-v1.1/e90b04db-2eb3-483a-ab0e-ea8aef821d84.json
 delete mode 100644 data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-7B-v1.1/900921ae-fbb2-4488-ab19-18987c1d008d.json
 delete mode 100644 data/hfopenllm_v2/moeru-ai/L3.1-Moe-2x8B-v0.2/0da0a7cd-c075-4bc0-8e88-8acc7212e5c3.json
 delete mode 100644 data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.1/b50a49cd-2909-4dbe-9c9f-c150abb99845.json
 delete mode 100644 data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.2/13831d81-a9dd-43c7-bce1-240aad42fbc6.json
 delete mode 100644 data/hfopenllm_v2/monsterapi/Llama-3_1-8B-Instruct-orca-ORPO/56ea7cb3-3a1e-477a-bac8-26a0fde6297a.json
 delete mode 100644 data/hfopenllm_v2/monsterapi/gemma-2-2b-LoRA-MonsterInstruct/8ce19b33-4f2b-4b8d-80bd-1ed399a5e9dd.json
 delete mode 100644 data/hfopenllm_v2/mosaicml/mpt-7b/18ab167d-b72e-4fa9-94a8-09edc641c73f.json
 delete mode 100644 data/hfopenllm_v2/mosama/Qwen2.5-1.5B-Instruct-CoT-Reflection/7df237ea-29c0-4d0a-9092-c41df4c13aca.json
 delete mode 100644 data/hfopenllm_v2/mrdayl/OpenCogito/e5dc8caa-2d86-4ff0-af8d-22d85c8faeb0.json
 delete mode 100644 data/hfopenllm_v2/mrdayl/OpenCognito-r1/01591bb6-9daf-40fb-b802-0a007f4cc388.json
 delete mode 100644 data/hfopenllm_v2/mrdayl/OpenCognito-r2/f6c32abf-bbae-4827-9ce2-29ce20c9463e.json
 delete mode 100644 data/hfopenllm_v2/mrdayl/OpenCognito/74a6605d-3557-4458-bef5-cc9420434e68.json
 delete mode 100644 data/hfopenllm_v2/mrdayl/OpenThink/dbe6e126-d35c-4634-a544-adf374ed5d00.json
 delete mode 100644 data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-gsm8k-3e/d68681c1-01e4-4af0-9a81-e0aaed0ae865.json
 delete mode 100644 data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-limo/de9620b8-7112-436f-8941-fae2c5e7f9e0.json
 delete mode 100644 data/hfopenllm_v2/mukaj/Llama-3.1-Hawkish-8B/cafee7ac-deb6-4c4b-af8f-81548648cb14.json
 delete mode 100644 data/hfopenllm_v2/natong19/Mistral-Nemo-Instruct-2407-abliterated/3e3cb617-6f19-4731-b31a-b1f4d88237d5.json
 delete mode 100644 data/hfopenllm_v2/natong19/Qwen2-7B-Instruct-abliterated/3c2c2c14-d065-4d6c-8c98-44ba8f2ca461.json
 delete mode 100644 data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/8909f916-401b-4457-ab8f-2691696049c6.json
 delete mode 100644 data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/ae191508-7dad-4cac-ad4a-af95d7a15b5d.json
 delete mode 100644 data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish/507f5047-fac3-415f-b9fa-aae4311fa837.json
 delete mode 100644 data/hfopenllm_v2/nbeerbower/BigKartoffel-mistral-nemo-20B/0ee8716c-74f0-41b4-94a2-efc715150293.json
 delete mode 100644 data/hfopenllm_v2/nbeerbower/DoppelKartoffel-Mistral-Nemo-23B/fcf491f4-cf57-4c95-9de1-4702ab5d54c7.json
 delete mode 100644 data/hfopenllm_v2/nbeerbower/DoublePotato-Mistral-Nemo-13B/4fd20259-c7c7-4da5-9013-ae2feb2175b1.json
 delete mode 100644 data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-1.5B/a7c8c345-cade-48fd-93c0-0f344044d2b5.json
 delete mode 100644 data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-14B/7a8e3986-7688-4a26-a74c-a9bb47cd3e8d.json
 delete mode 100644
data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r16/7a2ffb4d-1135-42a1-b28b-3b4e4d014979.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r64-2e-5/25468720-93d7-4f10-a534-30c4976657e8.json delete mode 100644 data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-1.5B/5ba1d617-9d9a-4c3b-b9cc-3224ace129b3.json delete mode 100644 data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-14B/27b2b46f-1323-4ddd-9f65-d8fcd9cd6508.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Flammades-Mistral-Nemo-12B/65917125-bb7c-4d64-ba5f-b5e4f67ec332.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Gemma2-Gutenberg-Doppel-9B/30bf22d8-b93a-4775-8073-30e14e15e35d.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Gutensuppe-mistral-nemo-12B/ff510365-a13d-4e44-9709-59a56e864991.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Hermes2-Gutenberg2-Mistral-7B/6d1eebc4-228b-43f3-b31c-3d5b1591ae2d.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Kartoffel-Deepfry-12B/f1e8cdbb-14b7-4959-a053-fb1b37629aff.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Llama-3.1-Nemotron-lorablated-70B/4145d1a0-8d6a-4d64-8a45-a89cf343ac46.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Llama3.1-Gutenberg-Doppel-70B/d6966190-e254-4902-8472-cac59bfbdbe0.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Lyra-Gutenberg-mistral-nemo-12B/5fdb5437-f413-451d-9800-42036cda7686.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg-12B/347577a4-2768-4472-ba48-9b174ad89724.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg2-12B/33af440e-837d-4454-9340-af0d3ee74f77.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mahou-1.5-mistral-nemo-12B-lorablated/1a1f4709-8d05-4905-8105-0c3606d5ef5b.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Gutenberg-Doppel-7B-FFT/28421948-089b-4487-bb71-a06e5ce74402.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B-v2/3fa0c783-9226-4fc8-b3a0-6e960684f43d.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B/743b7fe2-f998-408c-98b1-af02d9c1ee2a.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Moderne-12B-FFT-experimental/0039c88b-a881-4ce0-9a0a-a10f1a8cbc70.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v2/87c7fbd9-7648-4d0d-ac9e-8ba85860e335.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v7/6ca3ab87-c05e-46b5-879d-4fc8bf75417b.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B/525f1b9f-88a2-459d-bb4a-7c01a0107968.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Small-Drummer-22B/503f79be-7f05-4464-ac9f-0f284f1e7965.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Small-Gutenberg-Doppel-22B/86ec7d95-6f6d-4ca6-97d5-7a910f42a06d.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Nemo-Loony-12B-experimental/d472ba79-6592-4f8a-a99c-ec3f71468d3e.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Nemoties-ChatML-12B/6ddc052c-6bda-4d8e-ad97-20d881c8cfb7.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Qwen2.5-Gutenberg-Doppel-14B/76d1aed8-80fe-4b4f-bd81-ea0d6bf085c4.json delete mode 100644 data/hfopenllm_v2/nbeerbower/SmolNemo-12B-FFT-experimental/d2845d6e-65dd-4448-901d-d554b3e741f3.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Stella-mistral-nemo-12B-v2/f7dd203f-24d8-4875-878a-12ed99e20cd3.json delete mode 100644 data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-27B/287ae246-bee5-4fae-b78f-203491aa8df2.json 
delete mode 100644 data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-9B/9ee493f7-e031-4593-beae-65be17678e00.json delete mode 100644 data/hfopenllm_v2/nbeerbower/llama-3-gutenberg-8B/86b10c6f-41c6-4d0a-ae59-f90e204e466c.json delete mode 100644 data/hfopenllm_v2/nbeerbower/llama3.1-cc-8B/043e3533-7d5c-4d45-bcd8-0dbcc8ca4819.json delete mode 100644 data/hfopenllm_v2/nbeerbower/llama3.1-kartoffeldes-70B/1b3269fb-4b16-42b6-80c0-3d54bc2b4fed.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades-12B/ee625c29-62c4-49da-9790-e7e67233157d.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades3-12B/02b16bf2-62bb-401e-9726-2135d8d610be.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-cc-12B/db10c6f9-2962-46cc-aa4e-4c99c4b494d1.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-gutades-12B/aa37bda0-2e0a-4361-a5b4-468154d8ac72.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v2/d9a6565c-5a0b-4893-b6e0-1fc52ec55bf5.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v3/becf9805-83a9-4137-a938-81a61a10e4f0.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v4/6e848120-bc31-4628-af05-30707a6dcc41.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B/864af855-71b0-4b11-ae3f-56294a7d0db9.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg2-12B-test/285bd390-1dd9-4db2-af45-68dea557da3c.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-kartoffel-12B/459e2375-1a15-4129-bee0-dc8852d531e2.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-narwhal-12B/7b4c7d92-f581-4057-bec9-e3a8c6a5386e.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-wissenschaft-12B/7ceab841-f9a3-455b-9314-243d8fc3cd11.json delete mode 100644 data/hfopenllm_v2/nbrahme/IndusQ/c1e2fb45-22d8-4eb4-8971-ce89c3048b9e.json delete mode 100644 data/hfopenllm_v2/necva/IE-cont-Llama3.1-8B/68cb2ca1-1648-41a2-92b7-969bccdca4ee.json delete mode 100644 data/hfopenllm_v2/necva/replica-IEPile/5f285d61-5e4b-4c5c-8960-c10313d76ae3.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.1-bf16-falcon3-7b-instruct/3af19898-8590-4aec-b324-46c7fbf596d3.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.1-falcon3-10b-instruct/e8472266-6d03-439f-bd6b-e3ac5ef2cf09.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.1-qwen2.5-7b-instruct/3f578b45-48f9-4022-991c-32a71706aba3.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.1-virtuoso-small/ef8c22a7-3898-422e-88e2-1a8c14ab5bf2.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-10b-instruct/81630ea2-d496-4872-92b7-e476badaf50d.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-7b-instruct/9436d04a-9c81-47ad-a7b8-496e14058627.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.3-falcon3-7b-instruct/f1e6e54e-cb97-4980-8957-2190ee5c4c34.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.4-falcon3-7b-instruct/30914dd3-c857-4aaf-b6b9-d1c7e4917e89.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.5-falcon3-7b-instruct/1c389a32-68b3-47c0-a6b8-2c2291293002.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.6-falcon3-7b-instruct/e759a217-6571-446d-9bf9-d1512793f307.json delete mode 100644 data/hfopenllm_v2/neopolita/loki-v0.1-virtuoso/753f3b21-7365-4117-b2a0-a91f03ec3d39.json delete mode 100644 
data/hfopenllm_v2/netcat420/DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b/297ef102-67c1-4e9c-b418-fed026bb1f8a.json delete mode 100644 data/hfopenllm_v2/netcat420/DeepSeek-R1-MFANN-TIES-unretrained-7b/9fbf73d7-7d67-4d6c-a5b9-efc627cd1b2b.json delete mode 100644 data/hfopenllm_v2/netcat420/Llama3.1-MFANN-8b/b1446577-f13f-434a-a0b4-916091395d4a.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V2/fc8946aa-8b04-482c-8c05-d026d2af07be.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V3/fabe3784-948c-4618-9cf0-c76a3ddd3820.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V4/736dcf09-6a19-4e88-a790-7a7ee74d8717.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V5/75b4c750-1570-4825-a04a-965c06861fd4.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-TIES/b7f8b678-2aea-4d41-ba21-2083fc472574.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-V3.2/a8010630-58de-448c-af08-70b8ffec431b.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-SFT/4a0c2ce5-a4b4-4d35-b65d-bbc6e36a649b.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-abliterated-phi2-merge-unretrained/1132251a-59c7-402e-9957-f9288864508f.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-llama3.1-Abliterated-SLERP/e2fac049-8f9f-4b71-bcd3-5746b7d90150.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3.1/d891a1e1-ad65-498f-9ee8-59523c1bfd19.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3/9dd3103f-6c4f-4077-ac27-3a9b0f4a5882.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-v2/ca031f70-5785-46d1-8a58-b279d8340776.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V2/18457711-92b8-4c27-a89a-928fecdf5724.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.2/3398aeb8-08a8-4be9-a24c-efeabcaa2139.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.3/707bc006-4318-41bc-b91b-aa43ca7cba6f.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3b/7bfda919-13be-4b68-8655-99fe6a4605a2.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.15/f844e739-5f0d-4db4-ba66-bd33b1290571.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.18/0cde6639-6a89-4682-bb3e-a2a24a1bc8ab.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.19/87652005-4404-4c45-bd4f-5f63c44adf63.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.20/a7e0bc2d-784d-4719-ac08-d8fa0c29d178.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.21/e8ba93e6-6f90-4169-8403-381b7f9e26ab.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.22/ea86b542-3d06-4e71-b49d-17cdd362b465.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.23/15615d2c-46a1-47c7-a273-697e97bdf9f2.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.24/a2b8da3f-c99e-4dba-b4a2-23739281eaf2.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv1.1/76f3fa3a-1629-4cdd-b457-3a108784b427.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv1.2/c9e979e1-4433-4a38-8fd4-c14895e74f44.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv1.3/3f2effba-1ab8-476d-b228-ed9491e83adf.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv1.4/a5f0fb1b-27a7-495f-a010-3307afdb8949.json delete mode 100644 
data/hfopenllm_v2/netcat420/MFANNv0.19/22f2aa1d-fff1-430a-9c20-3b32859d9665.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANNv0.20/daff0e6f-d29f-4861-855f-902a0cd9a469.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANNv0.21/0f5cb926-b691-4d57-87f5-290235fd250a.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANNv0.22.1/d9e813da-2966-4901-99f9-c7627c64fc52.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANNv0.23/4cb98a5b-3eb7-4fa8-adfd-17add38d3332.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANNv0.24/f7494fd4-d248-46a6-a46d-f9d8db560aae.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANNv0.25/4b8533d1-7770-435f-ba76-a5c658aabd8f.json delete mode 100644 data/hfopenllm_v2/netcat420/Qwen2.5-7B-nerd-uncensored-v0.9-MFANN/309c7906-0010-4f17-848f-185062d96a26.json delete mode 100644 data/hfopenllm_v2/netcat420/Qwen2.5-7b-MFANN-slerp/f18ab2ab-098b-4e46-8f8d-433b52cdb81b.json delete mode 100644 data/hfopenllm_v2/netcat420/Qwen2.5-7b-nerd-uncensored-MFANN-slerp/b4a70c71-dfac-4888-937e-d5220b491b0e.json delete mode 100644 data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained/b879a534-6b24-4873-a0e4-e18453540121.json delete mode 100644 data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN/c67ae8f2-596b-4dab-8c4f-768b2f0608b4.json delete mode 100644 data/hfopenllm_v2/netcat420/Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b/7766c638-b4dc-4b2d-8c14-becdb1b709ef.json delete mode 100644 data/hfopenllm_v2/netcat420/Qwen2.5-MFANN-7b/dd211bef-3940-4d78-8f7b-a67da81d605b.json delete mode 100644 data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERP-V1.2/87e20b7a-85c8-4845-94b0-ace1e18814cb.json delete mode 100644 data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERPv1.1/9ab01db6-3154-4c5b-b6a2-35479538d332.json delete mode 100644 data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-v1.1/9d35316a-011d-4e45-ae57-317b53de621f.json delete mode 100644 data/hfopenllm_v2/netease-youdao/Confucius-o1-14B/c9e7fec0-b244-4ca1-a117-a52fdd4671a5.json delete mode 100644 data/hfopenllm_v2/newsbang/Homer-7B-v0.1/0659cb01-0d52-42cb-9e3a-2d8cac01692e.json delete mode 100644 data/hfopenllm_v2/newsbang/Homer-7B-v0.2/98490bb1-70f0-4e7a-8fd6-698ec9fcbd5a.json delete mode 100644 data/hfopenllm_v2/newsbang/Homer-v0.3-Qwen2.5-7B/6e0f7e7e-8927-436e-95a7-5a7c626ca241.json delete mode 100644 data/hfopenllm_v2/newsbang/Homer-v0.4-Qwen2.5-7B/9c5b3f4d-6e0b-482b-b142-dd7b387cae22.json delete mode 100644 data/hfopenllm_v2/newsbang/Homer-v0.5-Qwen2.5-7B/04840708-a4cc-407c-8b2a-876b382920a1.json delete mode 100644 data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-72B/83b0844c-70fe-4b63-8ed2-4147390518ee.json delete mode 100644 data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-7B/9cf10c60-bee1-4f4f-9e03-c3c10287bded.json delete mode 100644 data/hfopenllm_v2/nguyentd/FinancialAdvice-Qwen2.5-7B/8e92dd9e-a68c-46ef-9b03-955c06a21437.json delete mode 100644 data/hfopenllm_v2/ngxson/MiniThinky-1B-Llama-3.2/dd1139d8-2b44-4516-b24a-1219826f5482.json delete mode 100644 data/hfopenllm_v2/ngxson/MiniThinky-v2-1B-Llama-3.2/e37e86f7-b67b-4f0a-b1bd-92f30842b303.json delete mode 100644 data/hfopenllm_v2/nhyha/N3N_Delirium-v1_1030_0227/bc3b55d5-35ca-48b5-832e-8544e145b1b1.json delete mode 100644 data/hfopenllm_v2/nhyha/N3N_Llama-3.1-8B-Instruct_1028_0216/5757cd3d-c64e-4743-8200-5e610e24bf95.json delete mode 100644 data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241029_1532/ae8cd3ad-ce7b-41f4-8e4a-f11002af2e58.json delete mode 100644 
data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241110_2026/bee54048-ebb2-4051-a18f-aa85b0f2ce27.json delete mode 100644 data/hfopenllm_v2/nhyha/merge_Qwen2.5-7B-Instruct_20241023_0314/2f98c85b-5a2e-467e-9626-b1bdefe7bdd7.json delete mode 100644 data/hfopenllm_v2/nidum/Nidum-Limitless-Gemma-2B/2c530a3b-888e-4a61-b97b-ea875b30ec9c.json delete mode 100644 data/hfopenllm_v2/nisten/franqwenstein-35b/4c9fb322-735e-4644-8121-088d00f78c5f.json delete mode 100644 data/hfopenllm_v2/nisten/franqwenstein-35b/e7e7733f-682b-4e68-8f07-85f3ba7a7ae1.json delete mode 100644 data/hfopenllm_v2/nisten/tqwendo-36b/e9a4e1e2-bd55-4c3d-99eb-8fafd8f6ec44.json delete mode 100644 data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.8/42ed92b3-63bc-4fa1-bc16-c19bfb73368f.json delete mode 100644 data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.9/915ae579-786a-4eb2-a1bb-107a12c9c40d.json delete mode 100644 data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.1.0/3489ffea-a607-4f3d-a0c2-bd17147f244f.json delete mode 100644 data/hfopenllm_v2/nlpguy/Miisce-one/7b5ba8a8-16c3-4169-b97d-13dd5d4f8395.json delete mode 100644 data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v1/6411c44a-b2b3-4fe3-8ba4-9422a0a0b31e.json delete mode 100644 data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v2/fe344f84-7428-45af-940f-736275bc4d50.json delete mode 100644 data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v3/60956ea2-8b0b-4e4b-801a-d0689f9d46f4.json delete mode 100644 data/hfopenllm_v2/nlpguy/StableProse/1ad54bdc-419a-4dd9-9fbb-d7b7ee7038d1.json delete mode 100644 data/hfopenllm_v2/nlpguy/StarFusion-alpha1/2ab375f0-2477-48a5-a5d9-0b5d0d7d0a84.json delete mode 100644 data/hfopenllm_v2/noname0202/Llama-3.2-4x3B-Instruct/e0525a52-d38c-4b2f-b59b-048b4bf71cb2.json delete mode 100644 data/hfopenllm_v2/noname0202/gemma-2-2b-it-ties/01bc964f-552b-4cda-9ed0-cf720f0c8de4.json delete mode 100644 data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v1/c9e95c55-978e-485b-8a77-ab2e668e3254.json delete mode 100644 data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v2/c71c606b-ccb7-48e9-a6c8-b72205ec6c06.json delete mode 100644 data/hfopenllm_v2/noname0202/llama-math-1b-r16-0to512tokens-test/ae1801cb-d112-4d1a-895d-c6743779846a.json delete mode 100644 data/hfopenllm_v2/noname0202/llama-math-1b-r32-0to512tokens-test/008e3601-dfc4-4bc1-bf8b-f5cef43ae098.json delete mode 100644 data/hfopenllm_v2/noname0202/llama-math-1b-r32-test/379b315d-96fb-4edb-b2d6-3dc113a10c17.json delete mode 100644 data/hfopenllm_v2/noname0202/llama-math-1b-r8-512tokens-test/8cd36aa1-6f87-4d4d-a1bf-adc87e0a26c6.json delete mode 100644 data/hfopenllm_v2/notbdq/Qwen2.5-14B-Instruct-1M-GRPO-Reasoning/f76ce244-29f7-44f0-9850-7291f8e4cbf1.json delete mode 100644 data/hfopenllm_v2/nothingiisreal/L3.1-8B-Celeste-V1.5/506871f1-0c87-4e8c-a270-eed7b5da2599.json delete mode 100644 data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v2/c20264fd-b1f9-4e0f-9f6e-1d58f1c18cda.json delete mode 100644 data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v3/59f14dca-923a-41f1-b443-cc3551063f45.json delete mode 100644 data/hfopenllm_v2/nvidia/AceInstruct-1.5B/a1ba054f-b0a1-4827-b7ea-3988aa4cf1f1.json delete mode 100644 data/hfopenllm_v2/nvidia/AceInstruct-72B/51d8f53f-ad7e-4dae-9e2a-0895729ff790.json delete mode 100644 data/hfopenllm_v2/nvidia/AceInstruct-7B/421119ea-0da8-4b26-a335-f2e720618c44.json delete mode 100644 data/hfopenllm_v2/nvidia/AceMath-1.5B-Instruct/b0e6bfb2-a8d4-4b1d-859a-aa821f646e57.json delete mode 100644 data/hfopenllm_v2/nvidia/AceMath-72B-Instruct/7c4c2ccf-7d7b-4d24-802e-20c182290d07.json 
delete mode 100644 data/hfopenllm_v2/nvidia/AceMath-72B-RM/95212a55-f382-4869-9e11-cfa201ba865b.json delete mode 100644 data/hfopenllm_v2/nvidia/AceMath-7B-Instruct/a7da2118-063c-489f-bb31-40f1b7beeefe.json delete mode 100644 data/hfopenllm_v2/nvidia/AceMath-7B-RM/9a75ae18-8f9a-40a5-8a7b-0c38df34e9dd.json delete mode 100644 data/hfopenllm_v2/nvidia/Hymba-1.5B-Base/a85d4a1f-fbd9-4d21-9700-9e55e30c1391.json delete mode 100644 data/hfopenllm_v2/nvidia/Hymba-1.5B-Instruct/2fd1c45e-209c-43da-ae85-d60887513a96.json delete mode 100644 data/hfopenllm_v2/nvidia/Llama-3.1-Minitron-4B-Depth-Base/91e0e6aa-b933-4a02-a28d-8d69e698c60a.json delete mode 100644 data/hfopenllm_v2/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF/6f3f3d06-2937-4c55-9b95-a62ae5253571.json delete mode 100644 data/hfopenllm_v2/nvidia/Minitron-4B-Base/9b3ffdd3-ac18-4084-9e83-1bfc61db0ec2.json delete mode 100644 data/hfopenllm_v2/nvidia/Minitron-8B-Base/60077cbd-87af-4a00-a359-9235acb011ed.json delete mode 100644 data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Base/577936a8-b450-4233-b633-064565b3d1a4.json delete mode 100644 data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Instruct/470b9413-2cc8-4bf4-9e7c-0b8e99929568.json delete mode 100644 data/hfopenllm_v2/nvidia/Nemotron-Mini-4B-Instruct/3cbf9c73-0dc8-402e-bc94-c6d52b9f1af7.json delete mode 100644 data/hfopenllm_v2/nvidia/OpenMath2-Llama3.1-8B/3fccb1d0-5ae1-427a-adae-37004ecbacaa.json delete mode 100644 data/hfopenllm_v2/nxmwxm/Beast-Soul-new/6463183f-4043-4b96-b4d1-0bd41b4d6876.json delete mode 100644 data/hfopenllm_v2/occiglot/occiglot-7b-es-en-instruct/0b102423-1a06-4e5b-a287-710695658b63.json delete mode 100644 data/hfopenllm_v2/odyssey-labs/Astral-1-10B/b7e4ffd8-2a5a-4364-844a-a308dd7c899c.json delete mode 100644 data/hfopenllm_v2/olabs-ai/reflection_model/3fa2e3ef-a375-4ca5-9f85-7cb986313d53.json delete mode 100644 data/hfopenllm_v2/ontocord/Llama_3.2_1b-autoredteam_helpfulness-train/abd48d9d-0443-40be-a23a-68922771e14f.json delete mode 100644 data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam-Harmless-only/436ff0a4-9907-4e56-a5f2-c97f1b13f81a.json delete mode 100644 data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam/7a654100-b206-4011-828e-fb386df27d0c.json delete mode 100644 data/hfopenllm_v2/ontocord/RedPajama3b_v1-autoredteam_helpfulness-train/2f0e262c-a099-41f4-89f1-8b251708a960.json delete mode 100644 data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8-stack_2x/7bf3e9ca-7d6f-4d43-b8fe-aceb8d60c7c6.json delete mode 100644 data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8/8703dbdd-12ef-457b-8cda-f570c8f5c890.json delete mode 100644 data/hfopenllm_v2/ontocord/merged_0.5_expert_0.5/d77f3e8f-1eea-478e-babd-ba873d2d427c.json delete mode 100644 data/hfopenllm_v2/ontocord/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful/783a4385-c802-4bb3-9a21-90629d16efc7.json delete mode 100644 data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1-instruct/bb4ff51e-ce3a-42f5-871e-3e5e8977bc42.json delete mode 100644 data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1/e80d25b5-3f4b-45a7-9472-09f98db03bf0.json delete mode 100644 data/hfopenllm_v2/ontocord/starcoder2-29b-ls/7fed0b1d-0d79-4784-8fd6-42f8611b1751.json delete mode 100644 data/hfopenllm_v2/ontocord/starcoder2_3b-AutoRedteam/be534cd3-8245-4370-ba6c-9687b431ee8d.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b-merge_test/e98967b7-3aff-4baa-92eb-eff86bf09797.json delete mode 100644 
data/hfopenllm_v2/ontocord/wide_3b-stage1_shuf_sample1_jsonl-pretrained/8736a22a-f980-4a01-953d-217f27050129.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge/75a2b5c9-7c73-4bb4-8e99-af4a3a27589d.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge/0e0ebdc7-a5bd-4314-9bd7-fc8a11541a4e.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue/f8579305-003b-4727-b904-bad4f363a616.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue/3103f36a-4a88-4a39-8261-0b597f8d6db4.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue/eda9de3b-ae53-4102-b203-eddadbc50464.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/b7de4fa8-d97d-400f-bc3f-ecb1963a03ed.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/fa6ecaf9-457e-4135-ad25-4790ebc27737.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue/ebaa99c4-ff66-421d-8ba7-dae2c5fa274c.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue/e388c707-8b35-49a4-94eb-f32e983fe33e.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue/f6273192-31cf-4ee1-af45-c2f62de05330.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_math.no_issue/105650e6-d9cf-4106-9d55-6f3c08f2f1cf.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue/a1d23749-40c0-4ccb-a104-bf0de63bc2bd.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical/4e4b4cf9-48d5-4ff6-92c0-1e9d7b874b6b.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_formatted_text/3c4713a3-3973-4a04-9c4a-a6782251734e.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_how-to/de70c700-a007-4e87-a3db-941ee285eb1f.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_math/a1324a7f-1911-4fa9-8d83-be891f752a61.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_news/9c4af0df-f538-4755-8cd0-eec6b2b26524.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_software/fde650a6-a5d1-4edc-bd64-8be806663263.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked/96dd1a08-b166-4d8e-ac31-5e948adf931b.json delete mode 100644 data/hfopenllm_v2/oobabooga/CodeBooga-34B-v0.1/3b90b9db-a68e-4ee9-bd4d-a18cec357753.json delete mode 100644 data/hfopenllm_v2/oopere/Llama-FinSent-S/444a6ace-77d4-4d93-b80b-ff5c7e2f6888.json delete mode 100644 data/hfopenllm_v2/oopere/Llama-FinSent-S/7e11a778-fccf-4a91-81cf-c06f1a5c77c4.json delete mode 100644 data/hfopenllm_v2/oopere/pruned10-llama-3.2-3B/e5d126d7-e0bf-43dc-95c0-184ea1d586ea.json delete mode 100644 data/hfopenllm_v2/oopere/pruned20-llama-1b/d05b129c-6b9e-4e6b-80fc-af65db620c5d.json delete mode 100644 data/hfopenllm_v2/oopere/pruned20-llama-3.2-3b/d9792fac-29c1-45b2-b649-cdebb6830e2f.json delete mode 100644 
data/hfopenllm_v2/oopere/pruned40-llama-1b/fcc2f06a-e6c8-4c28-bf22-4ee582392912.json delete mode 100644 data/hfopenllm_v2/oopere/pruned40-llama-3.2-1B/c6e13327-90b3-440d-9367-dbcec54dd6cc.json delete mode 100644 data/hfopenllm_v2/oopere/pruned40-llama-3.2-3b/30b02429-350c-4d86-aded-ba8597bec4d5.json delete mode 100644 data/hfopenllm_v2/oopere/pruned60-llama-1b/7d1ee802-106e-4313-ba1d-72d5a0676c88.json delete mode 100644 data/hfopenllm_v2/oopere/pruned60-llama-3.2-3b/1b3af020-f65e-44b8-a9a2-ad60fa686427.json delete mode 100644 data/hfopenllm_v2/open-atlas/Atlas-Flash-1.5B-Preview/6e40871d-bc23-4f1c-a005-f5b8eb096f84.json delete mode 100644 data/hfopenllm_v2/open-atlas/Atlas-Flash-7B-Preview/1ab33ed2-ea3b-4c6f-a2ac-2465ddd844f4.json delete mode 100644 data/hfopenllm_v2/open-neo/Kyro-n1-3B/ec601f5d-bf19-4407-ac41-6b9272d94735.json delete mode 100644 data/hfopenllm_v2/open-neo/Kyro-n1-7B/87e53761-e8b7-4032-ae7a-c3a91704d115.json delete mode 100644 data/hfopenllm_v2/open-thoughts/OpenThinker-7B/59492d86-4b85-4865-84e9-84ab4ace630c.json delete mode 100644 data/hfopenllm_v2/openai-community/gpt2-large/cc082df2-259c-44c1-abe4-ef349056a2a9.json delete mode 100644 data/hfopenllm_v2/openai-community/gpt2-medium/3f069053-b24e-4242-9302-d46b82e511aa.json delete mode 100644 data/hfopenllm_v2/openai-community/gpt2-xl/62cd9bcb-a74c-40b9-be84-a0077235ae3c.json delete mode 100644 data/hfopenllm_v2/openai-community/gpt2/b4cd25f1-87d5-4173-a4d3-928444f6cb37.json delete mode 100644 data/hfopenllm_v2/openai-community/gpt2/ddd4716e-d8ae-46a1-8fb4-c27e2da40e6e.json delete mode 100644 data/hfopenllm_v2/openbmb/MiniCPM-S-1B-sft-llama-format/1e5b62a3-018b-429a-b2b4-325545ee99dc.json delete mode 100644 data/hfopenllm_v2/openchat/openchat-3.5-0106/958d410e-ce43-44c0-8a56-685c0a618408.json delete mode 100644 data/hfopenllm_v2/openchat/openchat-3.5-1210/57c53f20-aa32-49fd-926a-f26c9d0759d4.json delete mode 100644 data/hfopenllm_v2/openchat/openchat-3.6-8b-20240522/76def522-6fe1-458f-bfbf-99b50ece3367.json delete mode 100644 data/hfopenllm_v2/openchat/openchat_3.5/c467bc88-6769-48ac-abd4-867ee38bbe57.json delete mode 100644 data/hfopenllm_v2/openchat/openchat_v3.2/801681eb-66f4-46e0-bb2b-7ba4b46679af.json delete mode 100644 data/hfopenllm_v2/openchat/openchat_v3.2_super/cdd0ea1c-b17a-4816-953c-1d7164c64114.json delete mode 100644 data/hfopenllm_v2/orai-nlp/Llama-eus-8B/b2060893-1f7d-4e7a-a458-3623147ac118.json delete mode 100644 data/hfopenllm_v2/oxyapi/oxy-1-small/cf8aac35-679a-4ebb-bca8-6e0f2d42e71b.json delete mode 100644 data/hfopenllm_v2/ozone-ai/0x-lite/34bfe887-5a3a-4626-997e-c35d3a0ec341.json delete mode 100644 data/hfopenllm_v2/ozone-research/Chirp-01/b81acc47-6fd5-4f89-8c70-f8f14b677e04.json delete mode 100644 data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V1/30b977a8-7882-49be-8621-9ee3fce270ec.json delete mode 100644 data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V2/3367fd79-713c-4691-80cd-4abb6b2818ef.json delete mode 100644 data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V5/add899b8-f3e6-4d87-8846-8254f4dfbd5f.json delete mode 100644 data/hfopenllm_v2/paloalma/Le_Triomphant-ECE-TW3/53829ec0-f233-4b61-a672-6a467823caaa.json delete mode 100644 data/hfopenllm_v2/paloalma/TW3-JRGL-v2/e2b41200-bff2-4835-a0ea-27ff56937570.json delete mode 100644 data/hfopenllm_v2/pankajmathur/Al_Dente_v1_8b/3d33f26d-72be-451e-bcf0-501e0bc2f1db.json delete mode 100644 data/hfopenllm_v2/pankajmathur/model_007_13b_v2/3b4c05fc-2ccf-46db-8d64-045508f6614b.json delete mode 100644 
data/hfopenllm_v2/pankajmathur/orca_mini_3b/af83a91c-3b07-48c6-9726-5bd77347f810.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_7b/48759b07-9aea-42bd-8d73-9c4208d2789f.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_phi-4/68820679-55f4-494d-91a0-0db1bccb8983.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v2_7b/029774ac-a63d-4acc-a37c-4194e4afdecc.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v3_13b/146df856-e2c8-41eb-b860-ceb78c126e55.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v3_70b/74c6bea7-ad16-4f08-a2b7-9c894b9ce207.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v3_7b/b5e97b2d-d8a2-485a-8b0a-71590e4a376e.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b/e79d0a8c-caec-4dec-b119-3229ffa69a73.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_dpo/2c760893-b52a-40a9-9420-fb193a62a5c3.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_orpo/ef9b84e0-68b0-4caa-9980-96ea5e7f440b.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b/fb48aff8-3f6b-4934-9fb8-d72bf8614d6f.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b_dpo/9450acd9-16b6-49a2-9b73-cf1161b96df3.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v7_72b/0d50ec2d-5dd4-487e-80cb-9533246a9876.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v7_7b/f6e6827d-fbf8-49cd-bdad-e8c7ea87550a.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v8_1_70b/c5e48fd8-0eea-46a9-8790-1745923561d3.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_0_3B-Instruct/870c7739-8886-47df-8e20-09bfae03b9c5.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_1_1B-Instruct/d8eb5fd1-f1d4-481d-85af-88a11d7b6f6f.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_14B/6625b2e0-1f65-4dc5-9913-ceb0e82e6439.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_70b/24e7df20-e046-48f7-909e-502d0c70216a.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_4_70B/7920f562-9e7f-4a64-85f4-584b13af44de.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct/c6620817-69fe-40e2-bb0a-1e9c739ab65d.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct_preview/520e2d66-4143-493b-8533-64f86c6d676e.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_3B-Instruct/993bdfd2-3a88-4de3-9ed9-9b7b63c0f4f5.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_1B-Instruct/4e1be694-cc4d-4943-a8e4-74913cfb2ebe.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_3B-Instruct/42c174d1-6211-4438-bb9a-24f3cf386a6d.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_1B-Instruct/625bf39b-a118-4ec6-82d0-5405cf70ba53.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_3B-Instruct/e09cb198-d259-42ea-a356-6efe61b1e12b.json delete mode 100644 data/hfopenllm_v2/paulml/ECE-ILAB-Q1/5838b130-c2e6-400c-80b7-6822efb5db2c.json delete mode 100644 data/hfopenllm_v2/pints-ai/1.5-Pints-16K-v0.1/52b51638-64cd-4b19-8fc7-c223d50bc549.json delete mode 100644 data/hfopenllm_v2/pints-ai/1.5-Pints-2K-v0.1/28b3178b-c963-4267-9649-3f7fc10fba3c.json delete mode 100644 data/hfopenllm_v2/piotr25691/thea-3b-25r/748298a2-5042-4636-ac7e-051c28916f3a.json delete mode 100644 data/hfopenllm_v2/piotr25691/thea-c-3b-25r/03bcd4e6-1620-424a-9200-c0cf4b73bbd2.json delete mode 
100644 data/hfopenllm_v2/piotr25691/thea-rp-3b-25r/c7fba530-63cc-4ece-a171-4a2919aa8057.json delete mode 100644 data/hfopenllm_v2/postbot/gpt2-medium-emailgen/c25c1046-a8d5-4f4b-9a72-c4591cfb4023.json delete mode 100644 data/hfopenllm_v2/prince-canuma/Ministral-8B-Instruct-2410-HF/c3800a5c-310b-41cb-9b07-cfc1f1b13256.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Base/e8e2b99f-cf83-4776-9117-aa2b5d9c8068.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/2da19e45-117f-446b-b956-b35a20bb7411.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/9e982a33-19cb-4381-8560-884bc8946a2b.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Base/9130a862-cfd7-47ce-a92a-f60438739491.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Instruct/858d3717-fcb2-45d9-8eaa-1b00ae0ca918.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-CPO/5f1f137b-cb2f-4ee6-8bc9-5e0b94939f35.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-DPO/6feca911-7a6e-43a2-b59d-7cb48070fe8e.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-IPO/d3ad9813-273e-47de-be16-312cc67ac64f.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-KTO/317205ee-2cc6-4523-9662-be6508314b08.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-ORPO/3b5fe65a-50a1-4036-b81a-86117356cab9.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RDPO/812ac262-97f4-485e-93de-f8d420b8658e.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RRHF/39cd7eb0-781e-47b6-8eaa-c72e702f778f.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SLiC-HF/9411a8a4-306e-43da-96d7-c93eb3aac398.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SimPO/c93feb32-0526-44ac-b3ed-95f08c37cc9f.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT/1a3b0f7a-afb6-4002-9321-23a86f000c5c.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO-v0.2/8d29363d-3096-4c54-a40e-acf4a7318a04.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO/8cea452d-63b8-4e82-9511-64c94f8e140d.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO-v0.2/5e5b5424-1d48-4a5e-8775-52c75609c338.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO/73787033-ed1d-4d2e-b7b2-e886ef6f1036.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO-v0.2/54c9403f-2525-45c0-a585-9ff598f95f6b.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO/77d0d88d-7ca8-4f3e-8b79-295f53140635.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO-v0.2/727f27e3-2a3f-4572-8db5-87e498c4b6ca.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO/b6e0cc97-27cf-4082-a908-95d5c39014b8.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO-v0.2/3b77ec51-fd47-4bc7-9e96-ed46202fef7c.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO/b24cdd3f-3e44-4ebe-b2b4-209ee0bbfbd3.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF-v0.2/e47a3cab-dfef-47f6-9377-9ee32489bab6.json delete mode 100644 
data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF/1e4481fe-458b-4c23-8a6c-55439fb8b4fd.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF-v0.2/6421e9dc-e7ca-4e1c-9f4f-1d1ac409c4d1.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF/55f43b53-6ed9-4c16-bf75-c968999a6f36.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO-v0.2/6ce93e70-04b1-46b8-b3e3-7eb0df35e1c1.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO/95096a89-2baf-4b14-bc6e-1f30e920c086.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-CPO/f1651632-2787-47cf-b471-89d1b89a6b01.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-DPO/e1fb2ac9-8f60-4dc1-9e0d-99fcb91a53a9.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-IPO/d3accbc1-d698-4357-ab08-0b98fb49b4ed.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-KTO/5388a25a-5780-4ae1-999f-172b558a7b52.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RDPO/9e4143ff-d461-4fdb-8bc7-86f959f69e68.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RRHF/5d843bd7-b34b-41d4-92ff-c25a709b4930.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SLiC-HF/87975b2f-298b-4297-8f4d-e5bb1bf5d113.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SimPO/41bb8174-f3d6-4862-b892-dbc9f6e2e696.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-CPO/683ad2cd-5e39-4088-b98b-94d89dda7b88.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-DPO/08ffd7ab-ccca-4258-be6d-cbc151cc43aa.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-IPO/4b6efad4-c697-4f0a-8d24-75dc49d8ec06.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-KTO/4986c30a-85b0-4263-9be4-d69c9b067e0c.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-ORPO/47b5a878-1a4a-425f-ae6f-ac286f681cca.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RDPO/992a6862-46b9-415e-858f-2eff8709ca81.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RRHF/c6391381-c973-4068-b72c-af08762d9e5c.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SLiC-HF/0f6e18e6-1b0f-43f4-a9af-6632f6ce63cc.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SimPO/56d9ee92-6774-4c9b-9861-c5f0a9945e7c.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-1.3B/d3e753cc-37fc-4d77-8b2d-da90a7843d60.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-2.7B/eb08ef6f-6631-47c4-8f52-bf9454ad34b6.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-DPO/2207b154-c5d4-4e5a-ade0-271e62d6345f.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-SimPO/f4161154-7777-4261-9275-a3002a1305d8.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Bellatrix-1.5B-xElite/8523812d-1db6-4a9d-b06b-ac904191789d.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1.5B-R1/6cd9ea81-618d-444e-a892-d4f9819daa67.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1B-v2/2217326d-377a-4503-8180-206c12c87436.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Blaze-14B-xElite/3bbb10fc-e3b9-4c6a-ac35-ee5de9ecd330.json delete mode 100644 
data/hfopenllm_v2/prithivMLmods/COCO-7B-Instruct-1M/01124f11-b739-422b-97f7-062074b8d0fb.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-1M/7cc4c93b-7c43-4bed-84a3-fa1cd9130abb.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-Stock/bf3aa551-f9c6-4203-b2d4-55cf9e6e2872.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/2eae8905-5338-4a78-86e7-d354d06efa23.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/9dcc4121-e046-49c7-969e-7255b0c32d3d.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2-R1/dd7d4acd-549a-467b-b461-0eba5b019122.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2/159969cc-32c5-4f6f-b586-8e6d44180b44.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite3/b80e559d-e519-4678-8abc-ee5591b81fac.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite4/90c137c9-939d-4e77-9fcc-9e33551a6121.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Merge/f25d6fef-d337-4cf7-ba05-ca6ff5eccd52.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-20B-v1/c6f92306-dcdc-4549-bfc2-feb62a3a6ef6.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Codepy-Deepthink-3B/96c64d23-d23d-486c-83a4-4c0ab4f09d60.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Coma-II-14B/243abf0b-0f88-4b4f-ab51-6c8aebaf19be.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Condor-Opus-14B-Exp/438fb728-d6ad-4c28-a43c-ff82d522cd50.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Cygnus-II-14B/94b45b8d-b754-4fb4-843d-b7ffeafc4f1b.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Deepthink-Llama-3-8B-Preview/5618fc82-d455-4261-8e34-1190d70fd3f3.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-14B/395f6339-3fca-4f4d-befc-2d231008efdd.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-7B/b22696ac-7074-44f2-b72f-c59ca0a41ce6.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Dinobot-Opus-14B-Exp/6856f8b6-a719-4f69-be71-4df582015f28.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Elita-0.1-Distilled-R1-abliterated/f2c0ea2b-76ae-4469-832e-84c0b79fa283.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Elita-1/5619e3cb-eb3e-4420-a156-6f7b2a5d372d.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Epimetheus-14B-Axo/9d5e329f-491a-4608-bcac-1ee63046b34a.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Equuleus-Opus-14B-Exp/80953f08-6530-4bab-a375-cc542081aabb.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Eridanus-Opus-14B-r999/0b8691a8-f394-4da3-a67b-faa1af9b42c9.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Evac-Opus-14B-Exp/fb541a2b-d9bd-4aa2-8b83-da62a3b77731.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/FastThink-0.5B-Tiny/c20d1c62-d3e0-4e30-b0d3-4c62a6585d23.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview/8a10eeb6-7178-4c78-8940-68fad78e389b.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview2/f0bb774c-a842-4261-b817-b169ce65a493.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/GWQ2b/59afe234-3a7f-49bb-873c-df6cf793e5e5.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Gaea-Opus-14B-Exp/4074081a-66a6-42e4-994f-72541f90888b.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp1/6a618ec8-c029-49ec-9ea5-da52b5231280.json delete mode 100644 
data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp2/edc8f510-c961-4c1f-9757-e80c4247f275.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Gauss-Opus-14B-R999/aaa5d1e6-5aca-4471-87ea-7195610a6c1d.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Jolt-v0.1/89b45e8b-9979-4c7f-8aa6-c6ab7009cab0.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Lacerta-Opus-14B-Elite8/41000c74-8b29-4369-996f-cf3a2fd09f63.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-3.1-5B-Instruct/a1765846-74e1-440a-8851-12a571444059.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-3.1-8B-Open-SFT/9c6b594f-387a-42a3-9e40-3b26363e6071.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-3.2-3B-Math-Oct/2b910401-457a-45dd-920a-559f4595897b.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-3.2-6B-AlgoCode/90b7be49-53a0-4d7f-8995-cbc52fe3a70f.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-8B-Distill-CoT/5e8854ba-7147-4fdd-a568-1ea58e79e7d8.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-1B/df6e0cfb-d720-428a-a5ad-b1529faa07c0.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-3B/a88a6e6f-2253-4b67-9527-55ab6153e40f.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-Express.1-Math/00c66a37-b46b-47e8-a098-ce12433c1135.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/LwQ-10B-Instruct/6ad5483c-13dc-4e79-a719-66af383d195a.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/LwQ-Reasoner-10B/9fa6813a-7acb-4c08-9912-6dc0d356a7e2.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Magellanic-Opus-14B-Exp/3880e3bf-6ff0-4eef-a519-2649014254e1.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Magellanic-Qwen-25B-R999/e77efb9d-b1fc-4833-8e7f-8da683019018.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp.v2/2bcc02df-8d27-412a-8b58-c331df98e4d4.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp/622531d5-03f8-42cf-974e-94291aa1e515.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.0/b772f20f-afbd-496c-9f94-e5fd30d54466.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.1/169d5ad3-ae4a-42de-b951-f264d85bf623.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Exp/e84c3b50-4ea9-4f41-be11-50c6aa3d4656.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Stock/594780dc-d969-4a6b-b90b-1cc32f40c452.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Megatron-Opus-7B-Exp/4ff7c238-d69c-4b92-83d0-69cacdfa0fe6.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Messier-Opus-14B-Elite7/bb576dc9-eede-48d6-b438-732da91a4d29.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Omni-Reasoner-Merged/0fb2fe17-b55d-4802-ad48-bd4d711e1e0f.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Omni-Reasoner3-Merged/03d59002-dc98-467f-b2a9-605ef8d9b763.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Pegasus-Opus-14B-Exp/8a7034fd-7027-4a87-9cac-c95b745935d0.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Phi-4-Empathetic/717f745f-1eae-4277-8a31-dbed140ef3e8.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Phi-4-Math-IO/2dc78735-c0c3-4dd7-8e97-52c92785e623.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Phi-4-QwQ/e9ab98ff-5cf0-4437-9cf3-c77ecb546c84.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Phi-4-Super-1/6303d73e-4129-472a-a6fd-c64cb3de7204.json delete mode 100644 
data/hfopenllm_v2/prithivMLmods/Phi-4-Super-o1/8a689e8f-19cc-45b7-80be-ce861a549af7.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Phi-4-Super/84881315-55a4-4f05-a115-cf82f850090d.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Phi-4-o1/970dc71c-42be-4d50-86ac-f7301ec969ca.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Phi4-Super/c02e1fcf-a837-4b8a-a42d-63837c56128d.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Porpoise-Opus-14B-Exp/37280340-5b9a-47d9-aa37-9299d9025518.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v1/46e7ad9b-b774-46b9-933c-913d1b307f7a.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v2/c154d3f5-39dc-43c0-85ea-2e43b08494b4.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-14B-Conversational/abd830e4-2b7f-4895-8262-75926edbafd9.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-3B-Instruct/2c945021-72e3-4e7a-9c6f-81efb27b2206.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-7B-Instruct/5f0ea694-7f73-45fa-b54f-49fc06d1a6d9.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-LCoT1-Merged/6c73f6ae-8ffd-4948-8071-33eab07437a6.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-LCoT2-7B-Instruct/fbf71df3-b9c3-4f9c-b538-e4ccf097e81c.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-MathOct-7B/e3dcfd94-ca04-4cd3-ada5-e701a8b776da.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-1.5B-CoT/9278bcf2-bfab-437f-bd64-7496b24fb8cf.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-7B-CoT/633aa068-5613-41d8-a194-aebc9ce1586f.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Qwen-7B-Distill-Reasoner/d3c1a922-a453-4c7b-b33b-52934e7bf72b.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Qwen2.5-1.5B-DeepSeek-R1-Instruct/3a27b2a6-5eea-450b-91c7-1dc006229985.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Qwen2.5-14B-DeepSeek-R1-1M/395e37ae-005d-47c0-9cf5-919460e34350.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Qwen2.5-7B-DeepSeek-R1-1M/b03b7c7a-f263-4712-bcf4-2e32ca4bd237.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/SmolLM2-CoT-360M/452ab810-6921-4922-9446-f2a5c081dc61.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite5/1abba5a0-f1a3-4f39-a81c-f4cd641d33ac.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite6/b2eefd3a-795c-4dc0-a10e-924bece05ea5.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm1/008cc919-f156-4a2e-af4b-eed015ca91f6.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm2/9d56082f-5e46-4d7a-8f06-cb44fc983b3f.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm4/7ea26e73-a501-40bf-8f01-81ab8e850a91.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm5/e3343130-cf4f-4e5c-b2d3-5dda13d575b9.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Sqweeks-7B-Instruct/ba1965f8-b59f-4d71-920c-e3b401ca0534.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Tadpole-Opus-14B-Exp/6dc87410-a39e-41b1-8759-68c1556c8419.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Taurus-Opus-7B/c4ebe788-fb60-453b-914b-56bf87dd6374.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Triangulum-10B/45a44cc8-a550-4d2f-b0f4-37b4aac6a2b5.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Triangulum-5B/10593c13-3b30-4605-8063-c6a6526fc9d9.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Triangulum-v2-10B/12b8f4d7-2ae8-492c-8756-f7cb21a58c76.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Tucana-Opus-14B-r999/96d9b675-c299-4138-a381-fb4de36287e5.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Tulu-MathLingo-8B/17fffa9b-8ed4-44c7-87ea-7ee2c1f28e6a.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-7B-Elite14/8999a5f3-f421-4663-835e-7626cebd2282.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.2/951e1a4f-ed6c-49ca-b648-6086989e333f.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.3/2acc0666-e0ff-4760-a74a-227a02775344.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-HybridMini-v1.3/3196c71d-0e0a-4d29-8bca-c31ba3d99dfd.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-v0.1/e858aa6c-c424-447e-b512-7dcf794f9f0f.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.1/8773eac5-205e-4264-981b-58f1a25f872a.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.6-r999/c26ae286-a9b8-499f-b886-4b75be0cf2da.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.7-Vsm6/d3a61998-2d41-4349-bd15-ce29143cc910.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-OneCoder-UIGEN/56b66428-2751-4c62-b98c-6c60e58c45ca.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Volans-Opus-14B-Exp/9b2ec4af-4a7c-4cf7-8b7d-79b6cc219880.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/WebMind-7B-v0.1/5855a920-428f-4699-becc-73d4422f706f.json
 delete mode 100644 data/hfopenllm_v2/pszemraj/Llama-3-6.3b-v0.1/f1004f08-7f46-4eb1-8f60-66893fca7180.json
 delete mode 100644 data/hfopenllm_v2/pszemraj/Mistral-v0.3-6B/97db158a-3035-45d3-8d92-a08c9e605493.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/LLaMa_3.2_3B_Catalysts/0d81b928-2a24-4eb4-93d5-224e3c505532.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/OpenMath2-Llama3.1-8B/bf4cc7ee-cad4-42af-8638-6b371577ec68.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/Oracle-14B/5b574dda-0d85-47aa-9ebc-7f8581d402ca.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/Oracle-14B/6043830f-8a9d-4a03-9de5-4805724a9ae8.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Alpha/9d5fdb25-0d6a-4d5c-bcfb-0903504e620a.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Pro/217819b0-2c4b-4c26-823b-1ea14f893e01.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/0f844855-fb46-4b53-82c2-f36e5721c385.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/59aaa7ed-27d4-4765-b115-90570ad86c77.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/Qwen2.5-Ultimate-14B-Instruct/4478c5ff-3b51-4be2-abce-3fb6a951b6e7.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Benchmaxx-Llama-3.2-1B-Instruct/9202146d-5889-49fd-9025-e03153ba9093.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Eyas-17B-Instruct/94257d3e-2b1e-47a1-bbd1-7fc696a574b3.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Falcon3-2x10B-MoE-Instruct/2245cf71-fb8d-44ca-b58d-06608312ee8c.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Fusion-14B-Instruct/9a823fde-7802-4876-b72c-d8f73cd17236.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Fusion2-14B-Instruct/ede99239-ef8f-49eb-a48b-0ec2553c99e5.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Fusion4-14B-Instruct/4a307570-994f-491c-87a7-ad90b7965b8b.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/OwO-14B-Instruct/eb448d78-6417-4533-8458-99c1869a74ae.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/QwEnlarge-16B-Instruct/e1b8e4ad-4327-46b9-b957-fbd02e57c87e.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/QwQ-14B-Math-v0.2/aab6b224-b948-4fb1-84b7-0dbe5c46d527.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwarkstar-4B-Instruct-Preview/2e5cd1de-6109-4f76-b722-abbd4b207f4d.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwarkstar-4B/767d1296-4971-478f-8d78-1d63d162ae5b.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwen2.5-4B/eab74e3b-de61-4fa9-87c2-56e69b70349a.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwen2.5-Coder-Draft-1.5B-Instruct/3219d563-3bfb-4618-8cb3-e9b198d5b11f.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Alpha/233fd27c-561e-4c9e-a917-cbc5b08c055a.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Preview/a875e8f7-a4e6-4c17-abbc-b8d4b73b7501.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwen2.6-14B-Instruct/4b68ba49-6681-4add-9197-2cd711701e15.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwen2.6-Math-14B-Instruct/5679ca73-3d5f-4bc7-bea2-5e9e713db0cc.json
 delete mode 100644 data/hfopenllm_v2/qq8933/OpenLongCoT-Base-Gemma2-2B/a6c631f6-890c-4199-abee-18b012bc48df.json
 delete mode 100644 data/hfopenllm_v2/raphgg/test-2.5-72B/1edc3610-40fc-467d-8410-26d4b6adebce.json
 delete mode 100644 data/hfopenllm_v2/rasyosef/Mistral-NeMo-Minitron-8B-Chat/42c773ba-8fb4-4b3c-8ac7-0688519bb55c.json
 delete mode 100644 data/hfopenllm_v2/rasyosef/Phi-1_5-Instruct-v0.1/1a371df5-447f-4fd8-8fe8-dbf9a1dc079a.json
 delete mode 100644 data/hfopenllm_v2/rasyosef/phi-2-instruct-apo/821a21a0-6fd7-438a-933d-5e31b2dd2adc.json
 delete mode 100644 data/hfopenllm_v2/rasyosef/phi-2-instruct-v0.1/781a4cc6-a69d-4106-81aa-06e114f7c897.json
 delete mode 100644 data/hfopenllm_v2/realtreetune/rho-1b-sft-MATH/e49c98b4-46f4-406e-9eeb-7072bf72b9a3.json
 delete mode 100644 data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/3b7524a8-d17b-4788-93f2-11076df464a7.json
 delete mode 100644 data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/6188a57f-4bc3-42a5-ad18-c59774e40407.json
 delete mode 100644 data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.1/28689805-7c4c-438e-8431-f4a6aceb5e94.json
 delete mode 100644 data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7c156689-9668-4ded-bacc-c88a03ad1526.json
 delete mode 100644 data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7e43f187-1959-4dfe-802f-094ba88f3b0d.json
 delete mode 100644 data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/a6170173-ef17-4cfa-a76e-8e51cb8cb970.json
 delete mode 100644 data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/e998d52b-dd94-4ef2-9cfc-5034ded0105a.json
 delete mode 100644 data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.4/a3ac60bd-8fb3-47d9-b378-1f0c4d74fed2.json
 delete mode 100644 data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.5/0f69217c-74ed-4398-8d1b-53d1a43be890.json
 delete mode 100644 data/hfopenllm_v2/redrix/AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS/b973adcc-769c-4009-87c5-5f5af02a5d3a.json
 delete mode 100644 data/hfopenllm_v2/redrix/patricide-12B-Unslop-Mell/4b30f11e-a2b9-40e9-b080-9d7484a5d048.json
 delete mode 100644 data/hfopenllm_v2/refuelai/Llama-3-Refueled/befdae09-4caa-4996-a3ac-fe36310aaf01.json
 delete mode 100644 data/hfopenllm_v2/rhplus0831/maid-yuzu-v7/8cd7fc1b-2873-4154-9de7-c0b8e5f4f5e9.json
 delete mode 100644 data/hfopenllm_v2/rhymes-ai/Aria/7f6e5858-f5d4-41cf-9bb7-c3c82a55c392.json
 delete mode 100644 data/hfopenllm_v2/rhysjones/phi-2-orange-v2/7b8bf84f-4101-41a1-b6ff-9cadbb5f84a3.json
 delete mode 100644 data/hfopenllm_v2/riaz/FineLlama-3.1-8B/1f3a733d-a6d3-453b-9763-61992cd514b0.json
 delete mode 100644 data/hfopenllm_v2/riaz/FineLlama-3.1-8B/d0eed3c1-2226-48c5-a314-e429f66c5053.json
 delete mode 100644 data/hfopenllm_v2/rmdhirr/Gluon-8B/957f02f1-45c7-4cce-b5aa-86bb5e485ad3.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-14b/55a01e8e-318a-4609-a862-bab4d62b3e7a.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-7b/cbdcd76f-be8f-42fe-89ed-d1d09d9d785f.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-0.5b/c7b6515e-6f96-468b-8bc0-15212c31e790.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-1.5b/f27f3a1d-c19a-42b2-8b49-64ecfe5d3405.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-14b/994aa481-627a-4bed-8719-9e874373cbc6.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-32b/9f5cd849-20b1-4e8d-9deb-f286dcfd9d6e.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-3b/c4dd34f2-7acc-4a94-a9aa-3c6aeeae8a8c.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-72b/e908b473-a015-4156-8e88-d67153479cb9.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-7b/173af77d-7a51-4d5a-8fd3-366aaa5d78a0.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/0bb65f09-323d-485f-886e-5a35c8bcd342.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/86b4c877-ef2d-4563-93a2-92d7e77eab5c.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Nemotron-70b/be2ee3f6-37ee-4895-821a-3d3c7eb04eac.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Qwen-14b/e574af17-dd3b-4c09-8689-ea598d44e562.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Instruct-8b-Merged/83958185-047a-4356-918d-2f45f273c08a.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Llama3-8B/d04c6e84-0b63-4de1-9278-aa37c9d2c8e3.json
 delete mode 100644 data/hfopenllm_v2/rootxhacker/Apollo-70B/a218e260-7f56-4676-af58-254bd84d0327.json
 delete mode 100644 data/hfopenllm_v2/rootxhacker/Apollo_v2-32B/f21fb2c8-4abe-40de-ab2c-9d23e95ee281.json
 delete mode 100644 data/hfopenllm_v2/rootxhacker/apollo-7B/da5774b2-8a6f-4f2d-8267-beb25490b06a.json
 delete mode 100644 data/hfopenllm_v2/rsh345/mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B/274705bd-8eb6-4863-8998-f5d67c4ac827.json
 delete mode 100644 data/hfopenllm_v2/rubenroy/Geneva-12B-GCv2-5m/5b95cc2f-3378-45e7-9f56-6bb7e1ce4826.json
 delete mode 100644 data/hfopenllm_v2/rubenroy/Gilgamesh-72B/6918d1a3-e547-46b7-9062-274057c1f513.json
 delete mode 100644 data/hfopenllm_v2/rubenroy/Zurich-14B-GCv2-5m/599deb3c-49f9-4c0b-af8d-78f9e166820b.json
 delete mode 100644 data/hfopenllm_v2/ruizhe1217/sft-s1-qwen-0.5b/b4ea3f14-3787-434b-8f26-20ff640c0146.json
 delete mode 100644 data/hfopenllm_v2/rwitz/go-bruins-v2/6952c527-ca23-494a-910c-1c027e4a5a29.json
 delete mode 100644 data/hfopenllm_v2/sabersaleh/Llama2-7B-CPO/3f12e79c-dd1b-428d-9094-10a047205e3e.json
 delete mode 100644 data/hfopenllm_v2/sabersaleh/Llama2-7B-DPO/d508da29-0288-4a0a-b727-fc5355515c5e.json
 delete mode 100644 data/hfopenllm_v2/sabersaleh/Llama2-7B-IPO/48cf5a8a-70c6-4c55-8959-32d773d6dbcf.json
 delete mode 100644 data/hfopenllm_v2/sabersaleh/Llama2-7B-KTO/4bb7d331-f305-4c08-a073-87ba7b2cbde2.json
 delete mode 100644 data/hfopenllm_v2/sabersaleh/Llama2-7B-SPO/94639454-c525-4e6f-af27-d92d45a9ac40.json
 delete mode 100644 data/hfopenllm_v2/sabersaleh/Llama2-7B-SimPO/9fa81bb7-7abc-4764-9465-d61217590da5.json
 delete mode 100644 data/hfopenllm_v2/sabersaleh/Llama3/9a683492-4057-4de4-a30a-aa66becffb13.json
 delete mode 100644 data/hfopenllm_v2/sabersalehk/Llama3-001-300/b917df45-62f2-4c3b-943a-ad6c98ef8bc1.json
 delete mode 100644 data/hfopenllm_v2/sabersalehk/Llama3-SimPO/ba658bc7-b89d-4fb7-a794-f48bd3715a49.json
 delete mode 100644 data/hfopenllm_v2/sabersalehk/Llama3_001_200/93f79cdc-ffd7-4299-9876-c0c7bed55ae5.json
 delete mode 100644 data/hfopenllm_v2/sabersalehk/Llama3_01_300/5a91b0bf-b043-41d2-960d-5f0e78abc400.json
 delete mode 100644 data/hfopenllm_v2/saishf/Fimbulvetr-Kuro-Lotus-10.7B/263f56e5-b578-475a-9bc4-b5ffc142f9e2.json
 delete mode 100644 data/hfopenllm_v2/saishf/Neural-SOVLish-Devil-8B-L3/9219ff66-73ba-45d8-99a0-23d23b3555ba.json
 delete mode 100644 data/hfopenllm_v2/saishshinde15/TethysAI_Base_Reasoning/b2328396-e9b2-464d-94e4-f03db19144ea.json
 delete mode 100644 data/hfopenllm_v2/saishshinde15/TethysAI_Vortex/3f895edf-8f54-48ff-a731-666144af0fda.json
 delete mode 100644 data/hfopenllm_v2/saishshinde15/TethysAI_Vortex_Reasoning/b48b8e16-a555-466b-8b1c-246137223311.json
 delete mode 100644 data/hfopenllm_v2/sakaltcommunity/novablast-preview/5fdcb98f-4c50-4cdb-bd99-dd32efc6d6f3.json
 delete mode 100644 data/hfopenllm_v2/sakaltcommunity/sakaltum-7b/d49c5e72-0dd0-4663-a310-9cd9bf1f5150.json
 delete mode 100644 data/hfopenllm_v2/sakhan10/quantized_open_llama_3b_v2/0176903f-e6ca-4f21-b98a-00bc443bf244.json
 delete mode 100644 data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.0/11f32afc-95c1-4531-ae45-5a0974d36b3a.json
 delete mode 100644 data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.2/70657dd7-63cf-40f4-92a0-1097fc1ce9ae.json
 delete mode 100644 data/hfopenllm_v2/sam-paech/Darkest-muse-v1/53cf325b-6f32-4791-8f95-8b982ea03b23.json
 delete mode 100644 data/hfopenllm_v2/sam-paech/Delirium-v1/8c50491b-6ed4-4f38-9d3f-d5168600cf4f.json
 delete mode 100644 data/hfopenllm_v2/sam-paech/Quill-v1/7adf79de-a51d-4b87-989a-c218ec6d99e3.json
 delete mode 100644 data/hfopenllm_v2/sarvamai/OpenHathi-7B-Hi-v0.1-Base/92358e5a-5e73-4747-9e92-e5ac003b97f7.json
 delete mode 100644 data/hfopenllm_v2/schnapss/testmerge-7b/f1636512-b98f-4fe4-adf3-abd556dd0ab9.json
 delete mode 100644 data/hfopenllm_v2/sci-m-wang/Mistral-7B-Instruct-sa-v0.1/9333afdd-4866-412b-b11b-dfb118a06db9.json
 delete mode 100644 data/hfopenllm_v2/sci-m-wang/Phi-3-mini-4k-instruct-sa-v0.1/840c0e19-6d75-47a2-b64b-f9c51cb1dcff.json
 delete mode 100644 data/hfopenllm_v2/sci-m-wang/deepseek-llm-7b-chat-sa-v0.1/071b49f2-8e23-47b1-9858-78d676d9905e.json
 delete mode 100644 data/hfopenllm_v2/securin/Securin-LLM-V2.5-Qwen-1.5B/d3821f53-87aa-470a-a403-c8e3cd100ae1.json
 delete mode 100644 data/hfopenllm_v2/senseable/WestLake-7B-v2/389dbaba-c9cd-4e6b-afb3-f2ee3951faa0.json
 delete mode 100644 data/hfopenllm_v2/sequelbox/Llama3.1-70B-PlumChat/5f78f39a-42cc-4cf6-bb27-e2160765bf24.json
 delete mode 100644 data/hfopenllm_v2/sequelbox/Llama3.1-8B-MOTH/b6e3d811-bf9d-474e-b82d-358a44e0dfc9.json
 delete mode 100644 data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumChat/bef1cbad-4f75-4dde-b467-6145f72a87f4.json
 delete mode 100644 data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumCode/654bebe0-b461-427e-a4cf-06386e9272d8.json
 delete mode 100644 data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumMath/37ef4e34-58f8-463a-950f-48b3a6833d54.json
 delete mode 100644 data/hfopenllm_v2/sequelbox/gemma-2-9B-MOTH/20687086-8aab-40f1-aec6-03917f4f9bf5.json
 delete mode 100644 data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1206-Instruct/53a0a998-a0a6-4800-80bf-bfd83123f2f6.json
 delete mode 100644 data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1208-Instruct/4ee8df1c-e8ff-4a56-816c-0c2258a226e7.json
 delete mode 100644 data/hfopenllm_v2/sethuiyer/LlamaZero-3.1-8B-Experimental-1208/42c8d84d-c8b8-42c6-8f49-4e971df173d7.json
 delete mode 100644 data/hfopenllm_v2/sethuiyer/Llamaverse-3.1-8B-Instruct/77b57dea-22e1-48a6-b8ae-9e474f08ad5f.json
 delete mode 100644 data/hfopenllm_v2/sethuiyer/Llamazing-3.1-8B-Instruct/a9ed5d04-57d2-4566-91df-b798be939fdb.json
 delete mode 100644 data/hfopenllm_v2/sethuiyer/Qwen2.5-7B-Anvita/bad4ec47-fe84-4518-b072-6955938f0c86.json
 delete mode 100644 data/hfopenllm_v2/shadowml/BeagSake-7B/497e585c-059a-4e18-9a8f-bdaa066f59ea.json
 delete mode 100644 data/hfopenllm_v2/shadowml/Mixolar-4x7b/e24b2a4e-83e4-4a79-bc41-03a54af00595.json
 delete mode 100644 data/hfopenllm_v2/shastraai/Shastra-LLAMA2-Math-Commonsense-SFT/15e39361-585b-4870-b91a-64dce4fb37ec.json
 delete mode 100644 data/hfopenllm_v2/shivam9980/NEPALI-LLM/96efd11b-e9f2-4bf1-90f9-561714137edf.json
 delete mode 100644 data/hfopenllm_v2/shivam9980/mistral-7b-news-cnn-merged/98e9936d-d376-4c72-80a6-0a28cf722ac4.json
 delete mode 100644 data/hfopenllm_v2/shivank21/mistral_dpo_self/7ada9c83-7851-4da2-b9d1-d744b174b777.json
 delete mode 100644 data/hfopenllm_v2/shuttleai/shuttle-3/a6ed72b7-14f1-464c-a7f5-590791982696.json
 delete mode 100644 data/hfopenllm_v2/shyamieee/Padma-v7.0/79e3f38d-ae2b-44a7-be0d-024adad6bcd6.json
 delete mode 100644 data/hfopenllm_v2/silma-ai/SILMA-9B-Instruct-v1.0/ef13bdea-cf73-4ead-b6d7-73a155fa9a79.json
 delete mode 100644 data/hfopenllm_v2/silma-ai/SILMA-Kashif-2B-Instruct-v1.0/2663884f-941c-4e16-8029-b38e3a543733.json
 delete mode 100644 data/hfopenllm_v2/siqi00/Mistral-7B-DFT/ca7af645-4796-4b31-ae7d-2cbebe5a369b.json
 delete mode 100644 data/hfopenllm_v2/siqi00/Mistral-7B-DFT2/f95e098c-d320-4db1-887d-8c3252bbaf77.json
 delete mode 100644 data/hfopenllm_v2/skumar9/Llama-medx_v2/2bbf6dc9-8dd5-4dee-908e-d4a8fc03bc84.json
 delete mode 100644 data/hfopenllm_v2/skymizer/Llama2-7b-sft-chat-custom-template-dpo/5f4edfdb-a62c-4410-83a3-1ceb15d2e7b0.json
 delete mode 100644 data/hfopenllm_v2/someon98/qwen-CoMa-0.5b/aadfae06-73b6-4306-b056-0a733b9bd8f4.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/ChocoTrio-14B-v1/cfecbfbc-46c3-4dd3-8bd9-afe4cd386973.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-40/97640dd1-d415-4b56-818c-cdcede3c52fd.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-80/b750c460-ef70-4abf-b77d-118a82039598.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/KytheraMix-7B-v0.2/f4c20519-9e33-4698-a17a-07e5fe7d2707.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.1-experimental/0f204733-55b4-4c06-bd12-dbc2e2593abd.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.3/0bb226ed-fe88-4678-9b50-f77883ceb708.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.4-Qwenvergence/fb297e45-9e14-4853-8384-75c187b28a9b.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-002-model_stock/4f6eba27-2ab4-4b33-9568-814d15fbd6b9.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-model_stock/c3bc3d69-a987-4dd0-b6a5-e0ecc50034fb.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6/5d02ba78-cf8b-44ee-a1b3-e51ecf437d89.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-Fusion/4a43fa67-2438-4c2a-b17b-9d2f221e5a86.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc1/2c044767-1169-48c6-9e37-e9d1e35f4cfe.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc4/bad67b35-d9ef-417a-955b-9c33e87cb927.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v1/60eaa315-f489-405d-a67d-7f1312e90cab.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-hi/50de312a-293d-41a4-8bee-4feb0c148b90.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-lo/56f24cac-394c-4439-8f2e-8270e7519bda.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2/8efa1423-0a39-4674-a94d-3d92448010d6.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v3/350b3491-cba8-46b4-a07f-3d1277270530.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen-14B-ProseStock-v4/0741ead7-24f3-49b0-9967-f726df84f78a.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen-2.5-14B-Virmarckeoso/1ea4d10e-e099-4967-8c43-e84acaeb40be.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v2/6c78d9f7-a61e-4f65-ac57-61597f735541.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-IF-Variant/e9bcfb1f-c688-4e7a-918a-e697adaf7aa5.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-Prose01/153cfe7f-c27a-40b8-b8d2-54351f26f583.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-model_stock/b58372cd-5d55-4f42-a5da-2970e55b44b0.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3/34a028ac-2002-480c-a1af-5b945ffe872e.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso/065ffc51-154c-4a93-a342-0dd476fda473.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Prose/ebc74f4f-157d-4ee4-8b99-9fb5b685afd5.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Reason/91004d26-7b8b-4c0a-bd8c-8880654dc93a.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1/5eb1aa92-a031-40d4-ad64-552075dae68a.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentessential-14B-v1/3ebc147d-58f2-4605-a011-a71c591fac0e.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v013/01795776-e909-46d3-8b6c-0989334e3d0e.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v1/00dffa94-31f9-4b5c-b032-03dd20fc2e8d.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v2/736249d0-cea9-46c6-9677-ecae4b410af4.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v3/ef602cfe-3453-4189-b583-292cf05421d1.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v5/559af2c1-deca-4c35-b83a-004c22ac958a.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6-Prose/8d66d895-626a-477f-91b6-2195f35aacb3.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6/004df803-70da-4e59-b3ad-f210c790f29e.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v7/bb2972ca-e673-4be5-bc7e-2689adeac3a9.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v8/eacf2411-a0ea-41fd-8363-e565fce0f26f.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v9/4eefe3cd-ff42-4d4c-89c6-c3e48d8c85e9.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-qv256/f19dab38-48ed-438e-8a62-86e4d111f6c8.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v0.6-004-model_stock/ff4b6d28-62e2-4671-8df9-690ce7f13f0b.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v10/9c05a7e4-f495-41d0-a7f0-1959e7434ba2.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v11/404e3d61-26d3-4f95-9847-064f0c7c6970.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose-DS/0b4574f2-1b71-427f-9923-17db449be191.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose/775b88cd-98e8-4d93-acca-e294f68f2da2.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v13-Prose-DS/89464568-47cb-4659-af37-8b061d3f0c8c.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v15-Prose-MS/9fad9d73-acbf-4ffc-886c-551c1fe1ed45.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v2-Prose/c1882335-0df5-4df2-bfa1-c16126c328fb.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Prose/291471ed-3b7c-4bd4-91bb-c27cd74ec460.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/53565fe4-0368-477b-9916-ac9a4b8a9c7b.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/f6cb5e9d-c4c9-44a2-9adf-7fa5639d84d9.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3/e51fee25-7648-49d9-a8da-b8dbc68a722b.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose-model_stock/6acdc96b-cfde-439f-b6b3-a66257b3fcde.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose/850da8de-ca13-4f15-bb9f-68b910355cfd.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v8/542fbb7a-d4eb-4cbf-b63a-4305cb108361.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v9/1dbb8206-6a86-4e2c-8ee0-d80fed014a69.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/lamarck-14b-prose-model_stock/6341de3c-8d4c-4af8-8f0d-c81e948bacd6.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/lamarck-14b-reason-model_stock/e6cb6a87-6db8-4aee-bede-ce8a60dc8f4a.json
 delete mode 100644 data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415/5113439d-1394-46f2-a38e-34b54e94a9e6.json
 delete mode 100644 data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205/a03d88aa-7ccd-4f8a-9a1e-c9469d3ae559.json
 delete mode 100644 data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522/1cfb40a7-7373-417c-aa1c-f6ab63ecb3b8.json
 delete mode 100644 data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbc-213steps/446ac93f-d47c-4207-bf32-0cd94e88a931.json
 delete mode 100644 data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbo-180steps/7e4ba4f8-2768-4e7b-a11d-75ad22a47c45.json
 delete mode 100644 data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbr-180steps/ca77f821-4722-45b1-b731-7d774232acb4.json
 delete mode 100644 data/hfopenllm_v2/sophosympatheia/Midnight-Miqu-70B-v1.5/f32d2a11-edd3-4662-aed7-88c6820b2c2e.json
 delete mode 100644 data/hfopenllm_v2/speakleash/Bielik-11B-v2.0-Instruct/71c56883-dd14-4f16-b839-5ce607a4aadb.json
 delete mode 100644 data/hfopenllm_v2/speakleash/Bielik-11B-v2.1-Instruct/639004c2-81a5-410d-bd61-e3e263f55335.json
 delete mode 100644 data/hfopenllm_v2/speakleash/Bielik-11B-v2.2-Instruct/5f232a99-07c9-4df7-9d3b-837966ea6de5.json
 delete mode 100644 data/hfopenllm_v2/speakleash/Bielik-11B-v2.3-Instruct/482e34ee-8974-46c6-b3f4-4cc9872ef562.json
 delete mode 100644 data/hfopenllm_v2/speakleash/Bielik-11B-v2/13743252-3ba3-406d-8e95-5a4cd3ac3772.json
 delete mode 100644 data/hfopenllm_v2/spmurrayzzz/Mistral-Syndicate-7B/ff25cb66-ed6f-421a-a038-1feb24666645.json
 delete mode 100644 data/hfopenllm_v2/spow12/ChatWaifu_12B_v2.0/843f0d9a-04e8-4cea-bb18-94651a814d1f.json
 delete mode 100644 data/hfopenllm_v2/spow12/ChatWaifu_22B_v2.0_preview/fa3ccf4a-9b26-4a76-a974-3a776adec7c2.json
 delete mode 100644 data/hfopenllm_v2/spow12/ChatWaifu_v1.4/ef4ac8ab-4ff5-4fce-94b6-443b1ef7964f.json
 delete mode 100644 data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/468bbea7-6dee-4a1a-84b3-e44b0f3ab95a.json
 delete mode 100644 data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/bd8fdfa5-bda1-402b-9010-94bf78b0127b.json
 delete mode 100644 data/hfopenllm_v2/ssmits/Qwen2.5-95B-Instruct/a0b34b40-3e68-463f-a7fa-3c58c15aa16d.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/StableBeluga2/dbf4fbac-cd99-426d-b725-600e60af00d2.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/stablelm-2-12b-chat/f793c471-1638-476a-a050-455a32368e29.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/stablelm-2-12b/1d9c1beb-f84b-4eb7-9c1e-ce5a70afabfb.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/stablelm-2-1_6b-chat/99396d97-d875-4cd9-a8a1-a9aec5c43bfc.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/stablelm-2-1_6b/82a44b46-156f-4232-92e4-6a08d7a4f197.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/stablelm-2-zephyr-1_6b/3b40defd-5a2e-4d6e-838f-dbbbf12236fb.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/stablelm-3b-4e1t/dde41cd5-e6d1-43a9-9593-1a5751bc5f44.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/stablelm-zephyr-3b/1cffcbeb-ef81-4efe-b883-0a8540a799e7.json
 delete mode 100644 data/hfopenllm_v2/sthenno-com/miscii-14b-0130/033ef96e-3d2d-49a4-bbff-8bc815a1b40e.json
 delete mode 100644 data/hfopenllm_v2/sthenno-com/miscii-14b-0218/bfe654b8-cb79-4845-bf14-85012207ce90.json
 delete mode 100644 data/hfopenllm_v2/sthenno-com/miscii-14b-1028/5c4efc23-9591-447b-aecc-4c82797d7d01.json
 delete mode 100644 data/hfopenllm_v2/sthenno-com/miscii-14b-1225/a5fe3fab-95d9-41ac-a95f-66205e489dae.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-0120/c0bf8ffb-444a-43a3-9514-76aa92c5f5b7.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-fusion-0309/3d556d9f-036b-4368-bb4a-18ad6b444bdf.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-kto-0205-ckpt80/92905e27-1033-4423-b87d-23236f9be964.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-001/17326bb0-42c2-469a-ac19-6a4b75d9e6e2.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-0124/11574f56-6c34-48e4-8fb5-c58d42f07330.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-ppo-ckpt40/8f728c51-15f9-422d-bbdb-4d976961ab9d.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-sft-0309-ckpt10/8d6e4b5e-ad17-4390-bc6b-ab6581a62442.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-sft-0314-stage1-ckpt50/5e33bf05-6c67-4ecc-982d-7590e9953145.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempestissimo-14b-0309/f55ae879-bd95-409c-a8a3-9a57cd615a31.json
 delete mode 100644 data/hfopenllm_v2/streamerbtw1002/Nexuim-R1-7B-Instruct/b8426ac9-14f1-4e07-9c7e-b50cb2c7a1e3.json
 delete mode 100644 data/hfopenllm_v2/stupidity-ai/Llama-3-8B-Instruct-MultiMoose/51fd90b0-0d5a-4199-ba5b-ff29eeeab06b.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Clarus-7B-v0.1/c46e4fa1-afae-4b68-a13e-034b5cd2b779.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Clarus-7B-v0.2/42cc06ed-20fc-4e84-836f-3d7243ec336d.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Clarus-7B-v0.3/aaa53387-af33-4454-95f0-3af85f4778c0.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/DeepSeek-R1-Distill-Llama-3B/465bca6d-b32a-4d34-9916-fc8b3166faa0.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Falcon3-Jessi-v0.4-7B-Slerp/bf138f3d-09d9-4dea-aa43-5efc804bc775.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/HomerCreativeAnvita-Mix-Qw7B/cb4e944c-66f6-49f2-b1e0-d90454e34315.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Komodo-Llama-3.2-3B-v2-fp16/b2b6bc49-bda1-4a3e-a071-ec0a0bdc1313.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Lamarckvergence-14B/933f3d40-8726-418f-be2f-1f9686e9ab02.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Lix-14B-v0.1/af1bf15c-7c5f-46fa-ba3a-821b521e86f4.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Luminis-phi-4/43df4336-1eb8-4df7-8309-1199aafc07b1.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Maestro-10B/44ae222d-407c-4c8b-9b67-75440631f848.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Rombos-2.5-T.E-8.1/a87db0fe-3727-4ff1-875f-9edd3109f3a2.json
 delete mode 100644 data/hfopenllm_v2/sumink/Qmerft/0c73e33a-7f6f-4925-970b-db289069d5ca.json
 delete mode 100644 data/hfopenllm_v2/sumink/Qwenftmodel/02bc7f5c-dc2f-4d8c-adcb-a89a34ff5549.json
 delete mode 100644 data/hfopenllm_v2/sumink/Qwenmplus/590c031c-2aa6-48e6-9b3f-68b1a585dd39.json
 delete mode 100644 data/hfopenllm_v2/sumink/Qwensci/970c9fb8-c217-444b-a025-f4d9acdd679d.json
 delete mode 100644 data/hfopenllm_v2/sumink/bbhqwen/07a08dd7-822b-49ac-859b-d2fc75b9c88d.json
 delete mode 100644 data/hfopenllm_v2/sumink/bbhqwen2/0c0e9250-b75a-4549-9fb2-2b5c9ac2ef49.json
 delete mode 100644 data/hfopenllm_v2/sumink/bbhqwen3/2ae306b1-5409-4418-b5e4-50feff9dafe7.json
 delete mode 100644 data/hfopenllm_v2/sumink/bbhqwen4/44bf5d75-afb2-48fa-a0fa-96d283b0ae94.json
 delete mode 100644 data/hfopenllm_v2/sumink/bbhqwen5/e3860bb2-b2e4-4fdf-91cb-3343ad6440d7.json
 delete mode 100644 data/hfopenllm_v2/sumink/bbhqwen6/6369fceb-148f-4491-9488-420182a9838f.json
 delete mode 100644 data/hfopenllm_v2/sumink/flflmillama/045c814e-a30f-4b6b-b4f4-382dee4063b7.json
 delete mode 100644 data/hfopenllm_v2/sumink/ftgpt/59d2b375-5696-47d0-9c96-1a826c08bea0.json
 delete mode 100644 data/hfopenllm_v2/sumink/llamaft/ff601b4f-24a1-4376-8c5e-5bda2ea88f65.json
 delete mode 100644 data/hfopenllm_v2/sumink/llamamerge/8c043ba8-f7dd-4cc8-a3b1-7201042b8dc8.json
 delete mode 100644 data/hfopenllm_v2/sumink/llftfl7/ce27dff4-9ca7-47cb-bc18-b5dd167c72a2.json
 delete mode 100644 data/hfopenllm_v2/sumink/llmer/d69ecbfa-5036-48b8-8fed-f9162e2857f5.json
 delete mode 100644 data/hfopenllm_v2/sumink/qwft/b5924329-c182-482a-bee8-22fcb348281d.json
 delete mode 100644 data/hfopenllm_v2/sumink/qwmer/a6a6b6f2-ac28-4c4a-806e-8abe8c7f9190.json
 delete mode 100644 data/hfopenllm_v2/sumink/solarmer3/b904301c-d0c0-41a4-b92e-92b2d7c9c13a.json
 delete mode 100644 data/hfopenllm_v2/sumink/somer/b5de0218-91dc-487a-be90-70f8bcb64803.json
 delete mode 100644 data/hfopenllm_v2/sumink/somer2/3870f65b-3429-45c2-846f-6af30155a78b.json
 delete mode 100644 data/hfopenllm_v2/sumink/somerft/d6c33a51-be09-4cb5-9942-4348668d3e5e.json
 delete mode 100644 data/hfopenllm_v2/sunbaby/BrainCog-8B-0.1-Instruct/1ccd36ee-445a-4861-8835-d602973148fc.json
 delete mode 100644 data/hfopenllm_v2/swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA/4c7ef4ee-3a7e-4f15-8a4a-c5853b1c6a47.json
 delete mode 100644 data/hfopenllm_v2/synergetic/FrankenQwen2.5-14B/6a69202c-1c68-43e4-bd45-bbc2ff2db743.json
 delete mode 100644 data/hfopenllm_v2/talha2001/Beast-Soul-new/a053d6a3-05d4-4d0b-a9b8-7865cf7ac612.json
 delete mode 100644 data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.1-instruct/f76d3d30-4fce-48a9-a26b-7d714fff1d29.json
 delete mode 100644 data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.2-instruct/eb38a092-1b56-4348-8188-baa2243f7046.json
 delete mode 100644 data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/1c4cfb94-fc66-4fe2-9879-78683abe654f.json
 delete mode 100644 data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/2deef730-c37b-46ca-82b7-de38ae724fd4.json
 delete mode 100644 data/hfopenllm_v2/tanliboy/lambda-qwen2.5-14b-dpo-test/13a92beb-a8a4-4853-b2f5-1b09d3e2a64a.json
 delete mode 100644 data/hfopenllm_v2/tanliboy/lambda-qwen2.5-32b-dpo-test/36cf5b59-5369-4baf-80c1-3a47678eb5cb.json
 delete mode 100644 data/hfopenllm_v2/tannedbum/Ellaria-9B/fced3ef1-fb69-47fe-bf68-3efe72db3142.json
 delete mode 100644 data/hfopenllm_v2/tannedbum/L3-Nymeria-Maid-8B/7a83d75a-332e-476a-b0f7-986b2ec9cc5d.json
 delete mode 100644 data/hfopenllm_v2/tannedbum/L3-Nymeria-v2-8B/6f413d72-cd9f-435c-b13e-9cec14edeb5c.json
 delete mode 100644 data/hfopenllm_v2/tannedbum/L3-Rhaenys-8B/a7822bbf-bc23-437d-8e5b-32fb06d3a9ec.json
 delete mode 100644 data/hfopenllm_v2/teknium/CollectiveCognition-v1.1-Mistral-7B/0b19508c-4996-4fb7-b0e0-9fa952854fa3.json
 delete mode 100644 data/hfopenllm_v2/teknium/OpenHermes-13B/447c22c1-8929-420f-b59b-01ab32a22281.json
 delete mode 100644 data/hfopenllm_v2/teknium/OpenHermes-2-Mistral-7B/ab3dbe43-658e-4c8a-a399-b3d070d467ba.json
 delete mode 100644 data/hfopenllm_v2/teknium/OpenHermes-2.5-Mistral-7B/ee5c87a4-aa06-4728-a9bf-2fc35284b987.json
 delete mode 100644 data/hfopenllm_v2/teknium/OpenHermes-7B/6a1a58f6-e399-4ac3-a516-f02a37b6ff68.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v1/9e2bfd77-b73e-436f-ad50-ccfd379cd3f2.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v2/100cf60a-c43c-4b3a-a667-a45cffdd562a.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/lamarckvergence-14b-tensopolis-v1/2088fca7-11d7-47de-808d-d47da0caad0f.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/mistral-small-2501-tensopolis-v1/bf0b3560-9d38-406a-ad30-5fd157f0fe43.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/mistral-small-r1-tensopolis/9ce12fbc-00f7-4cc8-bd9d-67ead83a0801.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/phi-4-tensopolis-v1/14501de3-dac0-44af-8c17-7abcd9bbba8b.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/qwen2.5-14b-tensopolis-v1/c9db8ce4-6f0d-4c13-8484-6fca9e9c3798.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/qwen2.5-3b-or1-tensopolis/8c6c06be-bbc6-4307-ba5b-336dc2bb466f.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v1/1326ff61-d0b4-46eb-9bcf-f978166e622b.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v2/4c9e829f-7a99-4d61-8730-7457215a4fd6.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v1/afc24d42-6d25-4036-8f22-fcf944b481b7.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v2/6f6db681-991e-408b-8d4e-71fff9e1c974.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v1/f3fa76bf-f11c-4dee-9b9f-00f1ec793dac.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v2/77b457d9-4957-4f0d-a8d3-e005ae382239.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/virtuoso-small-v2-tensopolis-v1/11474a7a-73a6-4a3f-8bcb-bef783e12a2b.json
 delete mode 100644 data/hfopenllm_v2/tensoropera/Fox-1-1.6B/23cc1e7f-0994-43a5-8403-5361a2976285.json
 delete mode 100644 data/hfopenllm_v2/tenyx/Llama3-TenyxChat-70B/88c257d3-d5c1-4e1f-bbc8-9fc6bd65e15e.json
 delete mode 100644 data/hfopenllm_v2/theo77186/Qwen2.5-Coder-7B-Instruct-20241106/ec4c2032-8fc0-448a-a7c4-ee9b35b642db.json
 delete mode 100644 data/hfopenllm_v2/theprint/Boptruth-Agatha-7B/3c7ac4de-1456-4afb-b7ac-07beb6cb4d39.json
 delete mode 100644 data/hfopenllm_v2/theprint/CleverBoi-7B-v2/a06ad94f-13ee-466c-b25f-87cd87012678.json
 delete mode 100644 data/hfopenllm_v2/theprint/CleverBoi-7B-v3/9e1ca6d0-d2b2-48c5-acc2-ad299ce02e1f.json
 delete mode 100644 data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-Instruct/7dcd6e37-3685-4b08-b983-b2a711aeaf73.json
 delete mode 100644 data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-v2/b1ae6801-0139-41d3-85dc-102ad5cc4c6a.json
 delete mode 100644 data/hfopenllm_v2/theprint/CleverBoi-Nemo-12B-v2/4cc037a2-d952-4566-a575-015f8e3a5925.json
 delete mode 100644 data/hfopenllm_v2/theprint/Code-Llama-Bagel-8B/a1eaadae-8601-4c18-ab0c-4f6d80d3307b.json
 delete mode 100644 data/hfopenllm_v2/theprint/Conversely-Mistral-7B/40e452df-8f0a-4473-a3d1-41f9c288c12f.json
 delete mode 100644 data/hfopenllm_v2/theprint/Llama-3.2-3B-VanRossum/216020ac-276b-436e-815b-d6968eb83770.json
 delete mode 100644 data/hfopenllm_v2/theprint/ReWiz-7B/1bb4aeac-a5e1-4fd7-9e70-64fdcfc600cd.json
 delete mode 100644 data/hfopenllm_v2/theprint/ReWiz-Llama-3.1-8B-v2/25739611-f690-41b4-87de-9f4ea8b3d815.json
 delete mode 100644 data/hfopenllm_v2/theprint/ReWiz-Llama-3.2-3B/b8c27fdd-5b35-41ab-8a35-b5a48f27cceb.json
 delete mode 100644 data/hfopenllm_v2/theprint/ReWiz-Nemo-12B-Instruct/fa237949-c3ac-482a-8a54-5a2019f24016.json
 delete mode 100644 data/hfopenllm_v2/theprint/ReWiz-Qwen-2.5-14B/b60dd828-a3e7-46a8-b4c2-322aeca42faf.json
 delete mode 100644 data/hfopenllm_v2/theprint/ReWiz-Worldbuilder-7B/5de9f914-333f-4181-a93f-79257a3daf54.json
 delete mode 100644 data/hfopenllm_v2/theprint/RuDolph-Hermes-7B/e2d23da4-226a-4a02-8390-e8edaea4b65b.json
 delete mode 100644 data/hfopenllm_v2/theprint/WorldBuilder-12B/c64c7470-dcf9-46f8-b789-cab7e902739d.json
 delete mode 100644 data/hfopenllm_v2/theprint/phi-3-mini-4k-python/f6d727a3-19dc-4173-a88f-2c47449896aa.json
 delete mode 100644 data/hfopenllm_v2/thinkcoder/llama3-8b-instruct-lora-8-sft/490d14c8-2cb0-4328-9f41-6074b28d6fdc.json
 delete mode 100644 data/hfopenllm_v2/thirdeyeai/elevate360m/9351b079-7ef5-42ec-bb83-f0d8ec7de479.json
 delete mode 100644 data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-1_5B/852d5adb-f422-4102-8114-082ab0b3c07d.json
 delete mode 100644 data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B-0917/c64e98cd-c022-4834-a3e0-3949416d1fb1.json
 delete mode 100644 data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B/f101bd15-ac61-49d4-beac-c89bc889b34b.json
 delete mode 100644 data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2.5-7B-0917/11caf1c1-e2a0-4abb-bb0e-d06853a06e4d.json
 delete mode 100644 data/hfopenllm_v2/tianyil1/MistralForCausalLM_Cal_DPO/f0b57a60-8402-4430-93f3-b846a94113f2.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-10B-Base/50aa8077-4493-47a9-9cec-014c56343ecf.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-10B-Instruct/5e70d00b-c822-4ad6-afe8-3756a7038c57.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-1B-Base/8162ba41-e630-470f-a297-72fb9f2110fd.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-1B-Instruct/60dd9d02-476f-459d-a41c-f89f82116dc3.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-3B-Base/73e89f21-5799-4835-a0e0-a6664c0483da.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-3B-Instruct/7f355ad4-9156-486d-8cf4-723117da3bb8.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-7B-Base/4ccc6026-b639-488d-867f-d98ea49cf1b6.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-7B-Instruct/3cf2e68e-4de0-436e-935e-86935e11f72f.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Base/e9e4ae5d-0dd1-463c-9f15-47cb21efb409.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Instruct/c57eb23a-5998-4ab9-9a98-39b1338f5ba6.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/falcon-11B/94fb625d-f58c-4f2e-8268-1dc4472c1cce.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/falcon-40b-instruct/4481ddef-2bef-4284-b56d-21054f5a9a97.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/falcon-40b/80048c4b-e97b-45c7-aa04-70ce69481a97.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/falcon-7b-instruct/d21a2557-2348-4087-b2a6-6e1c0101bccc.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/falcon-7b/76290d4b-5526-400b-8ca4-24d220f7c02d.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/falcon-mamba-7b/3a146535-09b3-4246-8bd8-0e984e0905b1.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/BiBo-v0.3/6683f95c-f97f-4117-b3c5-c1ed9587289e.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/BiBo-v0.7/bbe74b2b-9e13-4c13-92c8-618078667248.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/ShawtyIsBad-bgem3/61876ce3-acc4-4619-b0c2-78ac4dff48ea.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/ShawtyIsBad-e5-large/b304baee-c9de-4982-801d-2b9e7f1a7334.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/ShawtyIsBad-ib/6f27e746-1bdd-4cec-a955-c27f2f9900ef.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic-moe/30637c5d-1bc0-49dc-8afd-335a9a66f196.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic1.5/169e29b6-50d8-456d-aa20-3fe2f3b19a1e.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/SigmaBoi-base/427d32f7-190b-4005-b02c-6a8ce089dbbf.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/SigmaBoi-bge-m3/de7551a8-63b1-4de3-899f-9d98cb985005.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/SigmaBoi-bgem3/eff6f456-906d-4320-8e6f-667fbbf0574a.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/SigmaBoi-ib/6cbd9a3a-7e06-4eee-af9e-6db4ff35c36a.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/SigmaBoi-nomic-moe/7e3d3803-c8d4-4025-8d12-c4c29c49c059.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5-fp32/a43a6ca9-3543-44bc-8511-ee5c45552070.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5/83f6fdec-9592-45a1-acdf-0ebbb400c8a4.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/Tamed-Shawty/6e2d4174-303f-437b-9abb-26667b1dd04c.json
 delete mode 100644 data/hfopenllm_v2/tklohj/WindyFloLLM/955e93d0-bec1-483c-b3f0-258e13d5cb16.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/GPT-JT-6B-v1/3065ca79-c5e9-4875-9f81-4231e971d818.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/GPT-NeoXT-Chat-Base-20B/fc7e485f-a416-420b-b43c-e45e502c4a8f.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/LLaMA-2-7B-32K/53e882c6-6eb5-4202-a8d0-3a313556c9f4.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/Llama-2-7B-32K-Instruct/ba715669-c0ed-471f-80a6-b67453fb4930.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Base/316cab27-5cac-4d26-90ae-05d1fc3bd14a.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Chat/d2b0a35a-ea72-42f4-9f71-fffa1480bc22.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Instruct/bf3eabff-fbf7-421c-9e04-548accc7678c.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Base-3B-v1/b7eeedd8-33ef-46b3-a3fb-6ac87247bc4e.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Chat-3B-v1/b1c41abe-e7f6-4229-b776-8ed0b5f91bd4.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Instruct-3B-v1/5b769770-3b63-4863-a723-95212e2be40e.json
 delete mode 100644 data/hfopenllm_v2/tokyotech-llm/Llama-3-Swallow-8B-Instruct-v0.1/f2264b41-efa5-4278-91fd-2f454aa91c61.json
 delete mode 100644 data/hfopenllm_v2/tomasmcm/sky-t1-coder-32b-flash/5c3484b4-6faa-47fd-a1a2-881898450f79.json
 delete mode 100644 data/hfopenllm_v2/trthminh1112/autotrain-llama32-1b-finetune/326b95f8-9eae-4064-a261-077a957e233c.json
 delete mode 100644 data/hfopenllm_v2/tugstugi/Qwen2.5-7B-Instruct-QwQ-v0.1/c1c7336e-b8bf-4a69-a586-c1a224ba8a65.json
 delete mode 100644 data/hfopenllm_v2/universalml/NepaliGPT-2.0/89e55482-b762-4f5d-a021-211048719bdc.json
 delete mode 100644 data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct-no-system-message/81018e12-63f8-4ad8-87c4-181a13202497.json
 delete mode 100644 data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct/5b09e8cb-aaf1-48fd-a2f4-11a8d4bc9a4d.json
 delete mode 100644 data/hfopenllm_v2/unsloth/Phi-3-mini-4k-instruct/8b344f21-9038-4b15-aba8-308aa62e4b39.json
 delete mode 100644 data/hfopenllm_v2/unsloth/phi-4-bnb-4bit/68ca8f7c-88c2-4ede-bcb7-d4ae23429d8f.json
 delete mode 100644 data/hfopenllm_v2/unsloth/phi-4-unsloth-bnb-4bit/df557f25-5505-49dd-a0cb-88fff601c6e2.json
 delete mode 100644 data/hfopenllm_v2/unsloth/phi-4/a50bf387-bf34-490f-979a-b6217a85a1bd.json
 delete mode 100644 data/hfopenllm_v2/upstage/SOLAR-10.7B-Instruct-v1.0/89264aa0-3bed-41d3-b171-2a5434cc990f.json
 delete mode 100644 data/hfopenllm_v2/upstage/SOLAR-10.7B-v1.0/a3272caf-a292-4dc7-8932-636a4099ca6b.json
 delete mode 100644 data/hfopenllm_v2/upstage/solar-pro-preview-instruct/c4ade77e-628f-457d-bbe1-3e5a0cb19d04.json
 delete mode 100644 data/hfopenllm_v2/utkmst/chimera-beta-test2-lora-merged/b030646c-5f5c-43ab-bbc4-405f82992265.json
 delete mode 100644 data/hfopenllm_v2/uukuguy/speechless-code-mistral-7b-v1.0/399e516c-d8c8-4511-a746-76c81f72b36a.json
 delete mode 100644 data/hfopenllm_v2/uukuguy/speechless-codellama-34b-v2.0/bd8e4424-7903-43e7-8105-269de734582e.json
 delete mode 100644 data/hfopenllm_v2/uukuguy/speechless-coder-ds-6.7b/9126e939-3a87-4774-9606-084c5b56e933.json
 delete mode 100644 data/hfopenllm_v2/uukuguy/speechless-instruct-mistral-7b-v0.2/be2ef197-738e-422d-9a88-cafd124584b7.json
 delete mode 100644 data/hfopenllm_v2/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/ee22e6c5-8529-4987-86d0-4abf3b525f90.json
 delete mode 100644 data/hfopenllm_v2/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b/50f0ddc2-fccd-447c-ab50-a086ccb4cd3a.json
 delete mode 100644 data/hfopenllm_v2/uukuguy/speechless-zephyr-code-functionary-7b/83294141-a70f-40da-b3f8-21b367098cce.json
 delete mode 100644 data/hfopenllm_v2/v000000/L3-8B-Stheno-v3.2-abliterated/303ae3d2-fdf5-404d-83ca-8e6071e13e6b.json
 delete mode 100644 data/hfopenllm_v2/v000000/L3.1-Niitorm-8B-DPO-t0.0001/1b13d76d-259f-41f2-baba-ce96ef0cb937.json
 delete mode 100644 data/hfopenllm_v2/v000000/L3.1-Storniitova-8B/b644a420-0a70-4b3d-9a5a-ff91911c857b.json
 delete mode 100644 data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-1e-Delta/33aaa60f-eb69-4d36-917c-6862121a223e.json
 delete mode 100644 data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-Instruct-Slerpeno/a1d2e571-6de0-4bd7-bdcf-8b3921b450f6.json
 delete mode 100644 data/hfopenllm_v2/v000000/Qwen2.5-Lumen-14B/ad93274e-3ca0-40cb-9f65-e6e6c66a8008.json
 delete mode 100644 data/hfopenllm_v2/vhab10/Llama-3.1-8B-Base-Instruct-SLERP/b8043d04-c3ab-4d6a-97eb-44b195a52710.json
 delete mode 100644 data/hfopenllm_v2/vhab10/Llama-3.2-Instruct-3B-TIES/c6bff6da-382f-4423-ba3a-d987839132e0.json
 delete mode 100644 data/hfopenllm_v2/vhab10/llama-3-8b-merged-linear/f3574ad1-a6d7-47fb-86e7-69c256452dea.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/CarbonBeagle-11B-truthy/f2e47267-6c40-4d70-8420-295c95b318f3.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/CarbonBeagle-11B/395f246e-34c6-40e6-bfeb-b047aa12cf90.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/Configurable-Hermes-2-Pro-Llama-3-8B/3a91f8bb-c132-45b3-b8b4-d2ecc9f03f3a.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/Configurable-Llama-3.1-8B-Instruct/97c92043-9bed-460a-8d7b-70ab3584c75b.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/Configurable-Yi-1.5-9B-Chat/ab2ce171-bfcf-49ea-a341-2a52b2bd803a.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/ConfigurableBeagle-11B/f9bbd9cc-dc6a-466f-b777-eaea4a15b874.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/ConfigurableHermes-7B/cd0aefa3-b0c9-4683-872f-f9f9d285e6c3.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/ConfigurableSOLAR-10.7B/c42db2ab-dbc4-48e4-9c16-7b8a5f8492c3.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/Humanish-RP-Llama-3.1-8B/1b32c387-97a7-42ff-892c-d3bacebbf050.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/Merge-Mistral-Prometheus-7B/cbea057c-b0f9-48ac-a075-eb28ebbaf358.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/Merge-Mixtral-Prometheus-8x7B/0b1bb876-9dc7-47d5-855a-f028fb7f2df6.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/Roleplay-Llama-3-8B/a86678ad-344c-430f-80c7-02d634b0cd5b.json
 delete mode 100644 data/hfopenllm_v2/viettelsecurity-ai/security-llama3.2-3b/827f3236-74fa-432b-8177-8785ac25ad76.json
 delete mode 100644 data/hfopenllm_v2/vihangd/smart-dan-sft-v0.1/7f694687-77e5-41d2-923b-f2d5f231729b.json
 delete mode 100644 data/hfopenllm_v2/voidful/smol-360m-ft/daa9d03e-63b0-4c08-ae72-e11041200ac7.json
 delete mode 100644 data/hfopenllm_v2/vonjack/MobileLLM-125M-HF/1539822f-acc4-4dae-9e61-133da97ebcbe.json
 delete mode 100644 data/hfopenllm_v2/vonjack/Phi-3-mini-4k-instruct-LLaMAfied/eec80fda-ce2f-4ef4-94d3-9e7b90f7f2e5.json
 delete mode 100644 data/hfopenllm_v2/vonjack/Phi-3.5-mini-instruct-hermes-fc-json/448cac5f-a7d3-41fb-9b49-666758037eb4.json
 delete mode 100644 data/hfopenllm_v2/vonjack/Qwen2.5-Coder-0.5B-Merged/5d7c5ac1-84c3-4fd1-ac51-4c00ed8c59c7.json
 delete mode 100644 data/hfopenllm_v2/vonjack/SmolLM2-1.7B-Merged/7e1741cc-f9ea-4940-9b6b-d7a515cfce31.json
 delete mode 100644 data/hfopenllm_v2/vonjack/SmolLM2-135M-Merged/ec4d21be-b1a6-47a9-84a4-1a25249c1768.json
 delete mode 100644 data/hfopenllm_v2/vonjack/SmolLM2-360M-Merged/c6b03539-04b3-4ef2-909d-8036a7ea2ae1.json
 delete mode 100644 data/hfopenllm_v2/w4r10ck/SOLAR-10.7B-Instruct-v1.0-uncensored/f156ac38-056e-4ef1-bdbe-e83c299a683b.json
 delete mode 100644 data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp/11d3c8db-300c-4e02-b729-7adba6844ad2.json
 delete mode 100644 data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp0.1/fc75a820-fc0b-4e50-9304-61f0e93795c0.json
 delete mode 100644 data/hfopenllm_v2/wanlige/li-14b-v0.4/bb66896f-799c-4e17-8b54-af5e795699fa.json
 delete mode 100644 data/hfopenllm_v2/wannaphong/KhanomTanLLM-Instruct/30a1a786-7478-401f-85ae-57037ada3d32.json
 delete mode 100644 data/hfopenllm_v2/waqasali1707/Beast-Soul-new/05430b16-07b6-41a1-ade9-6211cdf8ccf1.json
 delete mode 100644 data/hfopenllm_v2/wave-on-discord/qwent-7b/09bc4d5a-f104-4a36-999c-11e2532eef1e.json
 delete mode 100644 data/hfopenllm_v2/weathermanj/Menda-3B-500/a92cfff6-6caf-4bf1-913a-9d7dd2d8d449.json
 delete mode 100644 data/hfopenllm_v2/weathermanj/Menda-3b-750/8972e92c-ebbe-4dc4-8a8c-6f7a42ab5c11.json
 delete mode 100644 data/hfopenllm_v2/weathermanj/Menda-3b-Optim-100/e4f39815-9704-4d0a-8d9b-39359367adcc.json
 delete mode 100644 data/hfopenllm_v2/weathermanj/Menda-3b-Optim-200/f40df456-eb9a-46f8-8fb0-b6ad2748f3c2.json
 delete mode 100644 data/hfopenllm_v2/win10/ArliAI-RPMax-v1.3-merge-13.3B/398996d9-299b-4120-a757-e2fe14e779ee.json
 delete mode 100644 data/hfopenllm_v2/win10/Breeze-13B-32k-Instruct-v1_0/4398633e-77b0-4b61-ae85-29b0e5aad38b.json
 delete mode 100644 data/hfopenllm_v2/win10/EVA-Norns-Qwen2.5-v0.1/1bc60148-512f-4830-b541-f30535cf74bf.json
 delete mode 100644 data/hfopenllm_v2/win10/Llama-3.2-3B-Instruct-24-9-29/a9dfb20a-13e0-4419-a747-7c001b2e9435.json
 delete mode 100644 data/hfopenllm_v2/win10/Norns-Qwen2.5-12B/388e3559-a3b6-4738-9843-9bdd048bae09.json
 delete mode 100644 data/hfopenllm_v2/win10/Norns-Qwen2.5-7B/994a6930-42d5-463a-9e7c-0a3070144211.json
 delete mode 100644 data/hfopenllm_v2/win10/Qwen2.5-2B-Instruct/cce46320-9794-443a-831a-92e2a21515b0.json
 delete mode 100644 data/hfopenllm_v2/win10/llama3-13.45b-Instruct/988f4cc0-ebfb-43a9-8a7f-3dd1f1c1e342.json
 delete mode 100644 data/hfopenllm_v2/win10/miscii-14b-1M-0128/3c675148-5d09-4778-baad-9295ef8cfc79.json
 delete mode 100644 data/hfopenllm_v2/winglian/Llama-3-8b-64k-PoSE/620b80ba-81ab-4504-9f42-4965014f3cd1.json
 delete mode 100644 data/hfopenllm_v2/winglian/llama-3-8b-256k-PoSE/b6c68fc1-c2c1-4cdf-91ef-2007becd7ade.json
 delete mode 100644 data/hfopenllm_v2/wzhouad/gemma-2-9b-it-WPO-HB/19279c18-c2f7-4f75-a9c5-a121b2d4bcff.json
 delete mode 100644 data/hfopenllm_v2/x0000001/Deepseek-Lumen-R1-Qwen2.5-14B/7966789d-8ace-4b39-9093-96bbb8e641d8.json
 delete mode 100644 data/hfopenllm_v2/xMaulana/FinMatcha-3B-Instruct/5e1d849d-0342-4de9-a7d8-dd5cd5960fac.json
 delete mode 100644 data/hfopenllm_v2/xinchen9/Llama3.1_8B_Instruct_CoT/a17563e3-0369-4042-8006-2ec781653f63.json
 delete mode 100644 data/hfopenllm_v2/xinchen9/Llama3.1_CoT/68369110-e371-4112-ae0a-14f7fe9fc40f.json
 delete mode 100644 data/hfopenllm_v2/xinchen9/Llama3.1_CoT_V1/2a6925d3-992f-4c4f-a57b-3eb41062743b.json
 delete mode 100644 data/hfopenllm_v2/xinchen9/Mistral-7B-CoT/28290ea9-9ce5-4605-ac5b-aa2d606994d8.json
 delete mode 100644 data/hfopenllm_v2/xinchen9/llama3-b8-ft-dis/eb2ed6eb-4789-400d-aea5-841547a20cd7.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table/873218a0-7ddb-4287-88ce-8c8214e85c85.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table/e4c32b92-46b4-431a-83f2-11499f587534.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table/a05681a0-07e4-4206-ae89-dee4e9706467.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table/b078f823-d603-4030-81a2-a3ca1a1117f9.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001/26625158-6720-47c7-8c28-46ca7b4b947e.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002/5e3e8dec-f14b-4b7a-ace1-1e1728395e84.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001/35b4378e-52cd-4ae1-985b-c8e2c00dc61a.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002/4d99a55e-39c0-41c7-9ef0-494f739ceaec.json
 delete mode 100644 data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table/f3c7bacd-e231-45fd-b503-ee4d34caf4e8.json
 delete mode 100644 data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table/1bb87d8f-2d66-42b2-a744-1a7cbc2c17dc.json
 delete mode 100644 data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table/ae10fd26-e648-4fa0-ae24-dfaaf4ff510d.json
 delete mode 100644 data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table/0af58746-0492-4ba7-8a17-c0a5c43d0700.json
 delete mode 100644 data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001/88fff9f5-7aa7-463a-87e0-5fd2f5bacf09.json
 delete mode 100644 data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002/bc79527d-ae58-4b17-afd8-df931562dbf3.json
 delete mode 100644 data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001/3e7423d5-ad7e-48e2-bd25-a4946d443c24.json
 delete mode 100644 data/hfopenllm_v2/xukp20/llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table/7979fd6a-a886-41cc-987b-356b7c452bff.json
 delete mode 100644 data/hfopenllm_v2/xwen-team/Xwen-7B-Chat/2be6bc34-1e61-426f-b963-6e096b5418fb.json
 delete mode 100644 data/hfopenllm_v2/xxx777xxxASD/L3.1-ClaudeMaid-4x8B/c4f69339-be6b-4bb4-8faf-a1f40e73d4b0.json
 delete mode 100644 data/hfopenllm_v2/yam-peleg/Hebrew-Gemma-11B-Instruct/c845eb10-a028-4cc2-8f64-25d75480c0d5.json
 delete mode 100644 data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/377e7223-4876-49b6-8057-b1831d7f129b.json
 delete mode 100644 data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/4ddb9ed6-0599-482e-b12e-bcb01975cc85.json
 delete mode 100644 data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B/9d5af106-be69-4b62-99c1-fcfb6091d080.json
 delete mode 100644 data/hfopenllm_v2/yanng1242/Marcoro14-7B-slerp/2f2d7a55-2838-446d-9487-a6cfa0c03356.json
 delete mode 100644 data/hfopenllm_v2/yasserrmd/Coder-GRPO-3B/65d20d45-f63b-4b09-b66d-5f53297c0c20.json
 delete mode 100644 data/hfopenllm_v2/yasserrmd/Text2SQL-1.5B/4712953f-0777-4b97-8f13-f7309f19f0dc.json
 delete mode 100644 data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/84382308-04b5-439f-b486-b26d20da605a.json
 delete mode 100644 data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/e82be06f-14ed-45e8-a273-d28c50f5212b.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table/5815ba55-40fc-4f8e-ae0b-b329c42fd503.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table/e58eceb3-b501-4924-9d0d-98d7da3c16c5.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table/5a88455c-7699-4c49-8a12-76cda15d878c.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table/122b4c1e-6e6c-4db5-8991-b091361c3ecf.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001/6abeb0e4-32ee-4dbb-9902-b19cc96a2aa7.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002/679f214f-e03f-47a9-8a11-91adbf1c4880.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001/680e77b8-9c64-4c52-aa83-55236039cef1.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002/c24c471c-14b3-462e-8b81-6548b27e5ffc.json
 delete mode 100644 data/hfopenllm_v2/yifAI/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002/efa7fa62-2e8b-403c-b345-eef876b48dbd.json
 delete mode 100644 data/hfopenllm_v2/ylalain/ECE-PRYMMAL-YL-1B-SLERP-V8/40bae762-65bd-4b4c-b422-ffd0fd3790a9.json
 delete mode 100644 data/hfopenllm_v2/ymcki/Llama-3.1-8B-GRPO-Instruct/596957cc-719c-44c7-8284-06a9ba0d1a30.json
 delete mode 100644 data/hfopenllm_v2/ymcki/Llama-3.1-8B-SFT-GRPO-Instruct/706bbc09-f867-4327-bc4d-b5ede41ebd93.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18-merge/8962e9be-75bf-4f57-8ce2-b29523740851.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18/014f4838-22ff-4802-a887-4d2de01a9256.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-18-24/5c6eac9c-0ec6-4364-a86b-dcd894d69f0b.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca/09b81cf2-3b79-448c-ab8e-87e378c804bb.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO/28b9977a-db3d-4f38-b1f7-bd0cdcab5504.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17/845ea162-cfa1-47f4-8914-d81d9bf1bb7d.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18-ORPO/706737c7-cd1a-4958-9ffc-2655f0b50178.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18/5acd58cd-8dfb-4fb7-8832-6bc151e0b1a1.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-24/d374a68d-b985-47c2-b087-500bffa93c80.json
 delete mode 100644 data/hfopenllm_v2/yuchenxie/ArlowGPT-3B-Multilingual/23fbceb0-b646-4945-b17f-66dde24a0e43.json
 delete mode 100644 data/hfopenllm_v2/yuchenxie/ArlowGPT-8B/73d9e204-e829-4159-b340-6d9581c6f0e1.json
 delete mode 100644 data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO/a6979dda-fba6-4104-b153-3b0a89de8585.json
 delete mode 100644 data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-dare_ties/62e04968-0c5c-4aad-a434-d9d24bccbdb8.json
 delete mode 100644 data/hfopenllm_v2/yuvraj17/Llama3-8B-abliterated-Spectrum-slerp/bae4064e-b10f-4082-876d-e4168ca1a8cc.json
 delete mode 100644 data/hfopenllm_v2/zake7749/gemma-2-2b-it-chinese-kyara-dpo/0040b48c-0f54-4c9b-97ee-1ca833c68e36.json
 delete mode 100644 data/hfopenllm_v2/zake7749/gemma-2-9b-it-chinese-kyara/6050e969-bcde-4594-8e53-05fa74c7287d.json
 delete mode 100644 data/hfopenllm_v2/zelk12/Gemma-2-TM-9B/3aaee358-bf3e-4d91-91bf-bd42e0a7c61e.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen1-gemma-2-9B/ef5f4fb2-f409-49dc-b3f0-f3e19585cd8a.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen2-GI-gemma-2-9B/4048fa60-7427-4f7e-9939-e270aa5e8b51.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen2-gemma-2-9B/f5c9baea-f2cf-414a-937a-6a43f55a1c1d.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen3-gemma-2-9B/1da70796-d40b-4f2a-8ce3-b304f414a6d5.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen4-gemma-2-9B/de476f79-2539-4f9e-a1d2-901c6c4342d4.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen5-gemma-2-9B/80aee542-c894-46b6-a6ed-9f3400aefa9e.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen6-gemma-2-9B/5c9d4eaf-0985-4f9e-8007-08b4081bb19d.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen6fix-gemma-2-9B/4b019824-8454-4ce8-aa49-d122a2491f9c.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen7-gemma-2-9B/0dfcd13c-f057-4aec-82ad-b5cf2b266502.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Max-Merge_02012025163610-gemma-2-9B/927589bf-f6a0-4155-a24b-120231bbf029.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge-gemma-2-9B/1a2740cb-c541-434e-89a1-7a9fd2c4cabd.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge1-gemma-2-9B/0110d1c9-755e-4f09-888b-0c9c1a263639.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge2-MU-gemma-2-MTg2MT1g2-9B/cda65781-494c-45bd-8c32-7b1fe987f31c.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge2-gemma-2-9B/2fd7de02-f8d9-45c1-9bb5-db5134bd4862.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge3-gemma-2-9B/acf07f51-5acd-4375-bafa-7a1a244db3c6.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge4-gemma-2-9B/ff985193-ba26-45d3-97be-b7d3b17ab4d7.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge5-gemma-2-9B/21dbea2c-5cb1-431c-a496-af9b932b3440.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge6-gemma-2-9B/1143955c-c32c-4b41-8484-2c77e72f4946.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-gemma-2-9B/94824ceb-08c3-415c-8003-b70a0d9af09d.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen1-gemma-2-9B/bf2903cb-b954-4870-98c3-116a96aa49fb.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen2-gemma-2-9B/b089c439-a38c-438d-bdad-1c68a1265d95.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen3-gemma-2-9B/c988815b-50e5-47e4-a418-bbbcdf1eb4a0.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen4-gemma-2-9B/fa11d66c-7ebc-4b81-83b7-d35a4ff23d3f.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen5-IF-gemma-2-S2DMv1-9B/1c81787b-594e-4bb6-aee1-7f193a628b16.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen5-gemma-2-9B/fd9ce37e-d43d-4ec2-94ec-0eb42e3cc685.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen6-gemma-2-9B/0625f09a-3e02-410b-963b-49b83dfc5c8f.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen7-gemma-2-9B/50c1399e-b409-4dff-b4d6-9be01dbb02c7.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Max-Merge_02012025163610-gemma-2-9B/402bdb4a-b258-40a4-ac9f-de74026c02f3.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-gemma-2-9B/65dcf458-db0f-45cd-a8a4-e16108e51161.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Gen1-gemma-2-9B/f1346b1a-0e66-4d80-bfad-ccbe0a8e2abf.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Gen2-gemma-2-9B/11e7b55a-d872-474a-98a6-fc82ce5a863e.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Gen3-gemma-2-9B/19688633-fa6c-412a-8dbc-c16fc49b3276.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Gen4-gemma-2-9B/7d67eb9c-a4d8-4b86-8c24-928ebbe58de7.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Gen5-gemma-2-9B/447f880c-643f-4041-8cdb-87697d798085.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Gen6-gemma-2-9B/653d459e-f8b7-48bc-a9db-779e515532cf.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Gen7-gemma-2-9B/4e56faf6-dbde-4059-b502-32c76bdbed2d.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Max-Merge_02012025163610-gemma-2-9B/f161df97-3cc6-48d3-bfc5-d3f01108ecbb.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-gemma-2-9B/7d08412d-e987-497f-a6ec-ce0affe0f80f.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Gen1-gemma-2-9B/f042f897-cfe8-4d8c-b75b-bbfca44505ea.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Gen2-gemma-2-9B/f24ab334-c022-4e34-a930-3fed6ee18793.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Gen3-gemma-2-9B/2bd3c620-780f-452d-92d7-d01a04539939.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Gen4-gemma-2-9B/234042bd-237f-4cc5-8c5d-1eacd2e8bfaa.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B/d8e0a32e-f307-4056-b450-47a12a0a7b15.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B_v1/9dc3c4f5-8974-4496-8a6e-daa4fe3e3c2a.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Gen6-gemma-2-9B/037787fb-9c61-4c56-a7fc-704c04b519f7.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Max-Merge_02012025163610-gemma-2-9B/5df3dd8f-4921-4916-8163-8651b796e478.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-gemma-2-9B/50463593-3a53-4b3f-9621-d05670309b7e.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT4-Gen1-gemma-2-9B/d7fef356-36c7-488f-8f49-997682a2c01a.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT4-Gen2-gemma-2-9B/42e7abc6-eaa2-4971-90ee-e4d9dbb97ddb.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT4-Gen3-gemma-2-9B/b1cf06a6-d270-41ae-bb9b-443bdc5446f3.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT4-Gen4-gemma-2-9B/e40ea476-bcc5-4d3b-bf8e-e5048d9cbe42.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT4-Gen5-gemma-2-9B/731a5f85-a59e-40af-870c-00e519ca0e7e.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT4-Max-Merge_02012025163610-gemma-2-9B/38d93ae8-90ec-473c-8570-33d52c46770b.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT4-gemma-2-9B/9072fd28-040b-44df-bd58-6e3f59398189.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT5-Gen1-gemma-2-9B/14827e00-09c5-4ebd-93cb-8e026ac73d20.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT5-Gen2-gemma-2-9B/11e76d74-b8e0-408f-b429-566faa5d60a2.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT5-Gen3-gemma-2-9B/944c84d8-231d-47ef-85f4-23c0286a4a02.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT5-Gen4-gemma-2-9B/47c8da1d-8ce3-4d19-b8b8-6b5e68e2e8ab.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT5-Gen5-gemma-2-9B/ca54a8d4-153b-4169-b6ee-133461a9bedd.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT5-Max-Merge_02012025163610-gemma-2-9B/652359ec-14f2-4f94-a694-b7dc98819bfc.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT5-gemma-2-9B/b34f3335-c7a3-431f-b2c8-6f0731a81378.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MTM-Merge-gemma-2-9B/077306f9-5d40-40dc-9df4-b5ca559af5c7.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MTMaMe-Merge_02012025163610-gemma-2-9B/e0f0fe87-8ed3-4398-8683-65aa042d01d9.json
 delete mode 100644 data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25-gemma-2-9B/2d968d3e-a3df-4bdf-86a4-034087c0d7fc.json
 delete mode 100644 data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25Tt0.25-gemma-2-9B/db476911-87fb-433f-b164-4435718dab46.json
 delete mode 100644 data/hfopenllm_v2/zelk12/Rv0.4MT4g2-gemma-2-9B/75a967f6-a8ab-435f-999b-4889e8217dce.json
 delete mode 100644 data/hfopenllm_v2/zelk12/T31122024203920-gemma-2-9B/e072997b-2f79-4d25-b8dc-ebf15ac311e1.json
 delete mode 100644 data/hfopenllm_v2/zelk12/Test01012025155054/6d681a29-0d1a-4054-8250-5246993509f8.json
 delete mode 100644 data/hfopenllm_v2/zelk12/Test01012025155054t0.5_gemma-2/2a6af4ce-e45c-4721-a23c-03071a5e774f.json
 delete mode 100644 data/hfopenllm_v2/zelk12/gemma-2-S2MTM-9B/5ae5ddff-714d-4a20-b1d3-3eeb95fd858c.json
 delete mode 100644 data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25/60052d34-f6a7-4204-baea-532f5ba29880.json
 delete mode 100644 data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75/e1ddd882-f8a1-48d0-bb2a-878f43095895.json
 delete mode 100644 data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1/d2c3edec-38d8-48e3-9f6d-e26a63442af8.json
 delete mode 100644 data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.2/dcfafe94-dacb-4e7a-9365-8bb39ecb79ec.json
 delete mode 100644 data/hfopenllm_v2/zelk12/recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1/8ca0e602-bf6b-4d15-95c2-a0d47e78ded0.json
 delete mode 100644 data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ifable-9B-v0.1/fc262523-dcde-4b45-80ba-2922e66d42c4.json
 delete mode 100644 data/hfopenllm_v2/zelk12/recoilme-gemma-2-psy10k-mental_healt-9B-v0.1/f8d745da-9867-4348-bace-d8052c3b4025.json
 delete mode 100644 data/hfopenllm_v2/zetasepic/Qwen2.5-32B-Instruct-abliterated-v2/3d410f0f-6b24-4e86-a353-6142c51b1ecc.json
 delete mode 100644 data/hfopenllm_v2/zetasepic/Qwen2.5-72B-Instruct-abliterated/46329fc3-974f-4d04-be9e-ba85b3816efc.json
 delete mode 100644 data/hfopenllm_v2/zhengr/MixTAO-7Bx2-MoE-v8.1/b964d0a4-7c44-4ea2-894e-3e1ca30321e0.json
 delete mode 100644 data/livecodebenchpro/alibaba/qwen3-235b-a22b-thinking-2507/126326f3-6521-45d1-aa14-5c51335c1929.json
 delete mode 100644 data/livecodebenchpro/alibaba/qwen3-30b-a3b/b3f5937a-1489-417b-8162-6c62dea0703d.json
 delete mode 100644 data/livecodebenchpro/alibaba/qwen3-max/f06d6c4c-b2c4-4c48-9702-f0bf08af62c4.json
 delete mode 100644 data/livecodebenchpro/alibaba/qwen3-next-80b-a3b-thinking/809a1503-a161-4532-afd3-fdbd6551eb63.json
 delete mode 100644 data/livecodebenchpro/aliyun/qwen3-next-80b-a3b-thinking/808ca8e4-9b14-48ba-bb39-e3b6a5672c80.json
 delete mode 100644 data/livecodebenchpro/anthropic/claude-3-7-sonnet-20250219/be076445-eb88-49b0-a855-2e0cb1551bab.json
 delete mode 100644 data/livecodebenchpro/anthropic/claude-3.7-sonnet/69210faf-04a8-46d4-b92b-94f2ca521c09.json
 delete mode 100644 data/livecodebenchpro/anthropic/claude-sonnet-4-5-20250929/ed293aa1-f64e-429d-bddf-91a35a4203d1.json
 delete mode 100644 data/livecodebenchpro/ark/ep-20250603132404-cgpjm/2bddd388-5e9a-423e-8767-37d6f9f69032.json
 delete mode 100644 data/livecodebenchpro/bytedance/doubao-seed-1-6-thinking-250615/bfd991ca-13e9-4716-b389-11e0d2afe286.json
 delete mode 100644 data/livecodebenchpro/deepseek/chat-v3-0324/b29b7c8e-759e-45fe-a9d3-1054f19af617.json
 delete mode 100644 data/livecodebenchpro/deepseek/ep-20250214004308-p7n89/801d2dc6-17e7-47f1-a54f-87b94a59b508.json
 delete mode 100644 data/livecodebenchpro/deepseek/ep-20250228232227-z44x5/def0b2e3-cf5f-4dfd-8f1c-827f98d1626a.json
 delete mode 100644 data/livecodebenchpro/deepseek/ep-20250603132404-cgpjm/157dd68b-fcc2-416f-a2c0-c9781020e6af.json
 delete mode 100644
data/livecodebenchpro/google/gemini-2.5-flash/174f0e23-84f1-43d0-bcdf-11b83c37025a.json delete mode 100644 data/livecodebenchpro/google/gemini-2.5-pro/bef7254b-549f-4e6b-b5c8-31b84dc6acda.json delete mode 100644 data/livecodebenchpro/kuaishou/kwaipilot-40b-0604/aa236b03-b81f-431b-b049-7101cea165f2.json delete mode 100644 data/livecodebenchpro/meta/llama-4-maverick/abc37028-a362-4e02-8499-1bb7497e0293.json delete mode 100644 data/livecodebenchpro/openai/gpt-4.1/ba46ef91-d157-4984-b3df-ce33d8d97f8e.json delete mode 100644 data/livecodebenchpro/openai/gpt-4o-2024-11-20/e70acf51-30ef-4c20-b7cc-51704d114d70.json delete mode 100644 data/livecodebenchpro/openai/gpt-5-2025-08-07/0e57aa1f-48c6-42b7-9aee-43a29d21b83f.json delete mode 100644 data/livecodebenchpro/openai/gpt-5-2025-08-07/de66cc70-b456-4165-a827-5193dd77e84d.json delete mode 100644 data/livecodebenchpro/openai/gpt-5.2-2025-12-11/e9139c52-ada0-4d1c-ae82-7852aacdb6ea.json delete mode 100644 data/livecodebenchpro/openai/gpt-oss-120b/1dd8c827-72af-4c8f-9ead-989de7105590.json delete mode 100644 data/livecodebenchpro/openai/gpt-oss-20b/ead39f61-b408-42b2-808f-8421a3200c89.json delete mode 100644 data/livecodebenchpro/openai/o3-2025-04-16/f96bdb35-4d61-4fde-8d91-edf55f13dc03.json delete mode 100644 data/livecodebenchpro/openai/o4-mini-2025-04-16/5516f77c-932a-4eaa-ac31-dda9260ce82d.json delete mode 100644 data/livecodebenchpro/openai/o4-mini-2025-04-16/8992cef5-df7e-40a1-b099-331532c3deb0.json delete mode 100644 data/livecodebenchpro/z-ai/glm-4.5/a77c08d6-a782-440c-b545-c60b6169712d.json delete mode 100644 data/reward-bench/0-hero/Matter-0.1-7B-DPO-preview/623bae1f-19e9-47f9-bc7b-80a859218d07.json delete mode 100644 data/reward-bench/0-hero/Matter-0.1-7B-boost-DPO-preview/fbba98c5-5d56-4837-9044-d4e5ac610c2c.json delete mode 100644 data/reward-bench/Ahjeong/MMPO_Gemma_7b/dc6e1164-c9d7-4dd5-b8dc-fbc4e3f45011.json delete mode 100644 data/reward-bench/Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3/c62a913b-3101-4ce3-a5c5-a1ac844e55f8.json delete mode 100644 data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/3101726d-fd51-436d-8adf-cbdf0d534834.json delete mode 100644 data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/f878a52a-fa80-4113-ae7d-0cb11e3ef9fd.json delete mode 100644 data/reward-bench/Anthropic/claude-3-7-sonnet-20250219/904c6359-bd7b-4448-9f16-bc115d0629c4.json delete mode 100644 data/reward-bench/Anthropic/claude-3-haiku-20240307/49511052-6881-4151-9b46-686c75f73c22.json delete mode 100644 data/reward-bench/Anthropic/claude-3-haiku-20240307/b289e2e6-d57b-4a2b-aa61-e2974d193909.json delete mode 100644 data/reward-bench/Anthropic/claude-3-opus-20240229/aeeca919-71a1-42a0-a6d0-6779d77750e6.json delete mode 100644 data/reward-bench/Anthropic/claude-3-opus-20240229/db29538d-f40e-42d0-b3c0-e622f92112d2.json delete mode 100644 data/reward-bench/Anthropic/claude-3-sonnet-20240229/ab0cdc4f-47dd-4dcc-b506-982ce3924105.json delete mode 100644 data/reward-bench/Anthropic/claude-opus-4-20250514/44da63b6-d934-4330-bc20-33464bae61dd.json delete mode 100644 data/reward-bench/Anthropic/claude-sonnet-4-20250514/c930cbe0-f429-4b61-9abe-86dcb7266cf7.json delete mode 100644 data/reward-bench/AtlaAI/Selene-1-Mini-Llama-3.1-8B/c84b27b2-2dd9-48ee-9a53-ec27ae62ae7a.json delete mode 100644 data/reward-bench/AtlaAI/Selene-1/73ee9408-e669-4b8a-9419-76bd6051ce8d.json delete mode 100644 data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/0deed2f4-770e-4033-a65d-e1da19e00611.json delete mode 100644 
data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/e727cb77-f229-4aaa-909f-99c7aa06676b.json delete mode 100644 data/reward-bench/CohereForAI/c4ai-command-r-plus/da9264cd-2fa3-4121-81de-eef994e15993.json delete mode 100644 data/reward-bench/ContextualAI/LMUnit-llama3.1-70b/79cc5cd4-bfed-466d-9fbe-2f27e8aab175.json delete mode 100644 data/reward-bench/ContextualAI/LMUnit-qwen2.5-72b/28c35831-679d-489a-b2c4-fd2c7f333fbc.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-dpo_llama13b/9db7907d-7b22-480c-86a5-f88ec2b302e7.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-dpo_llama30b/2faddf79-41e6-47e9-9c26-17bc987bc870.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-dpo_llama7b/20989a47-6556-4e3b-8909-d0a419cb159b.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-dpo_pythia1-4b/f3d0010f-efed-4f87-9582-b9c87b4de99a.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-dpo_pythia12-0b/a0ce3ed6-2a2c-46ad-be86-6f6701533e36.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-dpo_pythia2-8b/d54c4830-23c8-4c12-aea1-4f5b5245464f.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-dpo_pythia6-9b/b5853278-edd9-4bc8-bbeb-d6dab515b562.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-kto_llama13b/74188e30-1e49-47d8-af01-b80e430dafa0.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-kto_llama30b/93974286-0497-46a2-a2e8-404c1e89dba0.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-kto_llama7b/02c0020c-7d69-4701-a606-4bc79ad87afd.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-kto_pythia1-4b/5dcb7c54-64e7-4f76-8903-8f57b35cdb0c.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-kto_pythia12-0b/4887256e-0545-40dd-9756-ff850e003a29.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-kto_pythia2-8b/d2b70870-9cbc-4666-bbd4-097fcebe716e.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-kto_pythia6-9b/f420f432-2291-40a9-8ebd-b91241970113.json delete mode 100644 data/reward-bench/Databricks-Mosaic-Research/PGRM/02e68d1b-86f3-4344-ad8d-45df878b744c.json delete mode 100644 data/reward-bench/HFXM/RAMO-Llama3.1-8B/f712ab4a-1127-44ba-b6b9-7a40290f3322.json delete mode 100644 data/reward-bench/HuggingFaceH4/starchat2-15b-v0.1/b4175f0f-f9f4-4418-b4aa-a31e7f1f93f4.json delete mode 100644 data/reward-bench/HuggingFaceH4/zephyr-7b-alpha/9879e9a7-ddbc-4338-abc7-e3bc394869e9.json delete mode 100644 data/reward-bench/HuggingFaceH4/zephyr-7b-beta/d7d8a5cb-e295-4ced-b528-d99d814ff008.json delete mode 100644 data/reward-bench/HuggingFaceH4/zephyr-7b-gemma-v0.1/bff86a1f-71c3-4f27-aeae-bba6d03635ef.json delete mode 100644 data/reward-bench/IDEA-CCNL/Ziya-LLaMA-7B-Reward/723281f8-54b7-4db6-8253-5a6dcf4f3d4a.json delete mode 100644 data/reward-bench/LxzGordon/URM-LLaMa-3-8B/0ce7dc54-f608-4985-9904-75cee09b6288.json delete mode 100644 data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/5bb0aaa4-2cc5-4622-8235-993bc4178f12.json delete mode 100644 data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/85ab22b8-0587-4e2b-857f-3d6d84d571a4.json delete mode 100644 data/reward-bench/NCSOFT/Llama-3-OffsetBias-8B/37aa6702-b2fa-43bf-b5a9-36740f627217.json delete mode 100644 data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/57f48d0c-e424-410d-b9ee-4707e2add036.json delete mode 100644 data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/8643b4dd-e18c-442c-adb5-84ef756534f8.json delete mode 100644 
data/reward-bench/Nexusflow/Starling-RM-34B/2f3d2e46-1f9e-4b1c-9729-ab0a93cc245c.json delete mode 100644 data/reward-bench/Nexusflow/Starling-RM-34B/4aec78d3-a38c-48e0-b9e2-b6dc063bd37e.json delete mode 100644 data/reward-bench/NousResearch/Hermes-3-Llama-3.1-70B/f9b60945-8b14-4564-9d44-3eb6db675ab9.json delete mode 100644 data/reward-bench/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/56703c11-eccb-4f66-af13-60f972a5068f.json delete mode 100644 data/reward-bench/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/fbd8be7e-5670-4729-a77d-83472510b734.json delete mode 100644 data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/2e18ee77-9c46-4cf9-9521-303ad15e5be4.json delete mode 100644 data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/ec5b296e-03e8-4371-a8c1-eca0b0b9759d.json delete mode 100644 data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/07b61a55-a8e3-4a6f-9806-a4100f8d5297.json delete mode 100644 data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/3d534c25-5016-44de-9c47-24b7d7399b0f.json delete mode 100644 data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/4de91433-05b3-4f88-9d0f-66691c671f62.json delete mode 100644 data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/dc71f1ba-f4b8-4231-ac72-0acf9a22d73e.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/36c4adc9-c2fb-4bc3-81ba-88478d30332e.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/f0827b15-20d0-4986-b5a0-bb4bc9be768e.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/aeaa8b33-e327-4c65-9641-5dfc63feee3b.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/c97c79f3-fd92-49db-9131-5e45834a7eaf.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/687099cb-c1bf-49ec-a902-329c2b818369.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/8da4f5eb-6264-4503-b9bc-fcf843b638be.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/28a68b87-5412-4374-9e61-896b0fff7669.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/3209c869-03c5-4801-8e4b-4c8bcde3d58f.json delete mode 100644 data/reward-bench/PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022.../9d1e124c-e133-41d3-8ac7-5c8c5027aa02.json delete mode 100644 data/reward-bench/Qwen/Qwen1.5-0.5B-Chat/633d499b-58bd-4fca-9b56-0f005a5a21b8.json delete mode 100644 data/reward-bench/Qwen/Qwen1.5-1.8B-Chat/5c4f3caf-6af3-48c6-83e2-4710d31e6acf.json delete mode 100644 data/reward-bench/Qwen/Qwen1.5-14B-Chat/77d1edc1-fb54-4371-bf7c-baebbb351163.json delete mode 100644 data/reward-bench/Qwen/Qwen1.5-4B-Chat/e7eecdb0-bc17-4d9f-b3e8-9ee777d2f595.json delete mode 100644 data/reward-bench/Qwen/Qwen1.5-72B-Chat/3f3915b3-0d6e-451c-9185-fa4372b93f2b.json delete mode 100644 data/reward-bench/Qwen/Qwen1.5-7B-Chat/e534d37b-3009-4a7d-82d8-d7c85b95649e.json delete mode 100644 data/reward-bench/Qwen/Qwen1.5-MoE-A2.7B-Chat/bd8f0ed1-75fc-48c1-996e-655d205c027c.json delete mode 100644 data/reward-bench/Qwen/WorldPM-72B/e9effaf6-e48b-4b35-b035-430be81b316b.json delete mode 100644 data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-32B/d2132eea-eb88-41e5-b8e6-2e8e8a623ed1.json delete mode 100644 data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-7B/ffd05bc7-3724-40ba-85b9-c25ebe71fba2.json delete mode 100644 data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/43f0e93d-f0b8-46af-a549-e1ac315d96ea.json delete mode 100644 
data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/9ccab7bd-d2ed-4ab3-ad81-656650c29a3b.json delete mode 100644 data/reward-bench/RLHFlow/LLaMA3-iterative-DPO-final/c10d4213-f1fa-41e6-92d9-0d5337c1362b.json delete mode 100644 data/reward-bench/RLHFlow/RewardModel-Mistral-7B-for-DPA-v1/63b08ba0-eeb9-48ae-a5d1-d7d3792aa1c0.json delete mode 100644 data/reward-bench/RLHFlow/pair-preference-model-LLaMA3-8B/d724076d-509f-4ad4-894c-976b0472de85.json delete mode 100644 data/reward-bench/Ray2333/GRM-Gemma-2B-rewardmodel-ft/54d34f25-1cd9-4995-8e56-c36981842fc8.json delete mode 100644 data/reward-bench/Ray2333/GRM-Gemma-2B-sftreg/63ae1c75-fd4d-4f40-afd0-b9f91d700014.json delete mode 100644 data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/1d5ebbce-8cfe-446b-82c0-a227d4e9247f.json delete mode 100644 data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/3f9c81ac-5c76-43b4-a27d-7eaa055139c4.json delete mode 100644 data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/680098fb-76cf-47b6-a0ea-a1a06ca46dca.json delete mode 100644 data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/6ec21338-9908-4ce4-a1f2-dac14c5e27ab.json delete mode 100644 data/reward-bench/Ray2333/GRM-llama3-8B-distill/592ad1e3-8a48-4c39-8013-81d7c731780f.json delete mode 100644 data/reward-bench/Ray2333/GRM-llama3-8B-distill/5b36f0af-7ff6-4564-9714-08fbf41d261f.json delete mode 100644 data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/04f120c6-b648-4c83-81d8-05118efb0904.json delete mode 100644 data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/c907e494-ab2e-4a28-a28d-aeb68eb818ed.json delete mode 100644 data/reward-bench/Ray2333/GRM-llama3.2-3B-rewardmodel-ft/d9eed240-ebbe-482f-8dae-c5251ed6d067.json delete mode 100644 data/reward-bench/Ray2333/Gemma-2B-rewardmodel-baseline/670865e1-f219-465b-9fbe-6da6f73ac9e6.json delete mode 100644 data/reward-bench/Ray2333/Gemma-2B-rewardmodel-ft/88953298-b63e-499f-a31e-f0f586c4772d.json delete mode 100644 data/reward-bench/Ray2333/reward-model-Mistral-7B-instruct-Unifie.../3acb690c-ffc0-4e67-8ae1-e79bcee4f824.json delete mode 100644 data/reward-bench/SF-Foundation/TextEval-Llama3.1-70B/6ad2cb6a-f9a3-424e-aed2-9493899872e3.json delete mode 100644 data/reward-bench/SF-Foundation/TextEval-OffsetBias-12B/1892bf75-916b-4d4f-96ab-fda36872ae5d.json delete mode 100644 data/reward-bench/Salesforce/SFR-LLaMa-3.1-70B-Judge-r/e06e1863-c28f-4c96-a672-b1073c80aa71.json delete mode 100644 data/reward-bench/Salesforce/SFR-LLaMa-3.1-8B-Judge-r/d923f7aa-a9d4-406a-b5d7-bdab508f04f7.json delete mode 100644 data/reward-bench/Salesforce/SFR-nemo-12B-Judge-r/5c5e40b1-e86a-4d30-b93c-f8f9e73cdca8.json delete mode 100644 data/reward-bench/Schrieffer/Llama-SARM-4B/59299d8c-e468-490f-8a52-eef49b0aaeea.json delete mode 100644 data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/3ce9612f-9b57-476e-9fa4-6e63f14568a7.json delete mode 100644 data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/9c605bf1-2533-43db-a610-e71c0aaecdb5.json delete mode 100644 data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-70B/c289f778-92b8-44df-a079-3bced33c8ab5.json delete mode 100644 data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-8B/329d4101-e740-490c-9fbc-1708f76a2f61.json delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/3e87f52e-b136-4cb3-8cbb-d8d8a8571051.json delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/62b9adca-db38-46c0-a68a-ed7a8e735035.json delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/4d2f43eb-e6f3-4686-a9d9-6b6c6b68b86c.json delete mode 
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/830df3fd-d479-4af8-a92b-93d82e804fec.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/0e6d85b8-aa37-448c-adb2-0da2bd13e322.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/45f0bd9c-e939-4b83-a623-1db61f431500.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/0f710903-7dd8-44ea-914d-d43bbfe894f1.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/b9ddd960-f6f7-4962-8297-88ec7fbbbd1f.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.1-8B/25a4520b-c780-45fc-a00f-36db1776c6a8.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-1B/96d7e5c1-2f43-4f09-9702-0af090afa141.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-3B/5a47f8bd-401a-4b6b-91b0-9593b36e5996.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-0.6B/c27e98d4-f5ea-48f9-babc-3ccda2d21d2a.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-1.7B/060bf847-e7b5-4e30-934f-5306d01c499a.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-4B/e648e6c2-18bb-49d7-b08f-47ce41a67d4f.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-8B/537e92cb-25db-47f5-916a-6f666e14639a.json
delete mode 100644 data/reward-bench/Skywork/Skywork-VL-Reward-7B/e59ca33f-c6ce-44d4-9cb4-2fd65608313b.json
delete mode 100644 data/reward-bench/Skywork/Skywork-VL-Reward-7B/fc99848b-82c7-459e-8327-1867a332ff28.json
delete mode 100644 data/reward-bench/SultanR/SmolTulu-1.7b-RM/357f4f03-9542-495f-b575-4274111bbe1f.json
delete mode 100644 data/reward-bench/ZiyiYe/Con-J-Qwen2-7B/d78c42d6-fc0d-4719-bbb6-7a53dbb0d017.json
delete mode 100644 data/reward-bench/ai2/llama-2-chat-7b-nectar-3.8m.json/c94ddbe5-2bc0-4a33-b06b-10671fb22b70.json
delete mode 100644 data/reward-bench/ai2/llama-2-chat-nectar-180k.json/cc2ac405-1710-46fa-aeba-dd86797c666c.json
delete mode 100644 data/reward-bench/ai2/llama-2-chat-ultrafeedback-60k.jsonl/49fcb3e2-2883-4c3d-b519-d511c6b10162.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../0ba5ce6c-f311-4b02-a67a-d49539119a8e.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../49029c9e-a831-4219-8e26-df20862ad3e1.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../6dedd117-eab0-4c31-b50b-4890099d9904.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../71c20c06-efb8-428e-9e9d-e4fedf11041a.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../862f3d57-8f5f-4372-b6fb-876fb35efba4.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../93ea2bfa-e058-42d5-afac-0d3fc50fce91.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c1331fa1-7793-4526-b24b-02261bb4437f.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c3cab72a-47b3-47ec-bb2d-986903ab8c26.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../cd0452a7-0370-4024-a51f-b3deff290db9.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json/6fd85045-d600-451f-8d27-da637add4081.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized.json/a15ca8c3-fd90-4ef9-80c5-40eeac60d785.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0.json/5f43832f-14fa-49e1-a851-949163aec826.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/1f8869e7-e434-469e-906d-d34621582cba.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/8f9d05db-9bb0-4998-bc75-96dbfa695548.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/2681e475-da0a-48a9-ab68-e0bf59240f90.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/e2986d78-100d-417a-9f38-9a570a335d95.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1bc5cd51-5a3a-46ea-bc78-56f9b3081f69.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1d1127ee-7a0e-4915-b8bf-0b22f8ba338b.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/4bb55ff5-5adf-407f-a9d6-910c6c9d2770.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/daebee0b-3856-4270-94c6-c14bd84f5cf5.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1be99417-352e-4a94-8108-b43123553667.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/8d3fbc68-2ee7-4989-a40c-f4a45e579b5c.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/9533891f-c2f7-4e82-9f39-131768dbc28a.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/b8a47660-f0a5-4136-a743-979863c53e3a.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RM/2673bea2-42eb-42a5-9dc2-13d43341c9b2.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/6f5555c2-588a-48d1-811c-be53634bbdef.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/9c96fa7b-52e8-4aed-9fdd-f389091d5e6f.json
delete mode 100644 data/reward-bench/allenai/OLMo-7B-Instruct/0519d9fb-f220-40ab-8257-f20ed98a8b47.json
delete mode 100644 data/reward-bench/allenai/llama-3-tulu-2-70b-uf-mean-rm/ece70375-447f-41e8-aa03-8f4b26abea73.json
delete mode 100644 data/reward-bench/allenai/llama-3-tulu-2-8b-uf-mean-rm/7bbaffdd-f822-48cf-a0f2-e66b16db678d.json
delete mode 100644 data/reward-bench/allenai/llama-3-tulu-2-dpo-70b/27c5c441-64ce-41dd-8384-f84c8f6ccc14.json
delete mode 100644 data/reward-bench/allenai/llama-3-tulu-2-dpo-8b/38a14e6a-2094-4e0b-be22-45181ede2a63.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739590997/cee37c2c-2766-47b7-9192-a141e5d22f2d.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739871066/d1d69392-8717-462d-9ce0-c7ddf5faf97d.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739925892/72071bb1-57c0-4727-8100-ba24d8da10f5.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943850/7626c158-edaf-48f3-9ac3-1188be0c6032.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943881/c37be7a8-dc10-4fea-962b-202986a4581e.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943972/223dc616-b20f-4065-91a7-3c35bfd11c94.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739957701/4236b0a9-9d1e-41f6-8364-a7e8ebf51635.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971507/c8030a87-0cdf-4918-b0d5-d1fb0e284656.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971529/e6ecc1eb-7ff1-46aa-bf03-37bad1b391b7.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739998765/64872b1a-1eae-4171-95ec-a80c782b69f0.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740005072/37484401-c7fe-469d-889a-e70f7cadbf82.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740129284/8cf36288-3add-4fcd-a012-0df9eae2a059.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741286813/f2c8f979-c331-4b9b-b0a7-5efa82c17d3b.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741287363/de409ce8-fb68-4113-8879-23712769cbde.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741292911/264f20d7-1574-448c-8917-eb3f20810819.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742338142/0ebaec42-9190-4326-95dd-5ecb48bf1a72.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519610/29515933-c60b-4686-b475-70ef53d75457.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519628/414174a9-7e44-4f7b-94ce-0757639f5af7.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455/48513083-f854-455e-8455-ddbd2698ec03.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511/0b373560-854f-4482-81d0-6c984e130144.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406/1a021cab-d569-4077-af5e-1643f45de03d.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136/e26e230d-59b3-4243-a6c4-3845ab74b89b.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398/aa0991d0-9c5e-4f94-bc12-3342ca389e99.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535/397abe47-d5e9-487d-b883-ec49db16c584.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054/82f52a35-41b5-4b9c-bb3e-4bf18eed0b92.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271/670382ab-a8a1-43f3-a572-b9a5aeae23ef.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181/a4b3c031-7c01-4f7a-8cfe-52b3260d6ecc.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221/7fcd3fce-2296-4b5c-8362-24b1c70ccb8f.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262/4f164e8b-55a1-498f-b586-cf78da7d0b57.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523/a84d3d61-6e05-4d4d-bc89-7f663e9667fb.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750/7aa98f71-8262-4c1f-a71c-1ef36f2ef04c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427/93398c1f-3129-4be4-83b5-62a4a45c6b84.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446/62493784-f899-4736-bdce-2107ec99a752.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094/9b68ecaa-cf9d-414e-9cf1-c662c765bb5c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636/76f3d0bd-2b71-4406-a0d4-b01b6c91c4ff.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325/2dc5ab6f-2427-42ae-9582-a0e6139f451a.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238/0db97be6-6562-47d8-bd1a-5b469250e54b.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906/228e4dc4-e517-4023-b690-7f0c321286b2.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529/9442b27c-c94d-41c0-a752-3bd82385272d.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305/561039ac-b156-40eb-bf53-21a275b858ca.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778/d801d700-7b4d-4a62-883b-3d85b05385ea.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459/b8f24058-4441-4d19-898e-80470cc7b685.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747/1f372e00-e7a8-43ef-8e14-ef1b08e5e957.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935/0200a1b3-71f1-4633-96a5-4ca9883a67a7.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360/55479901-aec7-4875-b792-ba73b54aa37a.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366/872597b2-4392-4f23-b5b2-41d418b6cf89.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352/5cb437b5-5993-418d-bd9f-81dea71d9edf.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634/c471cdf7-73f9-48c9-a970-baa66b609093.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988/794a71b4-8a43-4c69-a663-369eea6a84a3.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103/2ad22375-4ed8-4be6-a012-a6f6799581e2.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835/a8df0dc2-d16c-4e1a-b0b5-abe2a4a1d803.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221/ca0a010a-fe3a-4b87-8c80-4a8d3e2597fb.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826/5d1c166c-6a22-4afb-b1b1-f7db9ec38bd8.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363/10a432fa-dfef-4c9c-bdf7-ce0f81fd1895.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498/a550663c-2a04-4dfb-8663-b177a7181f3d.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__2__1743897475/72b6196e-0a2b-4ec9-80a3-a7eb14f7be09.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__3__1744311421/5e41f068-f009-4e32-bac1-9de5220a2ce2.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903/eca1331f-6503-481a-b77b-3d96791f54e8.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368/69def7de-a916-4d23-984b-e676e91e1d8c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182/679c6e0b-9e0b-4224-b1e3-59df149739a0.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012/2335433d-37c6-47f0-ad3b-5e0a42e9488f.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765/fe84f8a3-5fe9-4385-b6d4-0436fb7e5197.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527/70d2697e-0df5-40ae-9268-b906c9cabd9d.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236/0a30fd70-2381-4a4b-89aa-dbd169c856f0.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530/b9c787f9-3bcd-4215-a157-7fcfa2df82cc.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417/bdd98f27-fbfd-4de7-bd4e-3b8c3e4e7cc0.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486/44b20109-d534-4aa9-867d-fa59935ef6d0.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745/d1196312-4153-4a38-aa46-2940d63d7924.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661/4b1e3070-04ef-47e7-b720-739320194e7b.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472/247f400e-dca8-4dab-bebf-092f778f02c9.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267/d043ad21-102b-49f0-9e8e-6daef7cc3a2e.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759/d45ec8b8-1ee6-49bb-9237-a7271ba9d13c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905/05a4c6aa-9af2-44f0-8c55-8aeed2e75eaf.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363/a6ef712e-014e-470e-8d5b-f3b51f677aee.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505/35a039ba-06be-4ec2-9bde-a6a6db2eefec.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180/97cb96f8-ce4c-403f-bfbc-386d3c611c81.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187/3a1621e9-75ee-4b34-9c0d-ae15399b1dab.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509/237218ac-4c74-4647-82b1-700360ddfdbd.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498/2858d126-d2ef-4512-8fc8-c39faf24b908.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926/d118ddb1-aafc-4ddf-b5c7-f3ff921bbe0c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661/379ec82f-a6a7-4976-a4a6-ab80cb9da293.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598/c4df42d1-a838-4717-a814-40559fcd7342.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923/f022d826-3252-4def-b37b-3ce44d78f4ce.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628/cecc321b-efbd-434e-8a31-a97bbb8bbb3b.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999/278c2132-3415-48f4-a839-ed09d71e9240.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777/92bbda1a-ecb1-493d-aa39-a29522c1a11e.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638/f43b2dff-9e73-4779-86e0-b2cc30ae8b40.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938/59a98f5d-d017-4b1a-a563-5abd113337e9.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885/a41597ed-fbab-41af-9625-c277ca988546.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773/e311eb59-f217-4bc2-b69b-dcea434797a8.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867/69b037c3-bae2-4889-b10d-e732c45851e9.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424/adeee000-0b62-4a0c-afaa-5e8c5f29ff6d.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395/4464d588-62b2-440b-8188-2450bd7a94c5.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491/bf358648-a41d-43ee-8c14-f8b8eef41871.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787/afd99f12-f739-40d3-aa11-ef3a45316931.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461/49b4a24b-ddf1-47f0-ba39-9366892a1213.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780/ea14a487-39c3-488b-b52b-998e57135487.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489/02f74b6a-7f63-484e-a7c1-0c53bd801b87.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713/e492c59d-4b03-4dce-983e-a8724de35a60.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911/53de0394-8516-4882-b2bc-c7e62e3d8ef0.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412/56d4c1c5-5238-45dc-8331-64a14b830779.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922/7003c9d4-c758-4373-a7a3-04822978bf35.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495/75a7dcb6-789c-49de-b209-4cf7d27465e4.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507/e91d3910-4f20-4e82-b1fb-8605f5d2b8ac.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507/f18bfd44-3097-4eb8-a09c-2372c3ecd738.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917/9ca974b9-c5fb-4fc4-ab3e-1246e31ecdb2.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961/fb1ab5e0-18db-4e5f-add3-2352d9a1f260.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830/60ba1f0d-7e85-49e4-8c73-330d74de6707.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024/29d1c194-8b87-466c-8701-e0fcf267665c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914/31e8f616-7b64-4d1a-b395-20bf8bb4629c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091/cc3f315d-3cea-47e4-83b4-b5045e778c5e.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829/5d20dbf8-bb14-46af-adcd-b7ba05f8352c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050/06f2cb33-3937-4fde-84e2-6b5467f051c6.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916/f35c4efa-3767-4a0e-8769-06230cda2512.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576/6cb65d6a-6c46-4991-8154-f28b101954f6.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619/6e15a49b-7dc4-4d69-965e-cb962c084e4a.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583/9f5591f4-751d-48d3-a348-4bb59f6bb1a3.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604/b609c002-fa0a-46a8-b5a1-9213ee89606c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738/b147fc7f-0e31-49ca-abfd-ba990a925097.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191/e4fbfe23-2b70-459e-821b-db0116d43d8c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737/2ab7dc14-af3e-4fb2-8c0c-fe0e14100321.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138/aca2c665-79f2-4226-b806-307be277ed08.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455/d37a63df-6d38-4083-bf87-11064162efde.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964/16e550cc-e59d-4aaa-b221-8cf71e1b26d2.json
delete mode 100644 data/reward-bench/allenai/tulu-2-dpo-13b/47058e2a-dc41-45f8-8c32-bc496a8d3bc5.json
delete mode 100644 data/reward-bench/allenai/tulu-2-dpo-70b/7199c8b3-8346-4200-b07e-4362ad13a7db.json
delete mode 100644 data/reward-bench/allenai/tulu-2-dpo-7b/de7e59d5-e2ce-4479-bbd9-ab9deb3beed3.json
delete mode 100644 data/reward-bench/allenai/tulu-v2.5-13b-preference-mix-rm/17e011c3-1a53-40ae-b7b4-cb24c23df3de.json
delete mode 100644 data/reward-bench/allenai/tulu-v2.5-13b-uf-rm/1125dd05-2f0d-48ca-825c-f5efa18564aa.json
delete mode 100644 data/reward-bench/allenai/tulu-v2.5-70b-preference-mix-rm/88014e0d-e89b-4fed-9eb6-5276bd7658df.json
delete mode 100644 data/reward-bench/allenai/tulu-v2.5-70b-uf-rm/7cc9bfc2-570d-456c-918f-68fd4b711f05.json
delete mode 100644 data/reward-bench/berkeley-nest/Starling-RM-7B-alpha/77b0957f-8779-4dbe-a6ea-cff50c4ee73b.json
delete mode 100644 data/reward-bench/facebook/Self-taught-Llama-3-70B/ba0ce7ce-a755-4337-bfec-0391680d3625.json
delete mode 100644 data/reward-bench/facebook/Self-taught-evaluator-llama3.1-70B/4eb460eb-b3ad-4e0d-b131-5b59ef54015c.json
delete mode 100644 data/reward-bench/general-preference/GPM-Gemma-2B/6868a1e5-ee86-4f89-8452-5e939ac19169.json
delete mode 100644 data/reward-bench/general-preference/GPM-Llama-3.1-8B/4a151d43-5fac-4afe-9c23-ba0e86a60849.json
delete mode 100644 data/reward-bench/google/flame-1.0-24B-july-2024/5f16d574-adef-4016-abcf-9e7936771ba7.json
delete mode 100644 data/reward-bench/google/gemini-1.5-flash-001/f3e0300f-39ed-4cfd-bd03-218904836037.json
delete mode 100644 data/reward-bench/google/gemini-1.5-flash-8b/42c82c00-b74e-4152-a222-15d481a13e0c.json
delete mode 100644 data/reward-bench/google/gemini-1.5-pro-0514/68096be8-c49f-4a23-824e-1275248369f7.json
delete mode 100644 data/reward-bench/google/gemini-1.5-pro-0924/c91270bd-3731-452a-b429-6cd4943d1194.json
delete mode 100644 data/reward-bench/google/gemini-2.5-flash-preview-04-17/337c7a43-46a7-4acb-b7f1-936e1f2cf46f.json
delete mode 100644 data/reward-bench/google/gemini-2.5-flash/3b00f881-8f73-4608-8cbb-846fe7d1cfea.json
delete mode 100644 data/reward-bench/google/gemini-2.5-pro-preview-05-06/2821dfdc-291b-405e-bd81-cf536c802885.json
delete mode 100644 data/reward-bench/google/gemini-2.5-pro/7d441240-7e85-4776-b51c-3c1bc84456ba.json
delete mode 100644 data/reward-bench/google/gemma-2-27b-it/840d35d9-441e-4ba3-bbc3-1f4ff2627517.json
delete mode 100644 data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/0127f3c5-9657-4eb6-a77a-5a6476a8fc79.json
delete mode 100644 data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/b72e2988-75e4-4d26-9a47-daae4786b02f.json
delete mode 100644 data/reward-bench/infly/INF-ORM-Llama3.1-70B/643cf5a3-8992-4126-87c9-814887314266.json
delete mode 100644 data/reward-bench/infly/INF-ORM-Llama3.1-70B/f81f1f67-6506-481f-87ce-a17a6a7578f3.json
delete mode 100644 data/reward-bench/internlm/internlm2-1_8b-reward/32b35218-a099-410e-8a65-a0d6e2f380a6.json
delete mode 100644 data/reward-bench/internlm/internlm2-1_8b-reward/deec1e7c-0cb8-4e6f-b3ac-d37790b709f3.json
delete mode 100644 data/reward-bench/internlm/internlm2-20b-reward/e42a9986-4dcc-4017-be97-8135646c7424.json
delete mode 100644 data/reward-bench/internlm/internlm2-20b-reward/ffc92063-606a-4f31-bfdd-5683aa748ccc.json
delete mode 100644 data/reward-bench/internlm/internlm2-7b-reward/23a5398c-0911-4a66-930d-abada12bf985.json
delete mode 100644 data/reward-bench/internlm/internlm2-7b-reward/80b0bbcb-a57a-453c-8fff-502646520b1d.json
delete mode 100644 data/reward-bench/jondurbin/bagel-dpo-34b-v0.5/e383c939-b952-4fdd-94e3-eb3716691860.json
delete mode 100644 data/reward-bench/llm-blender/PairRM-hf/daf873f9-ab03-49df-96cb-a0f5a8613048.json
delete mode 100644 data/reward-bench/mattshumer/Reflection-70B/f4cff132-3b2f-4e03-bb49-098b16d87cef.json
delete mode 100644 data/reward-bench/meta-llama/Meta-Llama-3-70B-Instruct/f80685de-058c-4ab8-aa35-dc7321d1cea6.json
delete mode 100644 data/reward-bench/meta-llama/Meta-Llama-3-8B-Instruct/c8e4349d-a084-4eb5-990f-403ba930a9ad.json
delete mode 100644 data/reward-bench/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo/729ca9c0-0680-49f1-97b9-5581be17a352.json
delete mode 100644 data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/fdd4add5-b44d-46f9-8c98-da3120df4161.json
delete mode 100644 data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct/6b5ef643-30dd-4381-b66f-e9ecd6b0d06e.json
delete mode 100644 data/reward-bench/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo/95271b8c-4135-48bf-bbad-ae94baa37640.json
delete mode 100644 data/reward-bench/meta-metrics/MetaMetrics-RM-v1.0/f437e790-efe1-4dc5-8ccc-5b0bfd800069.json
delete mode 100644 data/reward-bench/mightbe/Better-PairRM/7d0f761a-2650-4029-b1e9-13af2f0cc69d.json
delete mode 100644 data/reward-bench/mistralai/Mixtral-8x7B-Instruct-v0.1/49fc601e-4ac6-4672-a53d-0e89f19959c1.json
delete mode 100644 data/reward-bench/my_model/6195e81a-d5a5-40af-96f6-259252009ad7.json
delete mode 100644 data/reward-bench/nicolinho/QRM-Gemma-2-27B/2dec0f50-d374-4af3-9d27-80fcf50dac2c.json
delete mode 100644 data/reward-bench/nicolinho/QRM-Gemma-2-27B/96722888-0cc9-4dfd-b38d-91f4118c0be2.json
delete mode 100644 data/reward-bench/nicolinho/QRM-Llama3-8B/683abc2a-fce0-4d3d-bdcc-5cac2c76a46a.json
delete mode 100644 data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/121344ec-61ef-49c5-a74b-b86f605d513e.json
delete mode 100644 data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/8594f86b-a7f2-4046-a3a7-830d7ac20690.json
delete mode 100644 data/reward-bench/nicolinho/QRM-Llama3.1-8B/c0c5e5e1-801c-48fd-a994-a4a69c0b1213.json
delete mode 100644 data/reward-bench/nvidia/Llama-3.1-Nemotron-70B-Reward/0411ac30-1536-4639-8350-fc11d53298e3.json
delete mode 100644 data/reward-bench/nvidia/Llama3-70B-SteerLM-RM/92281e58-4160-4d76-9119-b38fb47ffd8f.json
delete mode 100644 data/reward-bench/nvidia/Nemotron-4-340B-Reward/43687871-2e19-4d2b-9754-1cb6527496c1.json
delete mode 100644 data/reward-bench/openai/gpt-3.5-turbo-0125/1debe1de-b394-4856-a946-9d14bd867bf6.json
delete mode 100644 data/reward-bench/openai/gpt-4-0125-preview/80c589d2-c1eb-4dcf-8be8-042f4f66b7eb.json
delete mode 100644 data/reward-bench/openai/gpt-4-turbo-2024-04-09/62478772-bb85-4d3f-a916-c3d17db3ee61.json
delete mode 100644 data/reward-bench/openai/gpt-4.1-2025-04-14/a070bae2-c927-418b-91cc-161781c4f5b7.json
delete mode 100644 data/reward-bench/openai/gpt-4.1-mini-2025-04-14/b884c919-a272-4f67-9a09-3d232f56d083.json
delete mode 100644 data/reward-bench/openai/gpt-4.1-nano-2025-04-14/deac33dd-187b-4406-a76a-b33caf417380.json
delete mode 100644 data/reward-bench/openai/gpt-4o-2024-05-13/185bd742-d7d4-4600-86bd-bcda75ed2ebc.json
delete mode 100644 data/reward-bench/openai/gpt-4o-2024-08-06/901e4de6-3ef6-4c2a-873c-cdcc47201974.json
delete mode 100644 data/reward-bench/openai/gpt-4o-2024-08-06/a051d5d6-18e6-483d-a000-4a52a06de676.json
delete mode 100644 data/reward-bench/openai/gpt-4o-mini-2024-07-18/94d77182-8952-4a63-b02b-3d8bd8a8dead.json
delete mode 100644 data/reward-bench/openai/gpt-4o-mini-2024-07-18/9a48d808-0280-4175-a28a-7e9ba8ac6deb.json
delete mode 100644 data/reward-bench/openbmb/Eurus-7b-kto/f0d9f57d-d552-44ea-a91c-751854133316.json
delete mode 100644 data/reward-bench/openbmb/Eurus-RM-7b/561cfba1-856d-4809-b5c7-41481735e1d6.json
delete mode 100644 data/reward-bench/openbmb/Eurus-RM-7b/995d1caf-b735-44dd-adff-875e3203aa46.json
delete mode 100644 data/reward-bench/openbmb/MiniCPM-2B-dpo-fp32/81767043-23c2-4229-b3b5-1c24e470d52a.json
delete mode 100644 data/reward-bench/openbmb/UltraRM-13b/4f6344bc-af30-46f9-b6f8-41ff925d064e.json
delete mode 100644 data/reward-bench/openbmb/UltraRM-13b/abac8640-40be-4eb5-9035-2bf6fd436a7a.json
delete mode 100644 data/reward-bench/opencompass/CompassJudger-1-1.5B-Instruct/6fd972ab-c45f-4ccd-a5cf-4aac5e703342.json
delete mode 100644 data/reward-bench/opencompass/CompassJudger-1-14B-Instruct/8eb1bcf2-a6bd-467c-bc37-090fdb7a9460.json
delete mode 100644 data/reward-bench/opencompass/CompassJudger-1-32B-Instruct/5ad53725-ed5a-41f3-8ff6-7404f3f981db.json
delete mode 100644 data/reward-bench/opencompass/CompassJudger-1-7B-Instruct/ae2d05b4-5e80-4b00-af67-b94609b073eb.json
delete mode 100644 data/reward-bench/prometheus-eval/prometheus-7b-v2.0/592f2811-c197-423e-89d4-e25ee5a324fb.json
delete mode 100644 data/reward-bench/prometheus-eval/prometheus-8x7b-v2.0/17795e7b-e912-440f-a80e-63233d3b6d8c.json
delete mode 100644 data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/375cf55f-64f6-42f6-a947-1487feffb196.json
delete mode 100644 data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/94d2eddd-f7db-4360-ac58-0af39ce66935.json
delete mode 100644 data/reward-bench/stabilityai/stable-code-instruct-3b/996ca604-e01c-4a95-9286-60b6dc04f67d.json
delete mode 100644 data/reward-bench/stabilityai/stablelm-2-12b-chat/b6f0089f-d04b-4bcd-be84-ce3bc0d6c2b9.json
delete mode 100644 data/reward-bench/stabilityai/stablelm-2-zephyr-1_6b/83e15cba-4fec-48f2-9be4-78decbd96f66.json
delete mode 100644 data/reward-bench/stabilityai/stablelm-zephyr-3b/493617c0-37eb-4c83-b175-2507a3647b5e.json
delete mode 100644 data/reward-bench/stanfordnlp/SteamSHP-flan-t5-large/97f494ce-3c9c-4a19-a237-d458be611a0a.json
delete mode 100644 data/reward-bench/stanfordnlp/SteamSHP-flan-t5-xl/f8bf1e92-3cc3-4c7e-9770-485a3074e85f.json
delete mode 100644 data/reward-bench/unknown/Cohere March 2024/5bf73fba-520f-4a2f-9296-8240847eb8ec.json
delete mode 100644 data/reward-bench/unknown/Cohere May 2024/3dd2c89f-64f5-4bbc-a621-791a9f0538b2.json
delete mode 100644 data/reward-bench/unknown/gemini-1.5-flash-8b/ef987556-7277-48d8-ac07-532586773a3a.json
delete mode 100644 data/reward-bench/upstage/SOLAR-10.7B-Instruct-v1.0/add7eddb-7a8b-4c78-9864-c4316a97ce5e.json
delete mode 100644 data/reward-bench/wenbopan/Faro-Yi-9B-DPO/caf02954-1eed-44eb-b5f4-df47c90828d7.json
delete mode 100644 data/reward-bench/weqweasdas/RM-Gemma-2B/00798930-daa2-4e79-82c6-2cccf1c3a0cb.json
delete mode 100644 data/reward-bench/weqweasdas/RM-Gemma-2B/71658cf8-0189-49dc-847f-b9a9b5faee4a.json
delete mode 100644 data/reward-bench/weqweasdas/RM-Gemma-7B-4096/3d506b91-5b0d-47e3-a3a0-bc09808bf5b5.json
delete mode 100644 data/reward-bench/weqweasdas/RM-Gemma-7B/04c71231-2025-4e1a-b7ed-56b245868089.json
delete mode 100644 data/reward-bench/weqweasdas/RM-Gemma-7B/08b2edd0-f8e9-47cd-b19d-53fdc7209917.json
delete mode 100644 data/reward-bench/weqweasdas/RM-Mistral-7B/79a43841-4032-4a20-8b5a-83b4b446d107.json
delete mode 100644 data/reward-bench/weqweasdas/RM-Mistral-7B/a2c16ab8-1098-490a-8d0a-392d835427e0.json
delete mode 100644 data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/0aa12860-7ebe-49c2-a5af-1926d23e34f8.json
delete mode 100644 data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/796d3ec1-9c26-4ead-87cb-4eb866209120.json

diff --git a/data/global-mmlu-lite/alibaba/qwen3-235b-a22b-instruct-2507/c8ab4e94-d8e8-417f-be18-fececf3c815c.json b/data/global-mmlu-lite/alibaba/qwen3-235b-a22b-instruct-2507/c8ab4e94-d8e8-417f-be18-fececf3c815c.json
deleted file mode 100644
index b3b764f48..000000000
--- a/data/global-mmlu-lite/alibaba/qwen3-235b-a22b-instruct-2507/c8ab4e94-d8e8-417f-be18-fececf3c815c.json
+++ /dev/null
@@ -1,515 +0,0 @@
-{
-  "schema_version": "0.2.0",
"global-mmlu-lite/alibaba_qwen3-235b-a22b-instruct-2507/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen3-235b-a22b-instruct-2507", - "id": "alibaba/qwen3-235b-a22b-instruct-2507", - "developer": "alibaba", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Qwen 3 235B A22B Instruct 2506" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8798 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8522 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "uncertainty": { - "confidence_interval": { - "lower": -0.0307, - "upper": 0.0307, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8875, - "uncertainty": { - 
"confidence_interval": { - "lower": -0.031, - "upper": 0.031, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "uncertainty": { - "confidence_interval": { - "lower": -0.0313, - "upper": 0.0313, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0321, - "upper": 0.0321, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "uncertainty": { - "confidence_interval": { - "lower": -0.0324, - "upper": 0.0324, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8875, - "uncertainty": { - "confidence_interval": { - "lower": -0.031, - "upper": 0.031, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "uncertainty": { - "confidence_interval": { - "lower": -0.0324, - "upper": 0.0324, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "uncertainty": { - "confidence_interval": { - "lower": -0.033, - "upper": 0.033, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0327, - "upper": 0.0327, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0321, - "upper": 0.0321, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/anthropic/claude-3-5-haiku-20241022/402c8833-1827-46fc-a497-46b40a6794ff.json b/data/global-mmlu-lite/anthropic/claude-3-5-haiku-20241022/402c8833-1827-46fc-a497-46b40a6794ff.json deleted file mode 100644 index 5bff70d19..000000000 --- a/data/global-mmlu-lite/anthropic/claude-3-5-haiku-20241022/402c8833-1827-46fc-a497-46b40a6794ff.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/anthropic_claude-3-5-haiku-20241022/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "claude-3-5-haiku-20241022", - "id": "anthropic/claude-3-5-haiku-20241022", - "developer": "anthropic", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Claude 3.5 Haiku" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6114 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5834 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6394 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.695, - "uncertainty": { - "confidence_interval": { - "lower": -0.0451, - "upper": 0.0451, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.485, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0459, - "upper": 0.0459, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "uncertainty": { - "confidence_interval": { - "lower": -0.0486, - "upper": 0.0486, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "uncertainty": { - "confidence_interval": { - "lower": -0.0478, - "upper": 0.0478, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6575, - "uncertainty": { - "confidence_interval": { - "lower": -0.0465, - "upper": 0.0465, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0488, - "upper": 0.0488, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - 
"url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655, - "uncertainty": { - "confidence_interval": { - "lower": -0.0466, - "upper": 0.0466, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6575, - "uncertainty": { - "confidence_interval": { - "lower": -0.0465, - "upper": 0.0465, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5225, - "uncertainty": { - "confidence_interval": { - "lower": -0.0489, - "upper": 0.0489, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.485, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "uncertainty": { - "confidence_interval": { - "lower": -0.0453, - "upper": 0.0453, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0462, - "upper": 0.0462, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.69, - "uncertainty": { - "confidence_interval": { - "lower": -0.0453, - "upper": 0.0453, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "uncertainty": { - "confidence_interval": { - "lower": -0.0449, - "upper": 0.0449, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/anthropic/claude-3-7-sonnet-20250219/acd2082a-ce0c-418f-9383-f3c9f11735a2.json b/data/global-mmlu-lite/anthropic/claude-3-7-sonnet-20250219/acd2082a-ce0c-418f-9383-f3c9f11735a2.json deleted file mode 100644 index ec9276c60..000000000 --- a/data/global-mmlu-lite/anthropic/claude-3-7-sonnet-20250219/acd2082a-ce0c-418f-9383-f3c9f11735a2.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/anthropic_claude-3-7-sonnet-20250219/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "claude-3-7-sonnet-20250219", - "id": "anthropic/claude-3-7-sonnet-20250219", - "developer": "anthropic", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Claude 3.7 Sonnet" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8078 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7794 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8362 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.7925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0397, - "upper": 0.0397, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7625, - "uncertainty": { - "confidence_interval": { - "lower": -0.0417, - "upper": 0.0417, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0372, - "upper": 0.0372, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0382, - "upper": 0.0382, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0414, - "upper": 0.0414, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.805, - "uncertainty": { - "confidence_interval": { - "lower": -0.0388, - "upper": 0.0388, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8175, - "uncertainty": { - "confidence_interval": { - "lower": -0.0379, - "upper": 0.0379, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": 
"global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8225, - "uncertainty": { - "confidence_interval": { - "lower": -0.0374, - "upper": 0.0374, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0357, - "upper": 0.0357, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "uncertainty": { - "confidence_interval": { - "lower": -0.0368, - "upper": 0.0368, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "uncertainty": { - "confidence_interval": { - "lower": -0.0412, - "upper": 0.0412, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0386, - "upper": 0.0386, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0382, - "upper": 0.0382, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "uncertainty": { - "confidence_interval": { - "lower": -0.0384, - "upper": 0.0384, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "uncertainty": { - "confidence_interval": { - "lower": -0.0364, - "upper": 0.0364, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0382, - "upper": 0.0382, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/anthropic/claude-opus-4-1-20250805/c65ed336-b283-46c2-8284-c4695cad588d.json b/data/global-mmlu-lite/anthropic/claude-opus-4-1-20250805/c65ed336-b283-46c2-8284-c4695cad588d.json deleted file mode 100644 index 06dce92ac..000000000 --- a/data/global-mmlu-lite/anthropic/claude-opus-4-1-20250805/c65ed336-b283-46c2-8284-c4695cad588d.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/anthropic_claude-opus-4-1-20250805/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "claude-opus-4-1-20250805", - "id": "anthropic/claude-opus-4-1-20250805", - "developer": "anthropic", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Claude Opus 4.1" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.943 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9331 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - 
"evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9528 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "uncertainty": { - "confidence_interval": { - "lower": -0.0223, - "upper": 0.0223, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0219, - "upper": 0.0219, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "uncertainty": { - "confidence_interval": { - "lower": -0.0223, - "upper": 0.0223, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0219, - "upper": 0.0219, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - 
"dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "uncertainty": { - "confidence_interval": { - "lower": -0.0214, - "upper": 0.0214, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "uncertainty": { - "confidence_interval": { - "lower": -0.0223, - "upper": 0.0223, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "uncertainty": { - "confidence_interval": { - "lower": -0.0223, - "upper": 0.0223, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "uncertainty": { - "confidence_interval": { - "lower": -0.025, - "upper": 0.025, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0237, - "upper": 0.0237, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "uncertainty": { - "confidence_interval": { - "lower": -0.0223, - "upper": 0.0223, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "uncertainty": { - "confidence_interval": { - "lower": -0.0223, - "upper": 0.0223, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/anthropic/claude-sonnet-4-20250514/5ebb009d-b548-4f2b-b075-feb76ca295d2.json b/data/global-mmlu-lite/anthropic/claude-sonnet-4-20250514/5ebb009d-b548-4f2b-b075-feb76ca295d2.json deleted file mode 100644 index 0251345d9..000000000 --- a/data/global-mmlu-lite/anthropic/claude-sonnet-4-20250514/5ebb009d-b548-4f2b-b075-feb76ca295d2.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/anthropic_claude-sonnet-4-20250514/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "claude-sonnet-4-20250514", - "id": "anthropic/claude-sonnet-4-20250514", - "developer": "anthropic", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Claude Sonnet 4" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9058 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8913 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9203 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "uncertainty": { - "confidence_interval": { - "lower": -0.0287, - "upper": 0.0287, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "uncertainty": { - "confidence_interval": { - "lower": -0.028, - "upper": 0.028, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - 
"dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "uncertainty": { - "confidence_interval": { - "lower": -0.0294, - "upper": 0.0294, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9025, - "uncertainty": { - "confidence_interval": { - "lower": -0.0291, - "upper": 0.0291, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "uncertainty": { - "confidence_interval": { - "lower": -0.0294, - "upper": 0.0294, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "uncertainty": { - "confidence_interval": { - "lower": -0.028, - "upper": 0.028, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8975, - "uncertainty": { - "confidence_interval": { - "lower": -0.0297, - "upper": 0.0297, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8975, - "uncertainty": { - "confidence_interval": { - "lower": -0.0297, - "upper": 0.0297, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9175, - "uncertainty": { - "confidence_interval": { - "lower": -0.027, - "upper": 0.027, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0304, - "upper": 0.0304, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/cohere/command-a-03-2025/c7df2916-bde4-4987-9139-fcfd18a14ac1.json b/data/global-mmlu-lite/cohere/command-a-03-2025/c7df2916-bde4-4987-9139-fcfd18a14ac1.json deleted file mode 100644 index 8e9ed8546..000000000 --- a/data/global-mmlu-lite/cohere/command-a-03-2025/c7df2916-bde4-4987-9139-fcfd18a14ac1.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/cohere_command-a-03-2025/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "command-a-03-2025", - "id": "cohere/command-a-03-2025", - "developer": "cohere", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Command A " - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - 
"source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8385 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7993 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8778 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0357, - "upper": 0.0357, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "uncertainty": { - "confidence_interval": { - "lower": -0.0345, - "upper": 0.0345, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8225, - "uncertainty": { - "confidence_interval": { - "lower": -0.0374, - "upper": 0.0374, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0357, - "upper": 0.0357, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": 
"url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0362, - "upper": 0.0362, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8421, - "uncertainty": { - "confidence_interval": { - "lower": -0.0358, - "upper": 0.0358, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8546, - "uncertainty": { - "confidence_interval": { - "lower": -0.0346, - "upper": 0.0346, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0362, - "upper": 0.0362, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "uncertainty": { - "confidence_interval": { - "lower": -0.0355, - "upper": 0.0355, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "uncertainty": { - "confidence_interval": { - "lower": -0.035, - "upper": 0.035, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.84, - "uncertainty": { - "confidence_interval": { - "lower": -0.0359, - "upper": 0.0359, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0348, - "upper": 0.0348, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8275, - "uncertainty": { - "confidence_interval": { - "lower": -0.037, - "upper": 0.037, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "uncertainty": { - "confidence_interval": { - "lower": -0.0381, - "upper": 0.0381, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "uncertainty": { - "confidence_interval": { - "lower": -0.0364, - "upper": 0.0364, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8175, - "uncertainty": { - "confidence_interval": { - "lower": -0.0379, - "upper": 0.0379, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/deepseek/deepseek-r1-0528/56ec8ab0-d76d-4c03-953b-a2a4a43af5f4.json b/data/global-mmlu-lite/deepseek/deepseek-r1-0528/56ec8ab0-d76d-4c03-953b-a2a4a43af5f4.json deleted file mode 100644 index b6e9a89cf..000000000 --- a/data/global-mmlu-lite/deepseek/deepseek-r1-0528/56ec8ab0-d76d-4c03-953b-a2a4a43af5f4.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/deepseek_deepseek-r1-0528/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": 
"documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-r1-0528", - "id": "deepseek/deepseek-r1-0528", - "developer": "deepseek", - "inference_platform": "unknown", - "additional_details": { - "display_name": "DeepSeek-R1" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6744 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6672 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6816 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0456, - "upper": 0.0456, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.715, - "uncertainty": { - "confidence_interval": { - "lower": -0.0442, - "upper": 0.0442, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655, - "uncertainty": { - "confidence_interval": { - "lower": -0.0466, - "upper": 0.0466, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0471, - "upper": 0.0471, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0452, - "upper": 0.0452, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0468, - "upper": 0.0468, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655, - "uncertainty": { - "confidence_interval": { - "lower": -0.0466, - "upper": 0.0466, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0458, - "upper": 0.0458, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0411, - "upper": 0.0411, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.6575, - "uncertainty": { - "confidence_interval": { - "lower": -0.0465, - "upper": 0.0465, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.635, - "uncertainty": { - "confidence_interval": { - "lower": -0.0472, - "upper": 0.0472, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7175, - "uncertainty": { - "confidence_interval": { - "lower": -0.0441, - "upper": 0.0441, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0458, - "upper": 0.0458, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "uncertainty": { - "confidence_interval": { - "lower": -0.0412, - "upper": 0.0412, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "uncertainty": { - "confidence_interval": { - "lower": -0.0453, - "upper": 0.0453, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git 
a/data/global-mmlu-lite/deepseek/deepseek-v3.1/ad3211a9-4390-4247-b64d-600191a88a75.json b/data/global-mmlu-lite/deepseek/deepseek-v3.1/ad3211a9-4390-4247-b64d-600191a88a75.json deleted file mode 100644 index 7e8deab0e..000000000 --- a/data/global-mmlu-lite/deepseek/deepseek-v3.1/ad3211a9-4390-4247-b64d-600191a88a75.json +++ /dev/null @@ -1,512 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/deepseek_deepseek-v3.1/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-v3.1", - "id": "deepseek/deepseek-v3.1", - "developer": "deepseek", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8044 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7793 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8295 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.805, - "uncertainty": { - "confidence_interval": { - "lower": -0.0388, - "upper": 0.0388, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0372, - "upper": 0.0372, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8157, - "uncertainty": { - "confidence_interval": { - "lower": -0.0382, - "upper": 0.0382, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0397, - "upper": 0.0397, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8175, - "uncertainty": { - "confidence_interval": { - "lower": -0.0379, - "upper": 0.0379, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7569, - "uncertainty": { - "confidence_interval": { - "lower": -0.0421, - "upper": 0.0421, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7764, - "uncertainty": { - "confidence_interval": { - "lower": -0.0409, - "upper": 0.0409, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0386, - "upper": 0.0386, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.8312, - "uncertainty": { - "confidence_interval": { - "lower": -0.0374, - "upper": 0.0374, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0382, - "upper": 0.0382, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8246, - "uncertainty": { - "confidence_interval": { - "lower": -0.0373, - "upper": 0.0373, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0382, - "upper": 0.0382, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "uncertainty": { - "confidence_interval": { - "lower": -0.0393, - "upper": 0.0393, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7831, - "uncertainty": { - "confidence_interval": { - "lower": -0.0415, - "upper": 0.0415, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8161, - "uncertainty": { - "confidence_interval": { - "lower": -0.0381, - "upper": 0.0381, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": 
[ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0397, - "upper": 0.0397, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/google/gemini-2.5-flash-preview-05-20/1a34326a-f75e-434c-a027-9f8cf7fe8fb9.json b/data/global-mmlu-lite/google/gemini-2.5-flash-preview-05-20/1a34326a-f75e-434c-a027-9f8cf7fe8fb9.json deleted file mode 100644 index 7a051d563..000000000 --- a/data/global-mmlu-lite/google/gemini-2.5-flash-preview-05-20/1a34326a-f75e-434c-a027-9f8cf7fe8fb9.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/google_gemini-2.5-flash-preview-05-20/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemini-2.5-flash-preview-05-20", - "id": "google/gemini-2.5-flash-preview-05-20", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Gemini 2.5 Flash Preview" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9092 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8925 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9259 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "uncertainty": { - "confidence_interval": { - "lower": -0.0287, - "upper": 0.0287, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", 
- "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9225, - "uncertainty": { - "confidence_interval": { - "lower": -0.0262, - "upper": 0.0262, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "uncertainty": { - "confidence_interval": { - "lower": -0.028, - "upper": 0.028, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "uncertainty": { - "confidence_interval": { - "lower": -0.0287, - "upper": 0.0287, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0258, - "upper": 0.0258, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.89, - "uncertainty": { - "confidence_interval": { - "lower": -0.0307, - "upper": 0.0307, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "uncertainty": { - "confidence_interval": { - "lower": -0.0287, - "upper": 0.0287, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0316, - "upper": 0.0316, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - 
"source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "uncertainty": { - "confidence_interval": { - "lower": -0.025, - "upper": 0.025, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9025, - "uncertainty": { - "confidence_interval": { - "lower": -0.0291, - "upper": 0.0291, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/google/gemini-2.5-flash/129c8b21-f97e-4284-9574-33d5932332f7.json b/data/global-mmlu-lite/google/gemini-2.5-flash/129c8b21-f97e-4284-9574-33d5932332f7.json deleted file mode 100644 index ffe8e8eb2..000000000 --- a/data/global-mmlu-lite/google/gemini-2.5-flash/129c8b21-f97e-4284-9574-33d5932332f7.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/google_gemini-2.5-flash/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemini-2.5-flash", - "id": "google/gemini-2.5-flash", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Gemini 2.5 Flash" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9145 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9291 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9325, - "uncertainty": { - "confidence_interval": { - "lower": -0.0246, - "upper": 0.0246, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "uncertainty": { - "confidence_interval": { - "lower": -0.028, - "upper": 0.028, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9025, - "uncertainty": { - "confidence_interval": { - "lower": -0.0291, - "upper": 0.0291, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "uncertainty": { - "confidence_interval": { - "lower": -0.028, - "upper": 0.028, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0258, - "upper": 0.0258, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9225, - "uncertainty": { - "confidence_interval": { - "lower": -0.0262, - "upper": 0.0262, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9175, - "uncertainty": { - "confidence_interval": { - "lower": -0.027, - "upper": 0.027, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/google/gemini-2.5-pro/3644fd67-0f46-4de3-b542-edf219d0e0cd.json b/data/global-mmlu-lite/google/gemini-2.5-pro/3644fd67-0f46-4de3-b542-edf219d0e0cd.json deleted file mode 100644 index 6a19f6916..000000000 --- a/data/global-mmlu-lite/google/gemini-2.5-pro/3644fd67-0f46-4de3-b542-edf219d0e0cd.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/google_gemini-2.5-pro/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemini-2.5-pro", - "id": "google/gemini-2.5-pro", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Gemini 2.5 Pro" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9323 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9241 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - 
"source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9406 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0219, - "upper": 0.0219, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9275, - "uncertainty": { - "confidence_interval": { - "lower": -0.0254, - "upper": 0.0254, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9275, - "uncertainty": { - "confidence_interval": { - "lower": -0.0254, - "upper": 0.0254, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "uncertainty": { - "confidence_interval": { - "lower": -0.025, - "upper": 0.025, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9275, - "uncertainty": { - "confidence_interval": { - "lower": 
-0.0254, - "upper": 0.0254, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0258, - "upper": 0.0258, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.935, - "uncertainty": { - "confidence_interval": { - "lower": -0.0242, - "upper": 0.0242, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0237, - "upper": 0.0237, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9275, - "uncertainty": { - "confidence_interval": { - "lower": -0.0254, - "upper": 0.0254, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "uncertainty": { - "confidence_interval": { - "lower": -0.025, - "upper": 0.025, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, 
- "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0237, - "upper": 0.0237, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0258, - "upper": 0.0258, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9275, - "uncertainty": { - "confidence_interval": { - "lower": -0.0254, - "upper": 0.0254, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "uncertainty": { - "confidence_interval": { - "lower": -0.025, - "upper": 0.025, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/google/gemini-3-pro-preview/c0692e14-6484-4d02-8dac-55ce4373fb15.json b/data/global-mmlu-lite/google/gemini-3-pro-preview/c0692e14-6484-4d02-8dac-55ce4373fb15.json deleted file mode 100644 index 8538679be..000000000 --- a/data/global-mmlu-lite/google/gemini-3-pro-preview/c0692e14-6484-4d02-8dac-55ce4373fb15.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/google_gemini-3-pro-preview/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemini-3-pro-preview", - "id": "google/gemini-3-pro-preview", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Gemini 3 Pro Preview" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9453 - } - }, - { - "evaluation_name": "Culturally Sensitive", - 
"source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9397 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9509 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0219, - "upper": 0.0219, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9575, - "uncertainty": { - "confidence_interval": { - "lower": -0.0198, - "upper": 0.0198, - 
"method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.955, - "uncertainty": { - "confidence_interval": { - "lower": -0.0203, - "upper": 0.0203, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.955, - "uncertainty": { - "confidence_interval": { - "lower": -0.0203, - "upper": 0.0203, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - 
"evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0219, - "upper": 0.0219, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0219, - "upper": 0.0219, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/google/gemma-3-27b-it/ab4940d1-118c-479a-bd37-1ea2da6f02a3.json b/data/global-mmlu-lite/google/gemma-3-27b-it/ab4940d1-118c-479a-bd37-1ea2da6f02a3.json deleted file mode 100644 index 211f9d6b8..000000000 --- a/data/global-mmlu-lite/google/gemma-3-27b-it/ab4940d1-118c-479a-bd37-1ea2da6f02a3.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/google_gemma-3-27b-it/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-3-27b-it", - "id": "google/gemma-3-27b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Gemma 3 27B" - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7528 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7733 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "uncertainty": { - "confidence_interval": { - "lower": -0.0406, - "upper": 0.0406, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7337, - "uncertainty": { - "confidence_interval": { - "lower": -0.0434, - "upper": 0.0434, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "uncertainty": { - "confidence_interval": { - "lower": -0.0426, - "upper": 0.0426, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0409, - "upper": 0.0409, - "method": "unknown" - } - } - } - }, - { - 
"evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7481, - "uncertainty": { - "confidence_interval": { - "lower": -0.0429, - "upper": 0.0429, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7335, - "uncertainty": { - "confidence_interval": { - "lower": -0.0437, - "upper": 0.0437, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7563, - "uncertainty": { - "confidence_interval": { - "lower": -0.0422, - "upper": 0.0422, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "uncertainty": { - "confidence_interval": { - "lower": -0.0424, - "upper": 0.0424, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0397, - "upper": 0.0397, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "uncertainty": { - "confidence_interval": { - "lower": -0.0395, - "upper": 0.0395, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - 
Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7481, - "uncertainty": { - "confidence_interval": { - "lower": -0.0427, - "upper": 0.0427, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7494, - "uncertainty": { - "confidence_interval": { - "lower": -0.0425, - "upper": 0.0425, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "uncertainty": { - "confidence_interval": { - "lower": -0.0403, - "upper": 0.0403, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7444, - "uncertainty": { - "confidence_interval": { - "lower": -0.0428, - "upper": 0.0428, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0397, - "upper": 0.0397, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7719, - "uncertainty": { - "confidence_interval": { - "lower": -0.0412, - "upper": 0.0412, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/google/gemma-3-4b-it/85552093-435f-4d85-897d-4e74c3655533.json b/data/global-mmlu-lite/google/gemma-3-4b-it/85552093-435f-4d85-897d-4e74c3655533.json deleted file mode 100644 index f5d7db0a6..000000000 --- a/data/global-mmlu-lite/google/gemma-3-4b-it/85552093-435f-4d85-897d-4e74c3655533.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/google_gemma-3-4b-it/1770822797.839372", - "retrieved_timestamp": 
"1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-3-4b-it", - "id": "google/gemma-3-4b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Gemma 3 4B" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6511 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6116 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6906 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0467, - "upper": 0.0467, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "uncertainty": { - "confidence_interval": { - "lower": -0.0461, - "upper": 0.0461, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "uncertainty": { - "confidence_interval": { - "lower": -0.0457, - "upper": 0.0457, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - 
"dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0467, - "upper": 0.0467, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6575, - "uncertainty": { - "confidence_interval": { - "lower": -0.0465, - "upper": 0.0465, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0468, - "upper": 0.0468, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0458, - "upper": 0.0458, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0462, - "upper": 0.0462, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6325, - "uncertainty": { - "confidence_interval": { - "lower": -0.0472, - "upper": 0.0472, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "uncertainty": { - "confidence_interval": { - "lower": -0.0464, - "upper": 0.0464, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "uncertainty": { - "confidence_interval": { - "lower": -0.0457, - "upper": 0.0457, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6725, - "uncertainty": { - "confidence_interval": { - "lower": -0.046, - "upper": 0.046, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0479, - "upper": 0.0479, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0483, - "upper": 0.0483, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0468, - "upper": 0.0468, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "uncertainty": { - "confidence_interval": { - "lower": -0.0473, - "upper": 0.0473, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git 
a/data/global-mmlu-lite/mistralai/mistral-medium-3/4ddc0062-6577-4ab9-85f1-791fd2822776.json b/data/global-mmlu-lite/mistralai/mistral-medium-3/4ddc0062-6577-4ab9-85f1-791fd2822776.json deleted file mode 100644 index 242b4f1b9..000000000 --- a/data/global-mmlu-lite/mistralai/mistral-medium-3/4ddc0062-6577-4ab9-85f1-791fd2822776.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/mistralai_mistral-medium-3/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-medium-3", - "id": "mistralai/mistral-medium-3", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Mistral Medium 3" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5511 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5631 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455, - "uncertainty": { - "confidence_interval": { - "lower": -0.0488, - "upper": 0.0488, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "uncertainty": { - "confidence_interval": { - "lower": -0.0476, - "upper": 0.0476, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": 
"global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5175, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0489, - "upper": 0.0489, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.41, - "uncertainty": { - "confidence_interval": { - "lower": -0.0482, - "upper": 0.0482, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555, - "uncertainty": { - "confidence_interval": { - "lower": -0.0487, - "upper": 0.0487, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.515, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.535, - "uncertainty": { - "confidence_interval": { - "lower": -0.0489, - "upper": 0.0489, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "uncertainty": { - "confidence_interval": { - "lower": -0.0484, - "upper": 0.0484, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595, - "uncertainty": { - "confidence_interval": { - "lower": -0.0481, - "upper": 0.0481, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5175, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0489, - "upper": 0.0489, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0446, - "upper": 0.0446, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0414, - "upper": 0.0414, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.535, - "uncertainty": { - "confidence_interval": { - "lower": -0.0489, - "upper": 0.0489, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": 
"global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7325, - "uncertainty": { - "confidence_interval": { - "lower": -0.0434, - "upper": 0.0434, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/mistralai/mistral-small-2503/50fc4840-933b-43ec-847e-1834b30f9f14.json b/data/global-mmlu-lite/mistralai/mistral-small-2503/50fc4840-933b-43ec-847e-1834b30f9f14.json deleted file mode 100644 index afd35d897..000000000 --- a/data/global-mmlu-lite/mistralai/mistral-small-2503/50fc4840-933b-43ec-847e-1834b30f9f14.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/mistralai_mistral-small-2503/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-small-2503", - "id": "mistralai/mistral-small-2503", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Mistral Small 3.1" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7852 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7537 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8166 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7875, - "uncertainty": { - "confidence_interval": { - "lower": -0.0401, - "upper": 0.0401, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - 
"source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "uncertainty": { - "confidence_interval": { - "lower": -0.0392, - "upper": 0.0392, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0411, - "upper": 0.0411, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7975, - "uncertainty": { - "confidence_interval": { - "lower": -0.0394, - "upper": 0.0394, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "uncertainty": { - "confidence_interval": { - "lower": -0.0392, - "upper": 0.0392, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795, - "uncertainty": { - "confidence_interval": { - "lower": -0.0396, - "upper": 0.0396, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "uncertainty": { - "confidence_interval": { - "lower": -0.0403, - "upper": 0.0403, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.805, - "uncertainty": { - "confidence_interval": { - "lower": -0.0388, - "upper": 0.0388, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "uncertainty": { - "confidence_interval": { - "lower": -0.0412, - "upper": 0.0412, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "uncertainty": { - "confidence_interval": { - "lower": -0.0399, - "upper": 0.0399, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0397, - "upper": 0.0397, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0404, - "upper": 0.0404, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0409, - "upper": 0.0409, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "uncertainty": { - "confidence_interval": { - "lower": -0.0432, - "upper": 0.0432, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - 
"source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0397, - "upper": 0.0397, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0404, - "upper": 0.0404, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/openai/gpt-4.1-2025-04-14/6cdc5384-2be5-47e0-a9b2-9cd6719c1760.json b/data/global-mmlu-lite/openai/gpt-4.1-2025-04-14/6cdc5384-2be5-47e0-a9b2-9cd6719c1760.json deleted file mode 100644 index 4ace59a99..000000000 --- a/data/global-mmlu-lite/openai/gpt-4.1-2025-04-14/6cdc5384-2be5-47e0-a9b2-9cd6719c1760.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/openai_gpt-4.1-2025-04-14/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-4.1-2025-04-14", - "id": "openai/gpt-4.1-2025-04-14", - "developer": "openai", - "inference_platform": "unknown", - "additional_details": { - "display_name": "GPT-4.1" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8755 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8541 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8969 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0316, - "upper": 0.0316, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8625, - "uncertainty": { - "confidence_interval": { - "lower": -0.0337, - "upper": 0.0337, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "uncertainty": { - "confidence_interval": { - "lower": -0.0324, - "upper": 0.0324, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8875, - "uncertainty": { - "confidence_interval": { - "lower": -0.031, - "upper": 0.031, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0321, - "upper": 0.0321, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.885, - "uncertainty": { - "confidence_interval": { - "lower": -0.0313, - "upper": 0.0313, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0327, - "upper": 0.0327, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "uncertainty": { - "confidence_interval": { - "lower": -0.033, - "upper": 0.033, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "uncertainty": { - "confidence_interval": { - "lower": -0.0324, - "upper": 0.0324, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "uncertainty": { - "confidence_interval": { - "lower": -0.0313, - "upper": 0.0313, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0327, - "upper": 0.0327, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "uncertainty": { - "confidence_interval": { - "lower": -0.0324, - "upper": 0.0324, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "uncertainty": { - "confidence_interval": { - "lower": -0.033, - "upper": 0.033, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8575, - "uncertainty": { - "confidence_interval": { - "lower": -0.0343, - "upper": 0.0343, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/openai/gpt-5-2025-08-07/a668c931-34e4-4702-a84c-97d8c6f59ef4.json b/data/global-mmlu-lite/openai/gpt-5-2025-08-07/a668c931-34e4-4702-a84c-97d8c6f59ef4.json deleted file mode 100644 index 7b0435821..000000000 --- a/data/global-mmlu-lite/openai/gpt-5-2025-08-07/a668c931-34e4-4702-a84c-97d8c6f59ef4.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/openai_gpt-5-2025-08-07/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-5-2025-08-07", - "id": "openai/gpt-5-2025-08-07", - "developer": "openai", - "inference_platform": "unknown", - "additional_details": { - "display_name": "GPT-5" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8895 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8913 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - 
"source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8878 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0304, - "upper": 0.0304, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0327, - "upper": 0.0327, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "uncertainty": { - "confidence_interval": { - "lower": -0.0294, - "upper": 0.0294, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "uncertainty": { - "confidence_interval": { - "lower": -0.028, - "upper": 0.028, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "uncertainty": { - "confidence_interval": { - "lower": -0.0335, 
- "upper": 0.0335, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795, - "uncertainty": { - "confidence_interval": { - "lower": -0.0396, - "upper": 0.0396, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8875, - "uncertainty": { - "confidence_interval": { - "lower": -0.031, - "upper": 0.031, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8875, - "uncertainty": { - "confidence_interval": { - "lower": -0.031, - "upper": 0.031, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "uncertainty": { - "confidence_interval": { - "lower": -0.0287, - "upper": 0.0287, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - 
"metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "uncertainty": { - "confidence_interval": { - "lower": -0.0335, - "upper": 0.0335, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.895, - "uncertainty": { - "confidence_interval": { - "lower": -0.03, - "upper": 0.03, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/openai/o3-mini-2025-01-31/3a7e2aa6-4e57-446f-a127-4a7e022fe3e1.json b/data/global-mmlu-lite/openai/o3-mini-2025-01-31/3a7e2aa6-4e57-446f-a127-4a7e022fe3e1.json deleted file mode 100644 index 0d22ba810..000000000 --- a/data/global-mmlu-lite/openai/o3-mini-2025-01-31/3a7e2aa6-4e57-446f-a127-4a7e022fe3e1.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/openai_o3-mini-2025-01-31/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "o3-mini-2025-01-31", - "id": "openai/o3-mini-2025-01-31", - "developer": "openai", - "inference_platform": "unknown", - "additional_details": { - "display_name": "o3 mini" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": 
"global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.765 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0411, - "upper": 0.0411, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8025, - "uncertainty": { - "confidence_interval": { - "lower": -0.039, - "upper": 0.039, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "uncertainty": { - "confidence_interval": { - "lower": -0.0412, - "upper": 0.0412, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0423, - "upper": 0.0423, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "uncertainty": { - "confidence_interval": { - "lower": -0.043, - "upper": 0.043, - "method": "unknown" - } - } - } - }, - { - 
"evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0423, - "upper": 0.0423, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0429, - "upper": 0.0429, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "uncertainty": { - "confidence_interval": { - "lower": -0.0392, - "upper": 0.0392, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "uncertainty": { - "confidence_interval": { - "lower": -0.0384, - "upper": 0.0384, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0386, - "upper": 0.0386, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7975, - "uncertainty": { - "confidence_interval": { - "lower": -0.0394, - "upper": 0.0394, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - 
Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0409, - "upper": 0.0409, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.765, - "uncertainty": { - "confidence_interval": { - "lower": -0.0416, - "upper": 0.0416, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0411, - "upper": 0.0411, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0382, - "upper": 0.0382, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0386, - "upper": 0.0386, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/unknown/aya-expanse-32b/938a35f1-195d-49c8-9a16-90fab96692bd.json b/data/global-mmlu-lite/unknown/aya-expanse-32b/938a35f1-195d-49c8-9a16-90fab96692bd.json deleted file mode 100644 index 4e5593fdf..000000000 --- a/data/global-mmlu-lite/unknown/aya-expanse-32b/938a35f1-195d-49c8-9a16-90fab96692bd.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/unknown_aya-expanse-32b/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "aya-expanse-32b", - "id": "unknown/aya-expanse-32b", - "developer": "unknown", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Aya Expanse 32B" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - 
"source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7353 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6891 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7815 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0429, - "upper": 0.0429, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7544, - "uncertainty": { - "confidence_interval": { - "lower": -0.0422, - "upper": 0.0422, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7343, - "uncertainty": { - "confidence_interval": { - "lower": -0.0433, - "upper": 0.0433, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0429, - "upper": 0.0429, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - 
"dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7325, - "uncertainty": { - "confidence_interval": { - "lower": -0.0434, - "upper": 0.0434, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0431, - "upper": 0.0431, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7594, - "uncertainty": { - "confidence_interval": { - "lower": -0.0419, - "upper": 0.0419, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305, - "uncertainty": { - "confidence_interval": { - "lower": -0.0436, - "upper": 0.0436, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7419, - "uncertainty": { - "confidence_interval": { - "lower": -0.0429, - "upper": 0.0429, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0423, - "upper": 0.0423, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7544, - "uncertainty": { - "confidence_interval": { - "lower": -0.0422, - "upper": 0.0422, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7362, - "uncertainty": { - "confidence_interval": { - "lower": -0.0433, - "upper": 0.0433, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7071, - "uncertainty": { - "confidence_interval": { - "lower": -0.0448, - "upper": 0.0448, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6942, - "uncertainty": { - "confidence_interval": { - "lower": -0.0452, - "upper": 0.0452, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "uncertainty": { - "confidence_interval": { - "lower": -0.0432, - "upper": 0.0432, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7025, - "uncertainty": { - "confidence_interval": { - "lower": -0.0448, - "upper": 0.0448, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/unknown/granite-4.0-h-small/ce756801-f75e-4250-9721-1d627a37f055.json b/data/global-mmlu-lite/unknown/granite-4.0-h-small/ce756801-f75e-4250-9721-1d627a37f055.json deleted file mode 100644 index fd8643d63..000000000 --- a/data/global-mmlu-lite/unknown/granite-4.0-h-small/ce756801-f75e-4250-9721-1d627a37f055.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/unknown_granite-4.0-h-small/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - 
"source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-4.0-h-small", - "id": "unknown/granite-4.0-h-small", - "developer": "unknown", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Granite 4.0 Small" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7503 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7182 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7826 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7613, - "uncertainty": { - "confidence_interval": { - "lower": -0.0419, - "upper": 0.0419, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "uncertainty": { - "confidence_interval": { - "lower": -0.0412, - "upper": 0.0412, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7613, - "uncertainty": { - "confidence_interval": { - "lower": -0.0419, - "upper": 0.0419, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - 
"dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "uncertainty": { - "confidence_interval": { - "lower": -0.0421, - "upper": 0.0421, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7594, - "uncertainty": { - "confidence_interval": { - "lower": -0.0419, - "upper": 0.0419, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7575, - "uncertainty": { - "confidence_interval": { - "lower": -0.042, - "upper": 0.042, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7614, - "uncertainty": { - "confidence_interval": { - "lower": -0.0421, - "upper": 0.0421, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0423, - "upper": 0.0423, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7406, - "uncertainty": { - "confidence_interval": { - "lower": -0.0431, - "upper": 0.0431, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0423, - "upper": 0.0423, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757, - "uncertainty": { - "confidence_interval": { - "lower": -0.0423, - "upper": 0.0423, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7638, - "uncertainty": { - "confidence_interval": { - "lower": -0.0417, - "upper": 0.0417, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7318, - "uncertainty": { - "confidence_interval": { - "lower": -0.0435, - "upper": 0.0435, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6921, - "uncertainty": { - "confidence_interval": { - "lower": -0.0456, - "upper": 0.0456, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0426, - "upper": 0.0426, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7419, - "uncertainty": { - "confidence_interval": { - "lower": -0.0429, - "upper": 0.0429, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git 
a/data/global-mmlu-lite/unknown/o4-mini-2025-04-16/b83b41d4-6c95-4c7d-a290-65d89bf776c2.json b/data/global-mmlu-lite/unknown/o4-mini-2025-04-16/b83b41d4-6c95-4c7d-a290-65d89bf776c2.json deleted file mode 100644 index 95a579825..000000000 --- a/data/global-mmlu-lite/unknown/o4-mini-2025-04-16/b83b41d4-6c95-4c7d-a290-65d89bf776c2.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/unknown_o4-mini-2025-04-16/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "o4-mini-2025-04-16", - "id": "unknown/o4-mini-2025-04-16", - "developer": "unknown", - "inference_platform": "unknown", - "additional_details": { - "display_name": "o4 mini" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8705 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8503 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8906 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "uncertainty": { - "confidence_interval": { - "lower": -0.0335, - "upper": 0.0335, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0332, - "upper": 0.0332, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - 
"source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8875, - "uncertainty": { - "confidence_interval": { - "lower": -0.031, - "upper": 0.031, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0321, - "upper": 0.0321, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "uncertainty": { - "confidence_interval": { - "lower": -0.033, - "upper": 0.033, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "uncertainty": { - "confidence_interval": { - "lower": -0.033, - "upper": 0.033, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0332, - "upper": 0.0332, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "uncertainty": { - "confidence_interval": { - "lower": -0.0345, - "upper": 0.0345, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.885, - "uncertainty": { - "confidence_interval": { - "lower": -0.0313, - "upper": 0.0313, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "uncertainty": { - "confidence_interval": { - "lower": -0.0345, - "upper": 0.0345, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0348, - "upper": 0.0348, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0348, - "upper": 0.0348, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "uncertainty": { - "confidence_interval": { - "lower": -0.0307, - "upper": 0.0307, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": 
"url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0327, - "upper": 0.0327, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/xai/grok-3-mini/31c3fe1b-be4b-42ef-8ec0-9da323b2ebb6.json b/data/global-mmlu-lite/xai/grok-3-mini/31c3fe1b-be4b-42ef-8ec0-9da323b2ebb6.json deleted file mode 100644 index f816ebb33..000000000 --- a/data/global-mmlu-lite/xai/grok-3-mini/31c3fe1b-be4b-42ef-8ec0-9da323b2ebb6.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/xai_grok-3-mini/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "grok-3-mini", - "id": "xai/grok-3-mini", - "developer": "xai", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Grok 3 Mini" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6717 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6743 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "uncertainty": { - "confidence_interval": { - "lower": -0.0421, - "upper": 0.0421, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { 
- "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7355, - "uncertainty": { - "confidence_interval": { - "lower": -0.0434, - "upper": 0.0434, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6591, - "uncertainty": { - "confidence_interval": { - "lower": -0.0465, - "upper": 0.0465, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.485, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "uncertainty": { - "confidence_interval": { - "lower": -0.0486, - "upper": 0.0486, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0438, - "upper": 0.0438, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "uncertainty": { - "confidence_interval": { - "lower": -0.0452, - "upper": 
0.0452, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6575, - "uncertainty": { - "confidence_interval": { - "lower": -0.0465, - "upper": 0.0465, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7325, - "uncertainty": { - "confidence_interval": { - "lower": -0.0434, - "upper": 0.0434, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6275, - "uncertainty": { - "confidence_interval": { - "lower": -0.0474, - "upper": 0.0474, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "uncertainty": { - "confidence_interval": { - "lower": -0.0478, - "upper": 0.0478, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7625, - "uncertainty": { - "confidence_interval": { - "lower": -0.0417, - "upper": 0.0417, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8296, - "uncertainty": { - "confidence_interval": { - "lower": -0.0369, - "upper": 0.0369, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { 
- "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5564, - "uncertainty": { - "confidence_interval": { - "lower": -0.0487, - "upper": 0.0487, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8693, - "uncertainty": { - "confidence_interval": { - "lower": -0.0331, - "upper": 0.0331, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/xai/grok-4-0709/a8e0fc0e-b3a4-4a0b-938f-aa11f1c64358.json b/data/global-mmlu-lite/xai/grok-4-0709/a8e0fc0e-b3a4-4a0b-938f-aa11f1c64358.json deleted file mode 100644 index 4e37c60a0..000000000 --- a/data/global-mmlu-lite/xai/grok-4-0709/a8e0fc0e-b3a4-4a0b-938f-aa11f1c64358.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/xai_grok-4-0709/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "grok-4-0709", - "id": "xai/grok-4-0709", - "developer": "xai", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Grok 4" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8881 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8862 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "uncertainty": { - "confidence_interval": { - "lower": -0.0313, - "upper": 0.0313, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "uncertainty": { - "confidence_interval": { - "lower": -0.0287, - "upper": 0.0287, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0304, - "upper": 0.0304, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0327, - "upper": 0.0327, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "uncertainty": { - "confidence_interval": { - "lower": -0.0324, - "upper": 0.0324, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0332, - "upper": 0.0332, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "uncertainty": { - "confidence_interval": { - "lower": -0.0307, - "upper": 0.0307, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": 
"global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9025, - "uncertainty": { - "confidence_interval": { - "lower": -0.0291, - "upper": 0.0291, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "uncertainty": { - "confidence_interval": { - "lower": -0.033, - "upper": 0.033, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.895, - "uncertainty": { - "confidence_interval": { - "lower": -0.03, - "upper": 0.03, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0327, - "upper": 0.0327, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "uncertainty": { - "confidence_interval": { - "lower": -0.028, - "upper": 0.028, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "uncertainty": { - "confidence_interval": { - "lower": -0.0287, - "upper": 0.0287, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0348, - "upper": 0.0348, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json b/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json deleted file mode 100644 index 8176fa91a..000000000 --- a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json +++ /dev/null @@ -1,352 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo 2 32B Instruct March 2025", - "id": "allenai/olmo-2-0325-32b-instruct", - "developer": "allenai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 191.7591204277284 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414, - "details": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.414 (1)", - "tab": 
"Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=106.958, mean=106.958, max=106.958, sum=106.958 (1)", - "tab": "Efficiency", - "score": 106.95772108364105 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.506, mean=228.506, max=228.506, sum=228.506 (1)", - "tab": "General information", - "score": 228.506 - }, - "MMLU-Pro - # output tokens": { - "description": "min=338.34, mean=338.34, max=338.34, sum=338.34 (1)", - "tab": "General information", - "score": 338.34 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287, - "details": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.287 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=161.247, mean=161.247, max=161.247, sum=161.247 (1)", - "tab": "Efficiency", - "score": 161.24673478646127 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "General information", - "score": 0.002242152466367713 - }, - "GPQA - # prompt tokens": { - "description": "min=247.26, mean=247.26, max=247.26, sum=247.26 (1)", - "tab": "General information", - "score": 247.26008968609867 - }, - "GPQA - # output tokens": { - "description": "min=526.352, mean=526.352, max=526.352, sum=526.352 (1)", - "tab": "General information", - "score": 526.3520179372198 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=0.78 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=78.302, mean=78.302, max=78.302, sum=78.302 (1)", - "tab": "Efficiency", - "score": 78.30223875301382 - }, - "IFEval 
- # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.054, mean=46.054, max=46.054, sum=46.054 (1)", - "tab": "General information", - "score": 46.05360443622921 - }, - "IFEval - # output tokens": { - "description": "min=260.017, mean=260.017, max=260.017, sum=260.017 (1)", - "tab": "General information", - "score": 260.0166358595194 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.734, - "details": { - "description": "min=0.734, mean=0.734, max=0.734, sum=0.734 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=333.659, mean=333.659, max=333.659, sum=333.659 (1)", - "tab": "Efficiency", - "score": 333.659037665844 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=764.742, mean=764.742, max=764.742, sum=764.742 (1)", - "tab": "General information", - "score": 764.742 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.161, - "details": { - "description": "min=0.161, mean=0.161, max=0.161, sum=0.161 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=278.63, mean=278.63, max=278.63, sum=278.63 (1)", - "tab": "Efficiency", - "score": 278.6298698496819 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": 
"General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=108.843, mean=108.843, max=108.843, sum=108.843 (1)", - "tab": "General information", - "score": 108.843 - }, - "Omni-MATH - # output tokens": { - "description": "min=573.483, mean=573.483, max=573.483, sum=573.483 (1)", - "tab": "General information", - "score": 573.483 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json b/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json deleted file mode 100644 index 4d2b264af..000000000 --- a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json +++ /dev/null @@ -1,352 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo 2 13B Instruct November 2024", - "id": "allenai/olmo-2-1124-13b-instruct", - "developer": "allenai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 103.93921828652563 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31, - "details": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.31 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=48.22, mean=48.22, max=48.22, sum=48.22 (1)", - "tab": "Efficiency", - "score": 48.21963578557968 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.506, mean=228.506, max=228.506, sum=228.506 (1)", - "tab": "General information", - "score": 228.506 - }, - "MMLU-Pro - # output tokens": { - "description": 
"min=200.755, mean=200.755, max=200.755, sum=200.755 (1)", - "tab": "General information", - "score": 200.755 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316, - "details": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.316 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=44.368, mean=44.368, max=44.368, sum=44.368 (1)", - "tab": "Efficiency", - "score": 44.36780591235567 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "General information", - "score": 0.002242152466367713 - }, - "GPQA - # prompt tokens": { - "description": "min=247.26, mean=247.26, max=247.26, sum=247.26 (1)", - "tab": "General information", - "score": 247.26008968609867 - }, - "GPQA - # output tokens": { - "description": "min=185.419, mean=185.419, max=185.419, sum=185.419 (1)", - "tab": "General information", - "score": 185.41928251121075 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=0.73 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=71.901, mean=71.901, max=71.901, sum=71.901 (1)", - "tab": "Efficiency", - "score": 71.90055892868536 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.054, mean=46.054, max=46.054, sum=46.054 (1)", - "tab": "General information", - "score": 46.05360443622921 - }, - "IFEval - # output tokens": { - "description": "min=311.527, mean=311.527, max=311.527, sum=311.527 (1)", - "tab": "General information", - "score": 311.5268022181146 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": 
"2048" - } - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689, - "details": { - "description": "min=0.689, mean=0.689, max=0.689, sum=0.689 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=194.337, mean=194.337, max=194.337, sum=194.337 (1)", - "tab": "Efficiency", - "score": 194.33703967285157 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=771.135, mean=771.135, max=771.135, sum=771.135 (1)", - "tab": "General information", - "score": 771.135 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.156, - "details": { - "description": "min=0.156, mean=0.156, max=0.156, sum=0.156 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=160.871, mean=160.871, max=160.871, sum=160.871 (1)", - "tab": "Efficiency", - "score": 160.87105113315582 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=108.843, mean=108.843, max=108.843, sum=108.843 (1)", - "tab": "General information", - "score": 108.843 - }, - "Omni-MATH - # output tokens": { - "description": "min=681.572, mean=681.572, max=681.572, sum=681.572 (1)", - "tab": "General information", - "score": 681.572 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json b/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json deleted file mode 100644 index 39fbc0d1c..000000000 --- 
a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json +++ /dev/null @@ -1,352 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo 2 7B Instruct November 2024", - "id": "allenai/olmo-2-1124-7b-instruct", - "developer": "allenai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 164.44917339954657 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292, - "details": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.292 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=65.565, mean=65.565, max=65.565, sum=65.565 (1)", - "tab": "Efficiency", - "score": 65.56540368175507 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.506, mean=228.506, max=228.506, sum=228.506 (1)", - "tab": "General information", - "score": 228.506 - }, - "MMLU-Pro - # output tokens": { - "description": "min=265.659, mean=265.659, max=265.659, sum=265.659 (1)", - "tab": "General information", - "score": 265.659 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.296, - "details": { - 
"description": "min=0.296, mean=0.296, max=0.296, sum=0.296 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=184.733, mean=184.733, max=184.733, sum=184.733 (1)", - "tab": "Efficiency", - "score": 184.73346061877606 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "General information", - "score": 0.002242152466367713 - }, - "GPQA - # prompt tokens": { - "description": "min=247.26, mean=247.26, max=247.26, sum=247.26 (1)", - "tab": "General information", - "score": 247.26008968609867 - }, - "GPQA - # output tokens": { - "description": "min=381.121, mean=381.121, max=381.121, sum=381.121 (1)", - "tab": "General information", - "score": 381.1210762331838 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=0.693 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=102.503, mean=102.503, max=102.503, sum=102.503 (1)", - "tab": "Efficiency", - "score": 102.50307150909508 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.054, mean=46.054, max=46.054, sum=46.054 (1)", - "tab": "General information", - "score": 46.05360443622921 - }, - "IFEval - # output tokens": { - "description": "min=306.706, mean=306.706, max=306.706, sum=306.706 (1)", - "tab": "General information", - "score": 306.70609981515713 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.628, - "details": { - "description": "min=0.628, mean=0.628, max=0.628, sum=0.628 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=236.772, mean=236.772, max=236.772, sum=236.772 (1)", - "tab": "Efficiency", - "score": 
236.77177815794946 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=768.348, mean=768.348, max=768.348, sum=768.348 (1)", - "tab": "General information", - "score": 768.348 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.116, - "details": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.116 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=232.672, mean=232.672, max=232.672, sum=232.672 (1)", - "tab": "Efficiency", - "score": 232.6721530301571 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=108.843, mean=108.843, max=108.843, sum=108.843 (1)", - "tab": "General information", - "score": 108.843 - }, - "Omni-MATH - # output tokens": { - "description": "min=799.769, mean=799.769, max=799.769, sum=799.769 (1)", - "tab": "General information", - "score": 799.769 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json b/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json deleted file mode 100644 index 99d31c069..000000000 --- a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json +++ /dev/null @@ -1,352 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMoE 1B-7B Instruct January 2025", - "id": "allenai/olmoe-1b-7b-0125-instruct", - "developer": "allenai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": 
"helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.332, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 449.11527986486544 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.169, - "details": { - "description": "min=0.169, mean=0.169, max=0.169, sum=0.169 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=226.84, mean=226.84, max=226.84, sum=226.84 (1)", - "tab": "Efficiency", - "score": 226.84002213978766 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=231.403, mean=231.403, max=231.403, sum=231.403 (1)", - "tab": "General information", - "score": 231.403 - }, - "MMLU-Pro - # output tokens": { - "description": "min=237.89, mean=237.89, max=237.89, sum=237.89 (1)", - "tab": "General information", - "score": 237.89 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.22, - "details": { - "description": "min=0.22, mean=0.22, max=0.22, sum=0.22 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=263.918, mean=263.918, max=263.918, sum=263.918 (1)", - "tab": "Efficiency", - "score": 263.9177615305768 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "General information", - "score": 0.002242152466367713 - }, - "GPQA - # prompt tokens": { - "description": "min=249.803, mean=249.803, max=249.803, 
sum=249.803 (1)", - "tab": "General information", - "score": 249.80269058295963 - }, - "GPQA - # output tokens": { - "description": "min=302.475, mean=302.475, max=302.475, sum=302.475 (1)", - "tab": "General information", - "score": 302.47533632286996 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.628, - "details": { - "description": "min=0.628, mean=0.628, max=0.628, sum=0.628 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=437.953, mean=437.953, max=437.953, sum=437.953 (1)", - "tab": "Efficiency", - "score": 437.95291065332407 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.782, mean=47.782, max=47.782, sum=47.782 (1)", - "tab": "General information", - "score": 47.781885397412196 - }, - "IFEval - # output tokens": { - "description": "min=432.808, mean=432.808, max=432.808, sum=432.808 (1)", - "tab": "General information", - "score": 432.80776340110907 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.551, - "details": { - "description": "min=0.551, mean=0.551, max=0.551, sum=0.551 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=915.237, mean=915.237, max=915.237, sum=915.237 (1)", - "tab": "Efficiency", - "score": 915.2368009176254 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=972.482, mean=972.482, max=972.482, sum=972.482 (1)", - "tab": "General information", - "score": 972.482 - } - } - }, - "generation_config": { - 
"additional_details": { - "subset": "v2", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.093, - "details": { - "description": "min=0.093, mean=0.093, max=0.093, sum=0.093 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=401.629, mean=401.629, max=401.629, sum=401.629 (1)", - "tab": "Efficiency", - "score": 401.62890408301354 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.864, mean=110.864, max=110.864, sum=110.864 (1)", - "tab": "General information", - "score": 110.864 - }, - "Omni-MATH - # output tokens": { - "description": "min=442.229, mean=442.229, max=442.229, sum=442.229 (1)", - "tab": "General information", - "score": 442.229 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json b/data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json deleted file mode 100644 index c786f36c7..000000000 --- a/data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/amazon_nova-lite-v1:0/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Lite", - "id": "amazon/nova-lite-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.551, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 2.6046740288354906 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=0.6 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=1.375, mean=1.375, max=1.375, sum=1.375 (1)", - "tab": "Efficiency", - "score": 1.3748559999999983 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=235.232, mean=235.232, max=235.232, sum=235.232 (1)", - "tab": "General information", - "score": 235.232 - }, - "MMLU-Pro - # output tokens": { - "description": "min=343.771, mean=343.771, max=343.771, sum=343.771 (1)", - "tab": "General information", - "score": 343.771 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397, - "details": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.397 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=2.04, mean=2.04, max=2.04, sum=2.04 (1)", - "tab": "Efficiency", - "score": 2.0404999999999998 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=264.121, mean=264.121, max=264.121, sum=264.121 (1)", - "tab": "General information", - "score": 264.1210762331838 - }, - "GPQA - # output tokens": { - "description": "min=512.256, mean=512.256, max=512.256, sum=512.256 (1)", - "tab": "General information", - "score": 512.2556053811659 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.776, - "details": { - "description": "min=0.776, mean=0.776, max=0.776, sum=0.776 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.156, mean=3.156, max=3.156, sum=3.156 (1)", - "tab": "Efficiency", - "score": 3.1562421441774484 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.58, mean=47.58, max=47.58, sum=47.58 (1)", - "tab": "General information", - "score": 47.58040665434381 - }, - "IFEval - # output tokens": { - "description": "min=412.706, mean=412.706, max=412.706, sum=412.706 (1)", - "tab": "General information", - "score": 412.70609981515713 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=0.75 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=4.034, mean=4.034, max=4.034, sum=4.034 (1)", - "tab": "Efficiency", - "score": 4.0338700000000065 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=938.586, mean=938.586, max=938.586, sum=938.586 (1)", - "tab": "General information", - "score": 938.586 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.233, - "details": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.233 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=2.418, mean=2.418, max=2.418, sum=2.418 (1)", - "tab": "Efficiency", - "score": 2.4179019999999993 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 
- }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=117.921, mean=117.921, max=117.921, sum=117.921 (1)", - "tab": "General information", - "score": 117.921 - }, - "Omni-MATH - # output tokens": { - "description": "min=788.8, mean=788.8, max=788.8, sum=788.8 (1)", - "tab": "General information", - "score": 788.8 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json b/data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json deleted file mode 100644 index 6219cdf47..000000000 --- a/data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/amazon_nova-micro-v1:0/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Micro", - "id": "amazon/nova-micro-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.522, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 2.157983343244118 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511, - "details": { - "description": "min=0.511, mean=0.511, max=0.511, sum=0.511 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=1.316, mean=1.316, max=1.316, sum=1.316 (1)", - "tab": "Efficiency", - "score": 1.3163370000000014 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=235.232, mean=235.232, max=235.232, sum=235.232 
(1)", - "tab": "General information", - "score": 235.232 - }, - "MMLU-Pro - # output tokens": { - "description": "min=367.695, mean=367.695, max=367.695, sum=367.695 (1)", - "tab": "General information", - "score": 367.695 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383, - "details": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.383 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=2.134, mean=2.134, max=2.134, sum=2.134 (1)", - "tab": "Efficiency", - "score": 2.1342376681614366 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=264.121, mean=264.121, max=264.121, sum=264.121 (1)", - "tab": "General information", - "score": 264.1210762331838 - }, - "GPQA - # output tokens": { - "description": "min=587.372, mean=587.372, max=587.372, sum=587.372 (1)", - "tab": "General information", - "score": 587.3721973094171 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=0.76 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=1.605, mean=1.605, max=1.605, sum=1.605 (1)", - "tab": "Efficiency", - "score": 1.6054140480591508 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.58, mean=47.58, max=47.58, sum=47.58 (1)", - "tab": "General information", - "score": 47.58040665434381 - }, - "IFEval - # output tokens": { - "description": "min=385.473, mean=385.473, max=385.473, sum=385.473 (1)", - "tab": "General information", - "score": 385.4731977818854 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=3.624, mean=3.624, max=3.624, sum=3.624 (1)", - "tab": "Efficiency", - "score": 3.6235889999999995 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=925.586, mean=925.586, max=925.586, sum=925.586 (1)", - "tab": "General information", - "score": 925.586 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.214, - "details": { - "description": "min=0.214, mean=0.214, max=0.214, sum=0.214 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=2.11, mean=2.11, max=2.11, sum=2.11 (1)", - "tab": "Efficiency", - "score": 2.1103390000000006 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=117.921, mean=117.921, max=117.921, sum=117.921 (1)", - "tab": "General information", - "score": 117.921 - }, - "Omni-MATH - # output tokens": { - "description": "min=743.286, mean=743.286, max=743.286, sum=743.286 (1)", - "tab": "General information", - "score": 743.286 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json b/data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json deleted file mode 100644 index d9f1bd857..000000000 --- a/data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "helm_capabilities/amazon_nova-premier-v1:0/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Premier", - "id": "amazon/nova-premier-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 7.8055529408801165 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=5.032, mean=5.032, max=5.032, sum=5.032 (1)", - "tab": "Efficiency", - "score": 5.031505000000002 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=235.232, mean=235.232, max=235.232, sum=235.232 (1)", - "tab": "General information", - "score": 235.232 - }, - "MMLU-Pro - # output tokens": { - "description": "min=360.651, mean=360.651, max=360.651, sum=360.651 (1)", - "tab": "General information", - "score": 360.651 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518, - "details": { - "description": "min=0.518, mean=0.518, max=0.518, sum=0.518 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=6.746, mean=6.746, max=6.746, sum=6.746 (1)", - "tab": "Efficiency", - "score": 
6.7455403587443925 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=264.121, mean=264.121, max=264.121, sum=264.121 (1)", - "tab": "General information", - "score": 264.1210762331838 - }, - "GPQA - # output tokens": { - "description": "min=452.691, mean=452.691, max=452.691, sum=452.691 (1)", - "tab": "General information", - "score": 452.69058295964123 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.803, - "details": { - "description": "min=0.803, mean=0.803, max=0.803, sum=0.803 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=6.027, mean=6.027, max=6.027, sum=6.027 (1)", - "tab": "Efficiency", - "score": 6.026593345656195 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.58, mean=47.58, max=47.58, sum=47.58 (1)", - "tab": "General information", - "score": 47.58040665434381 - }, - "IFEval - # output tokens": { - "description": "min=325.945, mean=325.945, max=325.945, sum=325.945 (1)", - "tab": "General information", - "score": 325.9445471349353 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=0.788 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=13.055, mean=13.055, max=13.055, sum=13.055 (1)", - "tab": "Efficiency", - "score": 13.055127999999996 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=814.969, mean=814.969, max=814.969, sum=814.969 (1)", - "tab": "General information", - "score": 814.969 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35, - "details": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.35 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=8.169, mean=8.169, max=8.169, sum=8.169 (1)", - "tab": "Efficiency", - "score": 8.168997999999998 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=117.921, mean=117.921, max=117.921, sum=117.921 (1)", - "tab": "General information", - "score": 117.921 - }, - "Omni-MATH - # output tokens": { - "description": "min=778.909, mean=778.909, max=778.909, sum=778.909 (1)", - "tab": "General information", - "score": 778.909 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json b/data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json deleted file mode 100644 index 658945ff5..000000000 --- a/data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/amazon_nova-pro-v1:0/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Pro", - "id": "amazon/nova-pro-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 
6.538285667967472 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.673, mean=0.673, max=0.673, sum=0.673 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=4.554, mean=4.554, max=4.554, sum=4.554 (1)", - "tab": "Efficiency", - "score": 4.554401999999996 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=235.232, mean=235.232, max=235.232, sum=235.232 (1)", - "tab": "General information", - "score": 235.232 - }, - "MMLU-Pro - # output tokens": { - "description": "min=381.807, mean=381.807, max=381.807, sum=381.807 (1)", - "tab": "General information", - "score": 381.807 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446, - "details": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.446 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=5.948, mean=5.948, max=5.948, sum=5.948 (1)", - "tab": "Efficiency", - "score": 5.947926008968607 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=264.121, mean=264.121, max=264.121, sum=264.121 (1)", - "tab": "General information", - "score": 264.1210762331838 - }, - "GPQA - # output tokens": { - "description": "min=534.013, mean=534.013, max=534.013, sum=534.013 (1)", - "tab": "General information", - "score": 534.0134529147982 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.945, mean=3.945, max=3.945, sum=3.945 (1)", - "tab": "Efficiency", - "score": 3.945081330868756 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.58, mean=47.58, max=47.58, sum=47.58 (1)", - "tab": "General information", - "score": 47.58040665434381 - }, - "IFEval - # output tokens": { - "description": "min=383.871, mean=383.871, max=383.871, sum=383.871 (1)", - "tab": "General information", - "score": 383.8706099815157 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.635, mean=10.635, max=10.635, sum=10.635 (1)", - "tab": "Efficiency", - "score": 10.635314999999995 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=899.758, mean=899.758, max=899.758, sum=899.758 (1)", - "tab": "General information", - "score": 899.758 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.242, - "details": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.242 (1)", - "tab": 
"Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.609, mean=7.609, max=7.609, sum=7.609 (1)", - "tab": "Efficiency", - "score": 7.608704000000004 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=117.921, mean=117.921, max=117.921, sum=117.921 (1)", - "tab": "General information", - "score": 117.921 - }, - "Omni-MATH - # output tokens": { - "description": "min=649.195, mean=649.195, max=649.195, sum=649.195 (1)", - "tab": "General information", - "score": 649.195 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json b/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json deleted file mode 100644 index d63e271d1..000000000 --- a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-5-haiku-20241022/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Haiku 20241022", - "id": "anthropic/claude-3-5-haiku-20241022", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 6.973328374403875 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605, - "details": { - "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=5.171, mean=5.171, max=5.171, sum=5.171 (1)", - "tab": "Efficiency", - "score": 5.170877918004989 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": 
"General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=230.461, mean=230.461, max=230.461, sum=230.461 (1)", - "tab": "General information", - "score": 230.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=253.047, mean=253.047, max=253.047, sum=253.047 (1)", - "tab": "General information", - "score": 253.047 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363, - "details": { - "description": "min=0.363, mean=0.363, max=0.363, sum=0.363 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=5.33, mean=5.33, max=5.33, sum=5.33 (1)", - "tab": "Efficiency", - "score": 5.329682314877018 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=250.738, mean=250.738, max=250.738, sum=250.738 (1)", - "tab": "General information", - "score": 250.73766816143498 - }, - "GPQA - # output tokens": { - "description": "min=270.388, mean=270.388, max=270.388, sum=270.388 (1)", - "tab": "General information", - "score": 270.38789237668163 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.792, - "details": { - "description": "min=0.792, mean=0.792, max=0.792, sum=0.792 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=5.886, mean=5.886, max=5.886, sum=5.886 (1)", - "tab": "Efficiency", - "score": 5.885677124347793 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - 
"IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=273.985, mean=273.985, max=273.985, sum=273.985 (1)", - "tab": "General information", - "score": 273.9852125693161 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=0.76 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.629, mean=10.629, max=10.629, sum=10.629 (1)", - "tab": "Efficiency", - "score": 10.62865050649643 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=544.911, mean=544.911, max=544.911, sum=544.911 (1)", - "tab": "General information", - "score": 544.911 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.224, - "details": { - "description": "min=0.224, mean=0.224, max=0.224, sum=0.224 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.852, mean=7.852, max=7.852, sum=7.852 (1)", - "tab": "Efficiency", - "score": 7.851754008293152 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=409.742, mean=409.742, max=409.742, sum=409.742 (1)", - "tab": "General information", - "score": 409.742 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file 
diff --git a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json b/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json deleted file mode 100644 index c53a3aa66..000000000 --- a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-5-sonnet-20241022/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Sonnet 20241022", - "id": "anthropic/claude-3-5-sonnet-20241022", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 7.355400399849929 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=5.096, mean=5.096, max=5.096, sum=5.096 (1)", - "tab": "Efficiency", - "score": 5.096486385822296 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=230.461, mean=230.461, max=230.461, sum=230.461 (1)", - "tab": "General information", - "score": 230.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=212.233, mean=212.233, max=212.233, sum=212.233 (1)", - "tab": "General information", - "score": 212.233 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "details": { - "description": "min=0.565, mean=0.565, max=0.565, sum=0.565 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=6.262, mean=6.262, max=6.262, sum=6.262 (1)", - "tab": "Efficiency", - "score": 6.261580738251519 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=250.738, mean=250.738, max=250.738, sum=250.738 (1)", - "tab": "General information", - "score": 250.73766816143498 - }, - "GPQA - # output tokens": { - "description": "min=260.175, mean=260.175, max=260.175, sum=260.175 (1)", - "tab": "General information", - "score": 260.17488789237666 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.856, - "details": { - "description": "min=0.856, mean=0.856, max=0.856, sum=0.856 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=6.967, mean=6.967, max=6.967, sum=6.967 (1)", - "tab": "Efficiency", - "score": 6.966711103365293 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=299.843, mean=299.843, max=299.843, sum=299.843 (1)", - "tab": "General information", - "score": 299.84288354898337 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.792, - "details": { - 
"description": "min=0.792, mean=0.792, max=0.792, sum=0.792 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.864, mean=10.864, max=10.864, sum=10.864 (1)", - "tab": "Efficiency", - "score": 10.86402980184555 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=603.959, mean=603.959, max=603.959, sum=603.959 (1)", - "tab": "General information", - "score": 603.959 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276, - "details": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.588, mean=7.588, max=7.588, sum=7.588 (1)", - "tab": "Efficiency", - "score": 7.588193969964981 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=397.573, mean=397.573, max=397.573, sum=397.573 (1)", - "tab": "General information", - "score": 397.573 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json b/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json deleted file mode 100644 index 1f5c52f66..000000000 --- a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-7-sonnet-20250219/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.7 Sonnet 20250219", - "id": "anthropic/claude-3-7-sonnet-20250219", - "developer": "anthropic", 
- "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 9.05170552277221 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.784, - "details": { - "description": "min=0.784, mean=0.784, max=0.784, sum=0.784 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=4.744, mean=4.744, max=4.744, sum=4.744 (1)", - "tab": "Efficiency", - "score": 4.744252296209336 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=230.461, mean=230.461, max=230.461, sum=230.461 (1)", - "tab": "General information", - "score": 230.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=242.773, mean=242.773, max=242.773, sum=242.773 (1)", - "tab": "General information", - "score": 242.773 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.608, - "details": { - "description": "min=0.608, mean=0.608, max=0.608, sum=0.608 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=6.459, mean=6.459, max=6.459, sum=6.459 (1)", - "tab": "Efficiency", - "score": 6.4586481999923295 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # 
prompt tokens": { - "description": "min=250.738, mean=250.738, max=250.738, sum=250.738 (1)", - "tab": "General information", - "score": 250.73766816143498 - }, - "GPQA - # output tokens": { - "description": "min=312.666, mean=312.666, max=312.666, sum=312.666 (1)", - "tab": "General information", - "score": 312.6659192825112 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=0.834 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=8.075, mean=8.075, max=8.075, sum=8.075 (1)", - "tab": "Efficiency", - "score": 8.075105538870623 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=406.532, mean=406.532, max=406.532, sum=406.532 (1)", - "tab": "General information", - "score": 406.5323475046211 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.814, - "details": { - "description": "min=0.814, mean=0.814, max=0.814, sum=0.814 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=15.683, mean=15.683, max=15.683, sum=15.683 (1)", - "tab": "Efficiency", - "score": 15.682527210235596 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=862.287, mean=862.287, max=862.287, sum=862.287 (1)", - "tab": "General information", - "score": 862.287 - } - } - }, - "generation_config": { - 
"additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.33 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=10.298, mean=10.298, max=10.298, sum=10.298 (1)", - "tab": "Efficiency", - "score": 10.297994368553162 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=670.885, mean=670.885, max=670.885, sum=670.885 (1)", - "tab": "General information", - "score": 670.885 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json b/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json deleted file mode 100644 index da15e55a7..000000000 --- a/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-haiku-4-5-20251001/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 4.5 Haiku 20251001", - "id": "anthropic/claude-haiku-4-5-20251001", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 7.381503096938465 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=3.701, mean=3.701, max=3.701, sum=3.701 (1)", - "tab": "Efficiency", - "score": 3.7008020806312563 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)", - "tab": "General information", - "score": 252.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=374.129, mean=374.129, max=374.129, sum=374.129 (1)", - "tab": "General information", - "score": 374.129 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605, - "details": { - "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=5.102, mean=5.102, max=5.102, sum=5.102 (1)", - "tab": "Efficiency", - "score": 5.102193982611857 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)", - "tab": "General information", - "score": 272.73766816143495 - }, - "GPQA - # output tokens": { - "description": "min=524.525, mean=524.525, max=524.525, sum=524.525 (1)", - "tab": "General information", - "score": 524.5246636771301 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "details": { - "description": "min=0.801, mean=0.801, max=0.801, sum=0.801 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=4.355, mean=4.355, max=4.355, sum=4.355 (1)", - "tab": "Efficiency", - "score": 4.355410516372229 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=390.416, mean=390.416, max=390.416, sum=390.416 (1)", - "tab": "General information", - "score": 390.4158964879852 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.839, - "details": { - "description": "min=0.839, mean=0.839, max=0.839, sum=0.839 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=16.317, mean=16.317, max=16.317, sum=16.317 (1)", - "tab": "Efficiency", - "score": 16.317131044387818 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1835.337, mean=1835.337, max=1835.337, sum=1835.337 (1)", - "tab": "General information", - "score": 1835.337 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.561, mean=0.561, max=0.561, sum=0.561 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.432, mean=7.432, max=7.432, sum=7.432 (1)", - "tab": "Efficiency", - "score": 7.431977860689163 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": 
"General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=937.799, mean=937.799, max=937.799, sum=937.799 (1)", - "tab": "General information", - "score": 937.799 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json b/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json deleted file mode 100644 index c554c6a65..000000000 --- a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514-thinking-10k/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 4 Opus 20250514, extended thinking", - "id": "anthropic/claude-opus-4-20250514-thinking-10k", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 52.297304217949794 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "details": { - "description": "min=0.875, mean=0.875, max=0.875, sum=0.875 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=28.466, mean=28.466, max=28.466, sum=28.466 (1)", - "tab": "Efficiency", - "score": 28.46593898815197 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)", - "tab": "General information", - "score": 252.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=272.871, mean=272.871, max=272.871, sum=272.871 (1)", - "tab": "General information", - "score": 272.871 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=0.709 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=45.529, mean=45.529, max=45.529, sum=45.529 (1)", - "tab": "Efficiency", - "score": 45.52923426562793 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)", - "tab": "General information", - "score": 272.73766816143495 - }, - "GPQA - # output tokens": { - "description": "min=343.762, mean=343.762, max=343.762, sum=343.762 (1)", - "tab": "General information", - "score": 343.76233183856505 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=0.849 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=22.453, mean=22.453, max=22.453, sum=22.453 (1)", - "tab": "Efficiency", - "score": 22.45251508421368 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output 
tokens": { - "description": "min=403.745, mean=403.745, max=403.745, sum=403.745 (1)", - "tab": "General information", - "score": 403.74491682070243 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=0.852 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=50.19, mean=50.19, max=50.19, sum=50.19 (1)", - "tab": "Efficiency", - "score": 50.19046350765228 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1195.769, mean=1195.769, max=1195.769, sum=1195.769 (1)", - "tab": "General information", - "score": 1195.769 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - "details": { - "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=114.848, mean=114.848, max=114.848, sum=114.848 (1)", - "tab": "Efficiency", - "score": 114.84836924410313 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=691.066, mean=691.066, max=691.066, sum=691.066 (1)", - "tab": "General information", - "score": 691.066 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json 
b/data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json deleted file mode 100644 index 240e9ebf4..000000000 --- a/data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 4 Opus 20250514", - "id": "anthropic/claude-opus-4-20250514", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 20.48127702555515 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=12.63, mean=12.63, max=12.63, sum=12.63 (1)", - "tab": "Efficiency", - "score": 12.630421590518665 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=230.461, mean=230.461, max=230.461, sum=230.461 (1)", - "tab": "General information", - "score": 230.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=344.469, mean=344.469, max=344.469, sum=344.469 (1)", - "tab": "General information", - "score": 344.469 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.666, - "details": { - "description": "min=0.666, mean=0.666, max=0.666, sum=0.666 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=16.325, mean=16.325, max=16.325, sum=16.325 (1)", - "tab": "Efficiency", - "score": 16.325411326249803 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=250.738, mean=250.738, max=250.738, sum=250.738 (1)", - "tab": "General information", - "score": 250.73766816143498 - }, - "GPQA - # output tokens": { - "description": "min=453.143, mean=453.143, max=453.143, sum=453.143 (1)", - "tab": "General information", - "score": 453.1434977578475 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.918, - "details": { - "description": "min=0.918, mean=0.918, max=0.918, sum=0.918 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=16.576, mean=16.576, max=16.576, sum=16.576 (1)", - "tab": "Efficiency", - "score": 16.576411149939712 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=422.774, mean=422.774, max=422.774, sum=422.774 (1)", - "tab": "General information", - "score": 422.7744916820702 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=0.833 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=29.848, mean=29.848, max=29.848, sum=29.848 (1)", - "tab": "Efficiency", - "score": 
29.848318881988526 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=936.927, mean=936.927, max=936.927, sum=936.927 (1)", - "tab": "General information", - "score": 936.927 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511, - "details": { - "description": "min=0.511, mean=0.511, max=0.511, sum=0.511 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=27.026, mean=27.026, max=27.026, sum=27.026 (1)", - "tab": "Efficiency", - "score": 27.025822179079057 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=893.894, mean=893.894, max=893.894, sum=893.894 (1)", - "tab": "General information", - "score": 893.894 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json deleted file mode 100644 index ecc6c0f0a..000000000 --- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514-thinking-10k/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 4 Sonnet 20250514, extended thinking", - "id": "anthropic/claude-sonnet-4-20250514-thinking-10k", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", 
- "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.766, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 38.96330262736815 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=0.843 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=23.165, mean=23.165, max=23.165, sum=23.165 (1)", - "tab": "Efficiency", - "score": 23.16487550187111 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)", - "tab": "General information", - "score": 252.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=325.194, mean=325.194, max=325.194, sum=325.194 (1)", - "tab": "General information", - "score": 325.194 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=0.706 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=38.16, mean=38.16, max=38.16, sum=38.16 (1)", - "tab": "Efficiency", - "score": 38.15993662211927 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)", - "tab": "General information", - "score": 272.73766816143495 - }, 
- "GPQA - # output tokens": { - "description": "min=414.928, mean=414.928, max=414.928, sum=414.928 (1)", - "tab": "General information", - "score": 414.92825112107624 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=0.84 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=12.654, mean=12.654, max=12.654, sum=12.654 (1)", - "tab": "Efficiency", - "score": 12.65442304822742 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=380.645, mean=380.645, max=380.645, sum=380.645 (1)", - "tab": "General information", - "score": 380.64510166358593 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=0.838 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=32.933, mean=32.933, max=32.933, sum=32.933 (1)", - "tab": "Efficiency", - "score": 32.93274651098251 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1274.627, mean=1274.627, max=1274.627, sum=1274.627 (1)", - "tab": "General information", - "score": 1274.627 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602, - "details": { - "description": "min=0.602, mean=0.602, max=0.602, sum=0.602 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=87.905, mean=87.905, max=87.905, sum=87.905 (1)", - "tab": "Efficiency", - "score": 87.90453145364046 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=728.241, mean=728.241, max=728.241, sum=728.241 (1)", - "tab": "General information", - "score": 728.241 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json deleted file mode 100644 index b4413ccdd..000000000 --- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 4 Sonnet 20250514", - "id": "anthropic/claude-sonnet-4-20250514", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 15.534070909101748 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=0.843 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=9.974, mean=9.974, max=9.974, sum=9.974 (1)", - "tab": "Efficiency", - "score": 9.973703570604325 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=230.461, mean=230.461, max=230.461, sum=230.461 (1)", - "tab": "General information", - "score": 230.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=402.003, mean=402.003, max=402.003, sum=402.003 (1)", - "tab": "General information", - "score": 402.003 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=0.643 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=13.452, mean=13.452, max=13.452, sum=13.452 (1)", - "tab": "Efficiency", - "score": 13.452103998094396 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=250.738, mean=250.738, max=250.738, sum=250.738 (1)", - "tab": "General information", - "score": 250.73766816143498 - }, - "GPQA - # output tokens": { - "description": "min=543.482, mean=543.482, max=543.482, sum=543.482 (1)", - "tab": "General information", - "score": 543.4820627802691 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.839, - "details": { - "description": "min=0.839, mean=0.839, max=0.839, sum=0.839 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=10.416, mean=10.416, max=10.416, 
sum=10.416 (1)", - "tab": "Efficiency", - "score": 10.416161362653298 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=398.978, mean=398.978, max=398.978, sum=398.978 (1)", - "tab": "General information", - "score": 398.9778188539741 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=0.825 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=23.404, mean=23.404, max=23.404, sum=23.404 (1)", - "tab": "Efficiency", - "score": 23.403768165826797 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=954.675, mean=954.675, max=954.675, sum=954.675 (1)", - "tab": "General information", - "score": 954.675 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512, - "details": { - "description": "min=0.512, mean=0.512, max=0.512, sum=0.512 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=20.425, mean=20.425, max=20.425, sum=20.425 (1)", - "tab": "Efficiency", - "score": 20.424617448329926 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - 
"tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=925.604, mean=925.604, max=925.604, sum=925.604 (1)", - "tab": "General information", - "score": 925.604 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json b/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json deleted file mode 100644 index e0991c0d9..000000000 --- a/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-5-20250929/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 4.5 Sonnet 20250929", - "id": "anthropic/claude-sonnet-4-5-20250929", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 17.536448448412127 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=0.869 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=9.03, mean=9.03, max=9.03, sum=9.03 (1)", - "tab": "Efficiency", - "score": 9.029817205530268 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)", - "tab": "General information", - "score": 252.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=392.292, 
mean=392.292, max=392.292, sum=392.292 (1)", - "tab": "General information", - "score": 392.292 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.686, mean=0.686, max=0.686, sum=0.686 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=12.414, mean=12.414, max=12.414, sum=12.414 (1)", - "tab": "Efficiency", - "score": 12.414452127318263 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)", - "tab": "General information", - "score": 272.73766816143495 - }, - "GPQA - # output tokens": { - "description": "min=544.215, mean=544.215, max=544.215, sum=544.215 (1)", - "tab": "General information", - "score": 544.2152466367713 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=0.85 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=10.904, mean=10.904, max=10.904, sum=10.904 (1)", - "tab": "Efficiency", - "score": 10.90394415211986 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=414.632, mean=414.632, max=414.632, sum=414.632 (1)", - "tab": "General information", - "score": 414.63216266173754 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": 
[ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=38.544, mean=38.544, max=38.544, sum=38.544 (1)", - "tab": "Efficiency", - "score": 38.54364204096484 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1804.604, mean=1804.604, max=1804.604, sum=1804.604 (1)", - "tab": "General information", - "score": 1804.604 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553, - "details": { - "description": "min=0.553, mean=0.553, max=0.553, sum=0.553 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=16.79, mean=16.79, max=16.79, sum=16.79 (1)", - "tab": "Efficiency", - "score": 16.790386716127397 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=892.774, mean=892.774, max=892.774, sum=892.774 (1)", - "tab": "General information", - "score": 892.774 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json b/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json deleted file mode 100644 index 682cc94cc..000000000 --- a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-r1-0528/1770835969.095764", - 
"retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-0528", - "id": "deepseek-ai/deepseek-r1-0528", - "developer": "deepseek-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.699, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 115.28182297150872 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=0.793 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=91.015, mean=91.015, max=91.015, sum=91.015 (1)", - "tab": "Efficiency", - "score": 91.01470815229416 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=248.757, mean=248.757, max=248.757, sum=248.757 (1)", - "tab": "General information", - "score": 248.757 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.666, - "details": { - "description": "min=0.666, mean=0.666, max=0.666, sum=0.666 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=155.439, mean=155.439, max=155.439, sum=155.439 (1)", - "tab": "Efficiency", - "score": 155.438512681311 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 
(1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=261.59, mean=261.59, max=261.59, sum=261.59 (1)", - "tab": "General information", - "score": 261.5896860986547 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.784, - "details": { - "description": "min=0.784, mean=0.784, max=0.784, sum=0.784 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=33.752, mean=33.752, max=33.752, sum=33.752 (1)", - "tab": "Efficiency", - "score": 33.75197721056489 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.209, mean=46.209, max=46.209, sum=46.209 (1)", - "tab": "General information", - "score": 46.208872458410355 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=87.848, mean=87.848, max=87.848, sum=87.848 (1)", - "tab": "Efficiency", - "score": 87.84843708276749 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": 
"General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.424, - "details": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.424 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=208.355, mean=208.355, max=208.355, sum=208.355 (1)", - "tab": "Efficiency", - "score": 208.35547973060608 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=107.102, mean=107.102, max=107.102, sum=107.102 (1)", - "tab": "General information", - "score": 107.102 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json b/data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json deleted file mode 100644 index 3b034de70..000000000 --- a/data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek v3", - "id": "deepseek-ai/deepseek-v3", - "developer": "deepseek-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.665, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 71.88858741677622 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", 
- "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.723, mean=0.723, max=0.723, sum=0.723 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=50.311, mean=50.311, max=50.311, sum=50.311 (1)", - "tab": "Efficiency", - "score": 50.3109582388401 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=227.757, mean=227.757, max=227.757, sum=227.757 (1)", - "tab": "General information", - "score": 227.757 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538, - "details": { - "description": "min=0.538, mean=0.538, max=0.538, sum=0.538 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=74.372, mean=74.372, max=74.372, sum=74.372 (1)", - "tab": "Efficiency", - "score": 74.37158904909553 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=240.59, mean=240.59, max=240.59, sum=240.59 (1)", - "tab": "General information", - "score": 240.5896860986547 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.832, - "details": { - "description": "min=0.832, mean=0.832, max=0.832, sum=0.832 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=47.879, mean=47.879, max=47.879, sum=47.879 (1)", - "tab": "Efficiency", - "score": 47.878683835433286 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.209, mean=46.209, max=46.209, sum=46.209 (1)", - "tab": "General information", - "score": 46.208872458410355 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=0.831 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=134.163, mean=134.163, max=134.163, sum=134.163 (1)", - "tab": "Efficiency", - "score": 134.1626427116394 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403, - "details": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.403 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=52.719, mean=52.719, max=52.719, sum=52.719 (1)", - "tab": "Efficiency", - "score": 52.71906324887276 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 
(1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=107.102, mean=107.102, max=107.102, sum=107.102 (1)", - "tab": "General information", - "score": 107.102 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json b/data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json deleted file mode 100644 index 7d4281de4..000000000 --- a/data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-1.5-flash-002/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Flash 002", - "id": "google/gemini-1.5-flash-002", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 3.3804760044252675 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.678, - "details": { - "description": "min=0.678, mean=0.678, max=0.678, sum=0.678 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=1.799, mean=1.799, max=1.799, sum=1.799 (1)", - "tab": "Efficiency", - "score": 1.799316755771637 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=242.673, mean=242.673, max=242.673, sum=242.673 (1)", - "tab": "General information", - "score": 242.673 - }, - 
"MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437, - "details": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.437 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=2.79, mean=2.79, max=2.79, sum=2.79 (1)", - "tab": "Efficiency", - "score": 2.7900896457278677 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=252.735, mean=252.735, max=252.735, sum=252.735 (1)", - "tab": "General information", - "score": 252.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=0.831 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=2.302, mean=2.302, max=2.302, sum=2.302 (1)", - "tab": "Efficiency", - "score": 2.302485716320891 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.792, - "details": { - "description": "min=0.792, mean=0.792, max=0.792, sum=0.792 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=5.328, mean=5.328, max=5.328, sum=5.328 (1)", - "tab": "Efficiency", - "score": 5.327828770410083 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.305, - "details": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.305 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=4.683, mean=4.683, max=4.683, sum=4.683 (1)", - "tab": "Efficiency", - "score": 4.682659133895859 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json b/data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json deleted file mode 100644 index 3c438fd59..000000000 --- a/data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-1.5-pro-002/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": 
"helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Pro 002", - "id": "google/gemini-1.5-pro-002", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.657, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 9.106040294719884 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.737, - "details": { - "description": "min=0.737, mean=0.737, max=0.737, sum=0.737 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=5.124, mean=5.124, max=5.124, sum=5.124 (1)", - "tab": "Efficiency", - "score": 5.123855731964111 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=242.673, mean=242.673, max=242.673, sum=242.673 (1)", - "tab": "General information", - "score": 242.673 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534, - "details": { - "description": "min=0.534, mean=0.534, max=0.534, sum=0.534 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=7.392, mean=7.392, max=7.392, sum=7.392 (1)", - "tab": "Efficiency", - "score": 7.392140488988081 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": 
"min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=252.735, mean=252.735, max=252.735, sum=252.735 (1)", - "tab": "General information", - "score": 252.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.837, - "details": { - "description": "min=0.837, mean=0.837, max=0.837, sum=0.837 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=6.353, mean=6.353, max=6.353, sum=6.353 (1)", - "tab": "Efficiency", - "score": 6.352943865957631 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "description": "min=0.813, mean=0.813, max=0.813, sum=0.813 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=17.527, mean=17.527, max=17.527, sum=17.527 (1)", - "tab": "Efficiency", - "score": 17.52709009152358 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, 
mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364, - "details": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.364 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=9.134, mean=9.134, max=9.134, sum=9.134 (1)", - "tab": "Efficiency", - "score": 9.134171295166016 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json b/data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json deleted file mode 100644 index 7f589b967..000000000 --- a/data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-001/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 2.0 Flash", - "id": "google/gemini-2.0-flash-001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 5.700146694170831 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.737, - "details": { - "description": "min=0.737, mean=0.737, max=0.737, sum=0.737 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=3.221, mean=3.221, max=3.221, sum=3.221 (1)", - "tab": "Efficiency", - "score": 3.221250217437744 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=242.673, mean=242.673, max=242.673, sum=242.673 (1)", - "tab": "General information", - "score": 242.673 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.556, - "details": { - "description": "min=0.556, mean=0.556, max=0.556, sum=0.556 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=4.919, mean=4.919, max=4.919, sum=4.919 (1)", - "tab": "Efficiency", - "score": 4.919003446005919 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=252.735, mean=252.735, max=252.735, sum=252.735 (1)", - "tab": "General information", - "score": 252.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - 
"details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=0.841 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.723, mean=3.723, max=3.723, sum=3.723 (1)", - "tab": "Efficiency", - "score": 3.7232056717334965 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=0.8 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=9.27, mean=9.27, max=9.27, sum=9.27 (1)", - "tab": "Efficiency", - "score": 9.270071518985407 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459, - "details": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.459 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.367, mean=7.367, max=7.367, sum=7.367 (1)", - "tab": "Efficiency", - "score": 7.367202616691589 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - 
"score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json b/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json deleted file mode 100644 index 0376cdf40..000000000 --- a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-lite-preview-02-05/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 2.0 Flash Lite 02-05 preview", - "id": "google/gemini-2.0-flash-lite-preview-02-05", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 5.788722673180064 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=0.72 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=3.357, mean=3.357, max=3.357, sum=3.357 (1)", - "tab": "Efficiency", - "score": 3.356641344547272 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=242.673, mean=242.673, max=242.673, sum=242.673 (1)", - "tab": "General 
information", - "score": 242.673 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=5.373, mean=5.373, max=5.373, sum=5.373 (1)", - "tab": "Efficiency", - "score": 5.372664878186623 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=252.735, mean=252.735, max=252.735, sum=252.735 (1)", - "tab": "General information", - "score": 252.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=0.824 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.463, mean=3.463, max=3.463, sum=3.463 (1)", - "tab": "Efficiency", - "score": 3.4628667553780037 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=0.79 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=8.804, mean=8.804, max=8.804, sum=8.804 (1)", - "tab": "Efficiency", - "score": 8.803904922309524 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374, - "details": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.374 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.948, mean=7.948, max=7.948, sum=7.948 (1)", - "tab": "Efficiency", - "score": 7.947535465478897 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json b/data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json deleted file mode 100644 index 600681fbb..000000000 --- a/data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-lite/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": 
"helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 2.5 Flash-Lite", - "id": "google/gemini-2.5-flash-lite", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 8.113822886648412 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537, - "details": { - "description": "min=0.537, mean=0.537, max=0.537, sum=0.537 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=4.423, mean=4.423, max=4.423, sum=4.423 (1)", - "tab": "Efficiency", - "score": 4.423401823997498 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=263.673, mean=263.673, max=263.673, sum=263.673 (1)", - "tab": "General information", - "score": 263.673 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.309, - "details": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.309 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=11.88, mean=11.88, max=11.88, sum=11.88 (1)", - "tab": "Efficiency", - "score": 11.880136902022254 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - 
"description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=273.735, mean=273.735, max=273.735, sum=273.735 (1)", - "tab": "General information", - "score": 273.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=0.81 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=1.833, mean=1.833, max=1.833, sum=1.833 (1)", - "tab": "Efficiency", - "score": 1.833447342659321 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.818, - "details": { - "description": "min=0.818, mean=0.818, max=0.818, sum=0.818 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=7.111, mean=7.111, max=7.111, sum=7.111 (1)", - "tab": "Efficiency", - "score": 7.111379201173782 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": 
"min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.48 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=15.321, mean=15.321, max=15.321, sum=15.321 (1)", - "tab": "Efficiency", - "score": 15.320749163389205 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json b/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json deleted file mode 100644 index 221dc7a91..000000000 --- a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-preview-04-17/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 2.5 Flash 04-17 preview", - "id": "google/gemini-2.5-flash-preview-04-17", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.626, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 31.900818991762513 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.639, - "details": { - "description": "min=0.639, mean=0.639, max=0.639, sum=0.639 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=17.353, mean=17.353, max=17.353, sum=17.353 (1)", - "tab": "Efficiency", - "score": 17.352934203863143 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=263.673, mean=263.673, max=263.673, sum=263.673 (1)", - "tab": "General information", - "score": 263.673 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=38.125, mean=38.125, max=38.125, sum=38.125 (1)", - "tab": "Efficiency", - "score": 38.125050564562336 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=273.735, mean=273.735, max=273.735, sum=273.735 (1)", - "tab": "General information", - "score": 273.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - 
"details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=0.898 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=11.266, mean=11.266, max=11.266, sum=11.266 (1)", - "tab": "Efficiency", - "score": 11.266106982142837 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=32.789, mean=32.789, max=32.789, sum=32.789 (1)", - "tab": "Efficiency", - "score": 32.78856403473391 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.384, - "details": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.384 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=59.971, mean=59.971, max=59.971, sum=59.971 (1)", - "tab": "Efficiency", - "score": 59.97143917351036 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - 
"tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json b/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json deleted file mode 100644 index 355cd3bc1..000000000 --- a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-pro-preview-03-25/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 2.5 Pro 03-25 preview", - "id": "google/gemini-2.5-pro-preview-03-25", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 41.707859761088116 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=0.863 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=22.301, mean=22.301, max=22.301, sum=22.301 (1)", - "tab": "Efficiency", - "score": 22.301176882605677 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=263.673, mean=263.673, max=263.673, sum=263.673 (1)", - "tab": "General 
information", - "score": 263.673 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.749, - "details": { - "description": "min=0.749, mean=0.749, max=0.749, sum=0.749 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=43.194, mean=43.194, max=43.194, sum=43.194 (1)", - "tab": "Efficiency", - "score": 43.19425330858552 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=273.735, mean=273.735, max=273.735, sum=273.735 (1)", - "tab": "General information", - "score": 273.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=0.84 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=15.978, mean=15.978, max=15.978, sum=15.978 (1)", - "tab": "Efficiency", - "score": 15.978427228116725 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ 
- "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=0.857 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=41.295, mean=41.295, max=41.295, sum=41.295 (1)", - "tab": "Efficiency", - "score": 41.2954368838362 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416, - "details": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.416 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=85.77, mean=85.77, max=85.77, sum=85.77 (1)", - "tab": "Efficiency", - "score": 85.77000450229644 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json b/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json deleted file mode 100644 index d3ecb3ebb..000000000 --- a/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-3-pro-preview/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": 
"helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 3 Pro Preview", - "id": "google/gemini-3-pro-preview", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.799, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 50.969324812798575 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=0.903 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=34.903, mean=34.903, max=34.903, sum=34.903 (1)", - "tab": "Efficiency", - "score": 34.903078527212145 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=263.673, mean=263.673, max=263.673, sum=263.673 (1)", - "tab": "General information", - "score": 263.673 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.803, - "details": { - "description": "min=0.803, mean=0.803, max=0.803, sum=0.803 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=69.164, mean=69.164, max=69.164, sum=69.164 (1)", - "tab": "Efficiency", - "score": 69.16407415364355 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - 
"description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=273.735, mean=273.735, max=273.735, sum=273.735 (1)", - "tab": "General information", - "score": 273.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=0.876 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=18.201, mean=18.201, max=18.201, sum=18.201 (1)", - "tab": "Efficiency", - "score": 18.200553727458452 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=37.094, mean=37.094, max=37.094, sum=37.094 (1)", - "tab": "Efficiency", - "score": 37.09404513451669 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555, - "details": { - "description": "min=0.555, mean=0.555, max=0.555, sum=0.555 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=95.485, mean=95.485, max=95.485, sum=95.485 (1)", - "tab": "Efficiency", - "score": 95.48487252116203 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json b/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json deleted file mode 100644 index 869902b9d..000000000 --- a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/ibm_granite-3.3-8b-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IBM Granite 3.3 8B Instruct", - "id": "ibm/granite-3.3-8b-instruct", - "developer": "ibm", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 9.029614260338473 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.343, - "details": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.343 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=5.079, mean=5.079, max=5.079, sum=5.079 (1)", - "tab": "Efficiency", - "score": 5.079014162302017 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=266.391, mean=266.391, max=266.391, sum=266.391 (1)", - "tab": "General information", - "score": 266.391 - }, - "MMLU-Pro - # output tokens": { - "description": "min=364.376, mean=364.376, max=364.376, sum=364.376 (1)", - "tab": "General information", - "score": 364.376 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325, - "details": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.325 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=6.422, mean=6.422, max=6.422, sum=6.422 (1)", - "tab": "Efficiency", - "score": 6.421983559569971 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=281.265, mean=281.265, max=281.265, sum=281.265 (1)", - "tab": "General information", - "score": 281.2645739910314 - }, - "GPQA - # output tokens": { - "description": "min=465.336, mean=465.336, max=465.336, sum=465.336 (1)", - "tab": "General information", - "score": 465.33632286995515 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.729, - "details": { - "description": "min=0.729, mean=0.729, max=0.729, sum=0.729 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=6.574, mean=6.574, max=6.574, sum=6.574 (1)", - "tab": "Efficiency", - "score": 6.573940407546743 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=51.534, mean=51.534, max=51.534, sum=51.534 (1)", - "tab": "General information", - "score": 51.53419593345656 - }, - "IFEval - # output tokens": { - "description": "min=482.37, mean=482.37, max=482.37, sum=482.37 (1)", - "tab": "General information", - "score": 482.36968576709796 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=0.741 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.962, mean=10.962, max=10.962, sum=10.962 (1)", - "tab": "Efficiency", - "score": 10.962031789541244 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=784.893, mean=784.893, max=784.893, sum=784.893 (1)", - "tab": "General information", - "score": 784.893 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176, - "details": { - "description": "min=0.176, mean=0.176, max=0.176, sum=0.176 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=16.111, mean=16.111, max=16.111, sum=16.111 (1)", - "tab": "Efficiency", - "score": 16.111101382732393 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General 
information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=118.438, mean=118.438, max=118.438, sum=118.438 (1)", - "tab": "General information", - "score": 118.438 - }, - "Omni-MATH - # output tokens": { - "description": "min=1162.421, mean=1162.421, max=1162.421, sum=1162.421 (1)", - "tab": "General information", - "score": 1162.421 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json b/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json deleted file mode 100644 index 03bc0f0f8..000000000 --- a/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/ibm_granite-4.0-h-small/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IBM Granite 4.0 Small", - "id": "ibm/granite-4.0-h-small", - "developer": "ibm", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.575, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 21.31162992088884 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.569, - "details": { - "description": "min=0.569, mean=0.569, max=0.569, sum=0.569 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=12.071, mean=12.071, max=12.071, sum=12.071 (1)", - "tab": "Efficiency", - "score": 12.070928404092788 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - 
"description": "min=288.391, mean=288.391, max=288.391, sum=288.391 (1)", - "tab": "General information", - "score": 288.391 - }, - "MMLU-Pro - # output tokens": { - "description": "min=372.93, mean=372.93, max=372.93, sum=372.93 (1)", - "tab": "General information", - "score": 372.93 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383, - "details": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.383 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=17.606, mean=17.606, max=17.606, sum=17.606 (1)", - "tab": "Efficiency", - "score": 17.606201725690354 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=303.265, mean=303.265, max=303.265, sum=303.265 (1)", - "tab": "General information", - "score": 303.2645739910314 - }, - "GPQA - # output tokens": { - "description": "min=439.648, mean=439.648, max=439.648, sum=439.648 (1)", - "tab": "General information", - "score": 439.6479820627803 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=0.89 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=13.366, mean=13.366, max=13.366, sum=13.366 (1)", - "tab": "Efficiency", - "score": 13.366226098453712 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=51.534, mean=51.534, max=51.534, sum=51.534 (1)", - "tab": "General information", - "score": 51.53419593345656 - }, - "IFEval - # output tokens": { - "description": "min=494.717, mean=494.717, max=494.717, sum=494.717 (1)", - "tab": "General information", - "score": 494.7171903881701 - } - 
} - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.739, mean=0.739, max=0.739, sum=0.739 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=30.807, mean=30.807, max=30.807, sum=30.807 (1)", - "tab": "Efficiency", - "score": 30.80672695994377 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=996.159, mean=996.159, max=996.159, sum=996.159 (1)", - "tab": "General information", - "score": 996.159 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.296, - "details": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.296 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=32.708, mean=32.708, max=32.708, sum=32.708 (1)", - "tab": "Efficiency", - "score": 32.70806641626358 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=118.438, mean=118.438, max=118.438, sum=118.438 (1)", - "tab": "General information", - "score": 118.438 - }, - "Omni-MATH - # output tokens": { - "description": "min=1020.51, mean=1020.51, max=1020.51, sum=1020.51 (1)", - "tab": "General information", - "score": 1020.51 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json b/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json deleted file mode 100644 index 399dbb1e3..000000000 --- a/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json 
+++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/ibm_granite-4.0-micro/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IBM Granite 4.0 Micro", - "id": "ibm/granite-4.0-micro", - "developer": "ibm", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 5.725128505637726 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395, - "details": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.395 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=3.135, mean=3.135, max=3.135, sum=3.135 (1)", - "tab": "Efficiency", - "score": 3.1348352246284485 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=288.391, mean=288.391, max=288.391, sum=288.391 (1)", - "tab": "General information", - "score": 288.391 - }, - "MMLU-Pro - # output tokens": { - "description": "min=325.255, mean=325.255, max=325.255, sum=325.255 (1)", - "tab": "General information", - "score": 325.255 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307, - "details": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.307 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=3.075, mean=3.075, 
max=3.075, sum=3.075 (1)", - "tab": "Efficiency", - "score": 3.075281912970436 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=303.265, mean=303.265, max=303.265, sum=303.265 (1)", - "tab": "General information", - "score": 303.2645739910314 - }, - "GPQA - # output tokens": { - "description": "min=337.417, mean=337.417, max=337.417, sum=337.417 (1)", - "tab": "General information", - "score": 337.4170403587444 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=0.849 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=4.58, mean=4.58, max=4.58, sum=4.58 (1)", - "tab": "Efficiency", - "score": 4.580414981806785 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=51.534, mean=51.534, max=51.534, sum=51.534 (1)", - "tab": "General information", - "score": 51.53419593345656 - }, - "IFEval - # output tokens": { - "description": "min=497.8, mean=497.8, max=497.8, sum=497.8 (1)", - "tab": "General information", - "score": 497.8003696857671 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=8.161, mean=8.161, max=8.161, sum=8.161 (1)", - "tab": "Efficiency", - "score": 8.160923891305924 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1037.706, mean=1037.706, max=1037.706, sum=1037.706 (1)", - "tab": "General information", - "score": 1037.706 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.209, - "details": { - "description": "min=0.209, mean=0.209, max=0.209, sum=0.209 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=9.674, mean=9.674, max=9.674, sum=9.674 (1)", - "tab": "Efficiency", - "score": 9.674186517477036 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=118.438, mean=118.438, max=118.438, sum=118.438 (1)", - "tab": "General information", - "score": 118.438 - }, - "Omni-MATH - # output tokens": { - "description": "min=1145.889, mean=1145.889, max=1145.889, sum=1145.889 (1)", - "tab": "General information", - "score": 1145.889 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json b/data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json deleted file mode 100644 index 736686c13..000000000 --- a/data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json +++ /dev/null @@ -1,352 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Marin 8B Instruct", - "id": "marin-community/marin-8b-instruct", - "developer": "marin-community", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.325, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 118.55196213968559 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.188, - "details": { - "description": "min=0.188, mean=0.188, max=0.188, sum=0.188 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=94.096, mean=94.096, max=94.096, sum=94.096 (1)", - "tab": "Efficiency", - "score": 94.0957455046177 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.366, mean=228.366, max=228.366, sum=228.366 (1)", - "tab": "General information", - "score": 228.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=539.21, mean=539.21, max=539.21, sum=539.21 (1)", - "tab": "General information", - "score": 539.21 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.168, - "details": { - "description": "min=0.168, mean=0.168, max=0.168, sum=0.168 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=123.019, mean=123.019, max=123.019, sum=123.019 (1)", - "tab": "Efficiency", - "score": 123.0189983149815 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "General information", - "score": 0.002242152466367713 - }, - "GPQA - # prompt tokens": { - "description": "min=247.173, mean=247.173, max=247.173, sum=247.173 (1)", - "tab": "General information", - "score": 247.1726457399103 - }, - "GPQA - # output tokens": { - "description": "min=707.953, mean=707.953, max=707.953, sum=707.953 (1)", - "tab": "General information", - "score": 707.9529147982063 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - 
"num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.632, - "details": { - "description": "min=0.632, mean=0.632, max=0.632, sum=0.632 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=88.889, mean=88.889, max=88.889, sum=88.889 (1)", - "tab": "Efficiency", - "score": 88.88931880596606 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=516.492, mean=516.492, max=516.492, sum=516.492 (1)", - "tab": "General information", - "score": 516.4916820702402 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.477, - "details": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.477 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=146.873, mean=146.873, max=146.873, sum=146.873 (1)", - "tab": "Efficiency", - "score": 146.8726548871994 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=818.678, mean=818.678, max=818.678, sum=818.678 (1)", - "tab": "General information", - "score": 818.678 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.16, - "details": { - "description": "min=0.16, mean=0.16, max=0.16, sum=0.16 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=139.883, mean=139.883, max=139.883, sum=139.883 (1)", - "tab": "Efficiency", - "score": 139.88309318566323 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=108.784, mean=108.784, max=108.784, sum=108.784 (1)", - "tab": "General information", - "score": 108.784 - }, - "Omni-MATH - # output tokens": { - "description": "min=808.178, mean=808.178, max=808.178, sum=808.178 (1)", - "tab": "General information", - "score": 808.178 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json b/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json deleted file mode 100644 index 4dd5465a5..000000000 --- a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-405b-instruct-turbo/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 405B", - "id": "meta/llama-3.1-405b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 9.16102940672383 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.723, mean=0.723, max=0.723, sum=0.723 (1)", - "tab": 
"Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=5.795, mean=5.795, max=5.795, sum=5.795 (1)", - "tab": "Efficiency", - "score": 5.794888144493103 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.366, mean=228.366, max=228.366, sum=228.366 (1)", - "tab": "General information", - "score": 228.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=376.289, mean=376.289, max=376.289, sum=376.289 (1)", - "tab": "General information", - "score": 376.289 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.522, - "details": { - "description": "min=0.522, mean=0.522, max=0.522, sum=0.522 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=9.197, mean=9.197, max=9.197, sum=9.197 (1)", - "tab": "Efficiency", - "score": 9.197324877362615 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.886, mean=248.886, max=248.886, sum=248.886 (1)", - "tab": "General information", - "score": 248.88565022421525 - }, - "GPQA - # output tokens": { - "description": "min=592.928, mean=592.928, max=592.928, sum=592.928 (1)", - "tab": "General information", - "score": 592.9282511210762 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=4.572, mean=4.572, max=4.572, sum=4.572 (1)", - "tab": "Efficiency", - "score": 4.571529605692724 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General 
information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=358.067, mean=358.067, max=358.067, sum=358.067 (1)", - "tab": "General information", - "score": 358.06654343807764 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=15.654, mean=15.654, max=15.654, sum=15.654 (1)", - "tab": "Efficiency", - "score": 15.653513952493668 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=773.114, mean=773.114, max=773.114, sum=773.114 (1)", - "tab": "General information", - "score": 773.114 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.249, - "details": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.249 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=10.588, mean=10.588, max=10.588, sum=10.588 (1)", - "tab": "Efficiency", - "score": 10.587890453577042 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.708, mean=109.708, max=109.708, sum=109.708 (1)", - "tab": "General 
information", - "score": 109.708 - }, - "Omni-MATH - # output tokens": { - "description": "min=906.902, mean=906.902, max=906.902, sum=906.902 (1)", - "tab": "General information", - "score": 906.902 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json b/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json deleted file mode 100644 index 407242cbb..000000000 --- a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-70b-instruct-turbo/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 70B", - "id": "meta/llama-3.1-70b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.574, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 4.2482479944372376 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653, - "details": { - "description": "min=0.653, mean=0.653, max=0.653, sum=0.653 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=2.732, mean=2.732, max=2.732, sum=2.732 (1)", - "tab": "Efficiency", - "score": 2.7317132804393767 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.366, mean=228.366, max=228.366, sum=228.366 (1)", - "tab": "General information", - "score": 228.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=326.226, mean=326.226, max=326.226, sum=326.226 (1)", - "tab": "General information", - "score": 326.226 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - 
"use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426, - "details": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.426 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=6.095, mean=6.095, max=6.095, sum=6.095 (1)", - "tab": "Efficiency", - "score": 6.0952357684550265 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.886, mean=248.886, max=248.886, sum=248.886 (1)", - "tab": "General information", - "score": 248.88565022421525 - }, - "GPQA - # output tokens": { - "description": "min=491.435, mean=491.435, max=491.435, sum=491.435 (1)", - "tab": "General information", - "score": 491.43497757847535 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.821, - "details": { - "description": "min=0.821, mean=0.821, max=0.821, sum=0.821 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=2.622, mean=2.622, max=2.622, sum=2.622 (1)", - "tab": "Efficiency", - "score": 2.622214562350853 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=361.464, mean=361.464, max=361.464, sum=361.464 (1)", - "tab": "General information", - "score": 361.46395563770795 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.758, - "details": { - "description": "min=0.758, mean=0.758, max=0.758, sum=0.758 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=4.143, mean=4.143, max=4.143, sum=4.143 (1)", - "tab": "Efficiency", - "score": 4.142627255439758 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=808.109, mean=808.109, max=808.109, sum=808.109 (1)", - "tab": "General information", - "score": 808.109 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21, - "details": { - "description": "min=0.21, mean=0.21, max=0.21, sum=0.21 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=5.649, mean=5.649, max=5.649, sum=5.649 (1)", - "tab": "Efficiency", - "score": 5.649449105501175 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.708, mean=109.708, max=109.708, sum=109.708 (1)", - "tab": "General information", - "score": 109.708 - }, - "Omni-MATH - # output tokens": { - "description": "min=1321.301, mean=1321.301, max=1321.301, sum=1321.301 (1)", - "tab": "General information", - "score": 1321.301 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json b/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json deleted file mode 100644 index 30524d64b..000000000 --- a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-8b-instruct-turbo/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": 
"crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 8B", - "id": "meta/llama-3.1-8b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 3.654367387500005 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406, - "details": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.406 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=2.642, mean=2.642, max=2.642, sum=2.642 (1)", - "tab": "Efficiency", - "score": 2.6422129917144774 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.366, mean=228.366, max=228.366, sum=228.366 (1)", - "tab": "General information", - "score": 228.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=518.387, mean=518.387, max=518.387, sum=518.387 (1)", - "tab": "General information", - "score": 518.387 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.247, - "details": { - "description": "min=0.247, mean=0.247, max=0.247, sum=0.247 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=3.28, mean=3.28, max=3.28, sum=3.28 (1)", - "tab": "Efficiency", - "score": 3.2803654104070277 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": 
"General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.886, mean=248.886, max=248.886, sum=248.886 (1)", - "tab": "General information", - "score": 248.88565022421525 - }, - "GPQA - # output tokens": { - "description": "min=744.583, mean=744.583, max=744.583, sum=744.583 (1)", - "tab": "General information", - "score": 744.5829596412556 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=1.982, mean=1.982, max=1.982, sum=1.982 (1)", - "tab": "Efficiency", - "score": 1.981573561423367 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=404.026, mean=404.026, max=404.026, sum=404.026 (1)", - "tab": "General information", - "score": 404.02587800369685 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.686, mean=0.686, max=0.686, sum=0.686 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=3.192, mean=3.192, max=3.192, sum=3.192 (1)", - "tab": "Efficiency", - "score": 3.1917312424182893 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output 
tokens": { - "description": "min=865.484, mean=865.484, max=865.484, sum=865.484 (1)", - "tab": "General information", - "score": 865.484 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.137, - "details": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.137 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.176, mean=7.176, max=7.176, sum=7.176 (1)", - "tab": "Efficiency", - "score": 7.1759537315368656 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.708, mean=109.708, max=109.708, sum=109.708 (1)", - "tab": "General information", - "score": 109.708 - }, - "Omni-MATH - # output tokens": { - "description": "min=2170.057, mean=2170.057, max=2170.057, sum=2170.057 (1)", - "tab": "General information", - "score": 2170.057 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json b/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json deleted file mode 100644 index d9ca75120..000000000 --- a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/meta_llama-4-maverick-17b-128e-instruct-fp8/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 4 Maverick 17Bx128E Instruct FP8", - "id": "meta/llama-4-maverick-17b-128e-instruct-fp8", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 8.498428393165543 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { 
- "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=0.81 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=6.74, mean=6.74, max=6.74, sum=6.74 (1)", - "tab": "Efficiency", - "score": 6.739848182201386 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=225.585, mean=225.585, max=225.585, sum=225.585 (1)", - "tab": "General information", - "score": 225.585 - }, - "MMLU-Pro - # output tokens": { - "description": "min=548.208, mean=548.208, max=548.208, sum=548.208 (1)", - "tab": "General information", - "score": 548.208 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.65, mean=0.65, max=0.65, sum=0.65 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=9.838, mean=9.838, max=9.838, sum=9.838 (1)", - "tab": "Efficiency", - "score": 9.838454476921013 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=236.807, mean=236.807, max=236.807, sum=236.807 (1)", - "tab": "General information", - "score": 236.8071748878924 - }, - "GPQA - # output tokens": { - "description": "min=822.336, mean=822.336, max=822.336, sum=822.336 (1)", - "tab": "General information", - "score": 822.3363228699552 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval 
Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.908, mean=0.908, max=0.908, sum=0.908 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.773, mean=3.773, max=3.773, sum=3.773 (1)", - "tab": "Efficiency", - "score": 3.773326979987943 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.577, mean=45.577, max=45.577, sum=45.577 (1)", - "tab": "General information", - "score": 45.57670979667283 - }, - "IFEval - # output tokens": { - "description": "min=311.251, mean=311.251, max=311.251, sum=311.251 (1)", - "tab": "General information", - "score": 311.2513863216266 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=0.8 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.37, mean=10.37, max=10.37, sum=10.37 (1)", - "tab": "Efficiency", - "score": 10.36993253993988 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=842.777, mean=842.777, max=842.777, sum=842.777 (1)", - "tab": "General information", - "score": 842.777 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422, - "details": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.422 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=11.771, mean=11.771, max=11.771, sum=11.771 (1)", - "tab": "Efficiency", - "score": 11.770579786777496 - }, - "Omni-MATH - # 
eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=105.286, mean=105.286, max=105.286, sum=105.286 (1)", - "tab": "General information", - "score": 105.286 - }, - "Omni-MATH - # output tokens": { - "description": "min=1055.205, mean=1055.205, max=1055.205, sum=1055.205 (1)", - "tab": "General information", - "score": 1055.205 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json b/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json deleted file mode 100644 index 640472423..000000000 --- a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/meta_llama-4-scout-17b-16e-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 4 Scout 17Bx16E Instruct", - "id": "meta/llama-4-scout-17b-16e-instruct", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 8.886502883481523 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=0.742 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=6.525, mean=6.525, max=6.525, sum=6.525 (1)", - "tab": "Efficiency", - "score": 6.524971485614777 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=225.585, mean=225.585, max=225.585, sum=225.585 (1)", - "tab": "General information", - "score": 225.585 - }, - "MMLU-Pro - # output tokens": { - "description": "min=550.212, mean=550.212, max=550.212, sum=550.212 (1)", - "tab": "General information", - "score": 550.212 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=11.027, mean=11.027, max=11.027, sum=11.027 (1)", - "tab": "Efficiency", - "score": 11.026973943004693 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=236.807, mean=236.807, max=236.807, sum=236.807 (1)", - "tab": "General information", - "score": 236.8071748878924 - }, - "GPQA - # output tokens": { - "description": "min=856.76, mean=856.76, max=856.76, sum=856.76 (1)", - "tab": "General information", - "score": 856.7600896860987 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.818, - "details": { - "description": "min=0.818, mean=0.818, max=0.818, sum=0.818 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=4.297, mean=4.297, max=4.297, sum=4.297 (1)", - "tab": "Efficiency", - "score": 4.296513711679004 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.577, mean=45.577, max=45.577, sum=45.577 (1)", - "tab": "General information", - "score": 45.57670979667283 - }, - "IFEval - # 
output tokens": { - "description": "min=399.399, mean=399.399, max=399.399, sum=399.399 (1)", - "tab": "General information", - "score": 399.3992606284658 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=0.779 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=9.942, mean=9.942, max=9.942, sum=9.942 (1)", - "tab": "Efficiency", - "score": 9.942440722942353 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=952.636, mean=952.636, max=952.636, sum=952.636 (1)", - "tab": "General information", - "score": 952.636 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.373, - "details": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.373 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=12.642, mean=12.642, max=12.642, sum=12.642 (1)", - "tab": "Efficiency", - "score": 12.641614554166793 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=105.286, mean=105.286, max=105.286, sum=105.286 (1)", - "tab": "General information", - "score": 105.286 - }, - "Omni-MATH - # output tokens": { - "description": "min=1088.449, mean=1088.449, max=1088.449, sum=1088.449 (1)", - "tab": "General information", - "score": 1088.449 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json 
b/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json deleted file mode 100644 index 0b19a4ab4..000000000 --- a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-7b-instruct-v0.3/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Instruct v0.3 7B", - "id": "mistralai/mistral-7b-instruct-v0.3", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.376, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 3.386352003847275 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.277, - "details": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.277 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=2.0, mean=2.0, max=2.0, sum=2.0 (1)", - "tab": "Efficiency", - "score": 1.999533802509308 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=260.915, mean=260.915, max=260.915, sum=260.915 (1)", - "tab": "General information", - "score": 260.915 - }, - "MMLU-Pro - # output tokens": { - "description": "min=272.103, mean=272.103, max=272.103, sum=272.103 (1)", - "tab": "General information", - "score": 272.103 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.303, - "details": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.303 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=2.285, mean=2.285, max=2.285, sum=2.285 (1)", - "tab": "Efficiency", - "score": 2.284658104849503 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=281.998, mean=281.998, max=281.998, sum=281.998 (1)", - "tab": "General information", - "score": 281.99775784753365 - }, - "GPQA - # output tokens": { - "description": "min=387.971, mean=387.971, max=387.971, sum=387.971 (1)", - "tab": "General information", - "score": 387.9708520179372 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "description": "min=0.567, mean=0.567, max=0.567, sum=0.567 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=2.535, mean=2.535, max=2.535, sum=2.535 (1)", - "tab": "Efficiency", - "score": 2.5349821145345013 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=51.309, mean=51.309, max=51.309, sum=51.309 (1)", - "tab": "General information", - "score": 51.3086876155268 - }, - "IFEval - # output tokens": { - "description": "min=449.725, mean=449.725, max=449.725, sum=449.725 (1)", - "tab": "General information", - "score": 449.72458410351203 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=0.66 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=5.901, mean=5.901, max=5.901, sum=5.901 (1)", - "tab": "Efficiency", - "score": 5.900532631635666 - 
}, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=702.754, mean=702.754, max=702.754, sum=702.754 (1)", - "tab": "General information", - "score": 702.754 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.072, - "details": { - "description": "min=0.072, mean=0.072, max=0.072, sum=0.072 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=4.212, mean=4.212, max=4.212, sum=4.212 (1)", - "tab": "Efficiency", - "score": 4.212053365707398 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=119.373, mean=119.373, max=119.373, sum=119.373 (1)", - "tab": "General information", - "score": 119.373 - }, - "Omni-MATH - # output tokens": { - "description": "min=678.438, mean=678.438, max=678.438, sum=678.438 (1)", - "tab": "General information", - "score": 678.438 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json b/data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json deleted file mode 100644 index dec52ca8a..000000000 --- a/data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-large-2411/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Large 2411", - "id": "mistralai/mistral-large-2411", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 14.462006275515396 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599, - "details": { - "description": "min=0.599, mean=0.599, max=0.599, sum=0.599 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=7.537, mean=7.537, max=7.537, sum=7.537 (1)", - "tab": "Efficiency", - "score": 7.537241208553314 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=260.915, mean=260.915, max=260.915, sum=260.915 (1)", - "tab": "General information", - "score": 260.915 - }, - "MMLU-Pro - # output tokens": { - "description": "min=316.273, mean=316.273, max=316.273, sum=316.273 (1)", - "tab": "General information", - "score": 316.273 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435, - "details": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.435 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=12.217, mean=12.217, max=12.217, sum=12.217 (1)", - "tab": "Efficiency", - "score": 12.217145950270341 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=281.998, mean=281.998, max=281.998, sum=281.998 (1)", - "tab": "General information", - "score": 281.99775784753365 - }, - "GPQA - # output tokens": { - 
"description": "min=507.357, mean=507.357, max=507.357, sum=507.357 (1)", - "tab": "General information", - "score": 507.3565022421525 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=0.876 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=10.742, mean=10.742, max=10.742, sum=10.742 (1)", - "tab": "Efficiency", - "score": 10.741783690761066 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=51.36, mean=51.36, max=51.36, sum=51.36 (1)", - "tab": "General information", - "score": 51.36044362292052 - }, - "IFEval - # output tokens": { - "description": "min=409.566, mean=409.566, max=409.566, sum=409.566 (1)", - "tab": "General information", - "score": 409.5656192236599 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "details": { - "description": "min=0.801, mean=0.801, max=0.801, sum=0.801 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=23.603, mean=23.603, max=23.603, sum=23.603 (1)", - "tab": "Efficiency", - "score": 23.602991637706758 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1029.086, mean=1029.086, max=1029.086, sum=1029.086 (1)", - "tab": "General information", - "score": 1029.086 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281, - "details": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=18.211, mean=18.211, max=18.211, sum=18.211 (1)", - "tab": "Efficiency", - "score": 18.210868890285493 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=120.373, mean=120.373, max=120.373, sum=120.373 (1)", - "tab": "General information", - "score": 120.373 - }, - "Omni-MATH - # output tokens": { - "description": "min=727.801, mean=727.801, max=727.801, sum=727.801 (1)", - "tab": "General information", - "score": 727.801 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json b/data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json deleted file mode 100644 index 7999b823d..000000000 --- a/data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-small-2503/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Small 3.1 2503", - "id": "mistralai/mistral-small-2503", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.558, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 11.791458985991488 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.61, - "details": { - "description": "min=0.61, mean=0.61, max=0.61, sum=0.61 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=3.589, mean=3.589, max=3.589, sum=3.589 (1)", - "tab": "Efficiency", - "score": 3.588683393239975 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=239.367, mean=239.367, max=239.367, sum=239.367 (1)", - "tab": "General information", - "score": 239.367 - }, - "MMLU-Pro - # output tokens": { - "description": "min=365.903, mean=365.903, max=365.903, sum=365.903 (1)", - "tab": "General information", - "score": 365.903 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392, - "details": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=5.05, mean=5.05, max=5.05, sum=5.05 (1)", - "tab": "Efficiency", - "score": 5.049520614435854 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=246.54, mean=246.54, max=246.54, sum=246.54 (1)", - "tab": "General information", - "score": 246.5403587443946 - }, - "GPQA - # output tokens": { - "description": "min=492.534, mean=492.534, max=492.534, sum=492.534 (1)", - "tab": "General information", - "score": 492.5336322869955 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=0.75 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.838, mean=3.838, max=3.838, sum=3.838 (1)", - "tab": "Efficiency", - "score": 3.837722122118345 - }, - 
"IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.237, mean=47.237, max=47.237, sum=47.237 (1)", - "tab": "General information", - "score": 47.2365988909427 - }, - "IFEval - # output tokens": { - "description": "min=379.896, mean=379.896, max=379.896, sum=379.896 (1)", - "tab": "General information", - "score": 379.89648798521256 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=0.788 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=12.831, mean=12.831, max=12.831, sum=12.831 (1)", - "tab": "Efficiency", - "score": 12.831070138692855 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=938.182, mean=938.182, max=938.182, sum=938.182 (1)", - "tab": "General information", - "score": 938.182 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.248, - "details": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.248 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=33.65, mean=33.65, max=33.65, sum=33.65 (1)", - "tab": "Efficiency", - "score": 33.650298661470416 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - 
"description": "min=110.473, mean=110.473, max=110.473, sum=110.473 (1)", - "tab": "General information", - "score": 110.473 - }, - "Omni-MATH - # output tokens": { - "description": "min=753.657, mean=753.657, max=753.657, sum=753.657 (1)", - "tab": "General information", - "score": 753.657 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json b/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json deleted file mode 100644 index 583f7956f..000000000 --- a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/mistralai_mixtral-8x22b-instruct-v0.1/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral Instruct 8x22B", - "id": "mistralai/mixtral-8x22b-instruct-v0.1", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 6.16132193567775 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.46 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=3.967, mean=3.967, max=3.967, sum=3.967 (1)", - "tab": "Efficiency", - "score": 3.967100965499878 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=260.915, mean=260.915, max=260.915, sum=260.915 (1)", - "tab": "General information", - "score": 260.915 - }, - "MMLU-Pro - # output tokens": { - "description": "min=298.159, mean=298.159, max=298.159, sum=298.159 (1)", - "tab": "General information", - "score": 
298.159 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.334, - "details": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.334 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=4.76, mean=4.76, max=4.76, sum=4.76 (1)", - "tab": "Efficiency", - "score": 4.760301354220095 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=281.998, mean=281.998, max=281.998, sum=281.998 (1)", - "tab": "General information", - "score": 281.99775784753365 - }, - "GPQA - # output tokens": { - "description": "min=403.895, mean=403.895, max=403.895, sum=403.895 (1)", - "tab": "General information", - "score": 403.89461883408075 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=0.724 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=4.568, mean=4.568, max=4.568, sum=4.568 (1)", - "tab": "Efficiency", - "score": 4.56831247837398 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=51.309, mean=51.309, max=51.309, sum=51.309 (1)", - "tab": "General information", - "score": 51.3086876155268 - }, - "IFEval - # output tokens": { - "description": "min=390.799, mean=390.799, max=390.799, sum=390.799 (1)", - "tab": "General information", - "score": 390.7985212569316 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=0.711 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.146, mean=10.146, max=10.146, sum=10.146 (1)", - "tab": "Efficiency", - "score": 10.145776480436325 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=668.768, mean=668.768, max=668.768, sum=668.768 (1)", - "tab": "General information", - "score": 668.768 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.163, - "details": { - "description": "min=0.163, mean=0.163, max=0.163, sum=0.163 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.365, mean=7.365, max=7.365, sum=7.365 (1)", - "tab": "Efficiency", - "score": 7.365118399858475 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=119.373, mean=119.373, max=119.373, sum=119.373 (1)", - "tab": "General information", - "score": 119.373 - }, - "Omni-MATH - # output tokens": { - "description": "min=783.89, mean=783.89, max=783.89, sum=783.89 (1)", - "tab": "General information", - "score": 783.89 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json b/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json deleted file mode 100644 index d2c9cfb4e..000000000 --- a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"helm_capabilities/mistralai_mixtral-8x7b-instruct-v0.1/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral Instruct 8x7B", - "id": "mistralai/mixtral-8x7b-instruct-v0.1", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 3.8521851769069984 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.335, - "details": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.335 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=2.842, mean=2.842, max=2.842, sum=2.842 (1)", - "tab": "Efficiency", - "score": 2.841812901973724 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=260.915, mean=260.915, max=260.915, sum=260.915 (1)", - "tab": "General information", - "score": 260.915 - }, - "MMLU-Pro - # output tokens": { - "description": "min=274.355, mean=274.355, max=274.355, sum=274.355 (1)", - "tab": "General information", - "score": 274.355 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.296, - "details": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.296 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=3.163, mean=3.163, max=3.163, sum=3.163 (1)", - "tab": "Efficiency", - 
"score": 3.1633052681593616 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=281.998, mean=281.998, max=281.998, sum=281.998 (1)", - "tab": "General information", - "score": 281.99775784753365 - }, - "GPQA - # output tokens": { - "description": "min=384.17, mean=384.17, max=384.17, sum=384.17 (1)", - "tab": "General information", - "score": 384.17040358744396 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.575, - "details": { - "description": "min=0.575, mean=0.575, max=0.575, sum=0.575 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.247, mean=3.247, max=3.247, sum=3.247 (1)", - "tab": "Efficiency", - "score": 3.2468207733027374 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=51.309, mean=51.309, max=51.309, sum=51.309 (1)", - "tab": "General information", - "score": 51.3086876155268 - }, - "IFEval - # output tokens": { - "description": "min=377.81, mean=377.81, max=377.81, sum=377.81 (1)", - "tab": "General information", - "score": 377.8096118299446 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.673, mean=0.673, max=0.673, sum=0.673 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=5.582, mean=5.582, max=5.582, sum=5.582 (1)", - "tab": "Efficiency", - "score": 5.581539319515228 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=669.436, mean=669.436, max=669.436, sum=669.436 (1)", - "tab": "General information", - "score": 669.436 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105, - "details": { - "description": "min=0.105, mean=0.105, max=0.105, sum=0.105 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=4.427, mean=4.427, max=4.427, sum=4.427 (1)", - "tab": "Efficiency", - "score": 4.427447621583939 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=119.373, mean=119.373, max=119.373, sum=119.373 (1)", - "tab": "General information", - "score": 119.373 - }, - "Omni-MATH - # output tokens": { - "description": "min=550.807, mean=550.807, max=550.807, sum=550.807 (1)", - "tab": "General information", - "score": 550.807 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json b/data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json deleted file mode 100644 index 1946db617..000000000 --- a/data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kimi K2 Instruct", - "id": "moonshotai/kimi-k2-instruct", - "developer": "moonshotai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - 
"description": null, - "tab": "Efficiency", - "score": 44.938299779825435 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.819, - "details": { - "description": "min=0.819, mean=0.819, max=0.819, sum=0.819 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=20.295, mean=20.295, max=20.295, sum=20.295 (1)", - "tab": "Efficiency", - "score": 20.295415951013567 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=249.352, mean=249.352, max=249.352, sum=249.352 (1)", - "tab": "General information", - "score": 249.352 - }, - "MMLU-Pro - # output tokens": { - "description": "min=703.4, mean=703.4, max=703.4, sum=703.4 (1)", - "tab": "General information", - "score": 703.4 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.652, mean=0.652, max=0.652, sum=0.652 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=50.104, mean=50.104, max=50.104, sum=50.104 (1)", - "tab": "Efficiency", - "score": 50.10382581986654 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=268.74, mean=268.74, max=268.74, sum=268.74 (1)", - "tab": "General information", - "score": 268.73991031390136 - }, - "GPQA - # output tokens": { - "description": "min=1250.646, mean=1250.646, max=1250.646, sum=1250.646 (1)", - "tab": "General information", - "score": 1250.645739910314 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=0.85 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=17.412, mean=17.412, max=17.412, sum=17.412 (1)", - "tab": "Efficiency", - "score": 17.412336311587122 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.863, mean=45.863, max=45.863, sum=45.863 (1)", - "tab": "General information", - "score": 45.86321626617375 - }, - "IFEval - # output tokens": { - "description": "min=454.283, mean=454.283, max=454.283, sum=454.283 (1)", - "tab": "General information", - "score": 454.2828096118299 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=46.942, mean=46.942, max=46.942, sum=46.942 (1)", - "tab": "Efficiency", - "score": 46.94232517242432 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1332.527, mean=1332.527, max=1332.527, sum=1332.527 (1)", - "tab": "General information", - "score": 1332.527 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.654, - "details": { - "description": "min=0.654, mean=0.654, max=0.654, sum=0.654 (1)", - 
"tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=89.938, mean=89.938, max=89.938, sum=89.938 (1)", - "tab": "Efficiency", - "score": 89.93759564423561 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=106.59, mean=106.59, max=106.59, sum=106.59 (1)", - "tab": "General information", - "score": 106.59 - }, - "Omni-MATH - # output tokens": { - "description": "min=3396.692, mean=3396.692, max=3396.692, sum=3396.692 (1)", - "tab": "General information", - "score": 3396.692 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json b/data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json deleted file mode 100644 index 3c36cb01b..000000000 --- a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-2025-04-14/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4.1 2025-04-14", - "id": "openai/gpt-4.1-2025-04-14", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 11.09172884853167 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=6.431, mean=6.431, max=6.431, sum=6.431 (1)", - "tab": "Efficiency", - "score": 6.431383004903793 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - 
"MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.569, mean=228.569, max=228.569, sum=228.569 (1)", - "tab": "General information", - "score": 228.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=513.15, mean=513.15, max=513.15, sum=513.15 (1)", - "tab": "General information", - "score": 513.15 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.659, mean=0.659, max=0.659, sum=0.659 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=9.906, mean=9.906, max=9.906, sum=9.906 (1)", - "tab": "Efficiency", - "score": 9.906458986714282 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.152, mean=248.152, max=248.152, sum=248.152 (1)", - "tab": "General information", - "score": 248.152466367713 - }, - "GPQA - # output tokens": { - "description": "min=824.722, mean=824.722, max=824.722, sum=824.722 (1)", - "tab": "General information", - "score": 824.7219730941704 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=0.838 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.68, mean=3.68, max=3.68, sum=3.68 (1)", - "tab": "Efficiency", - "score": 3.6797932344531836 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": 
"min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=277.305, mean=277.305, max=277.305, sum=277.305 (1)", - "tab": "General information", - "score": 277.3049907578558 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=11.723, mean=11.723, max=11.723, sum=11.723 (1)", - "tab": "Efficiency", - "score": 11.72278983767207 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1007.831, mean=1007.831, max=1007.831, sum=1007.831 (1)", - "tab": "General information", - "score": 1007.831 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.471, - "details": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.471 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=23.718, mean=23.718, max=23.718, sum=23.718 (1)", - "tab": "Efficiency", - "score": 23.718219178915025 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=1884.743, mean=1884.743, max=1884.743, sum=1884.743 (1)", - "tab": "General information", - "score": 1884.743 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json b/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json deleted file mode 100644 index dd4503511..000000000 --- a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-mini-2025-04-14/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4.1 mini 2025-04-14", - "id": "openai/gpt-4.1-mini-2025-04-14", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 7.701476623313954 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=4.927, mean=4.927, max=4.927, sum=4.927 (1)", - "tab": "Efficiency", - "score": 4.927327474832535 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.569, mean=228.569, max=228.569, sum=228.569 (1)", - "tab": "General information", - "score": 228.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=627.909, mean=627.909, max=627.909, sum=627.909 (1)", - "tab": "General information", - "score": 627.909 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=0.614 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=8.217, mean=8.217, max=8.217, sum=8.217 (1)", - "tab": "Efficiency", - "score": 8.216832675206822 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.152, mean=248.152, max=248.152, sum=248.152 (1)", - "tab": "General information", - "score": 248.152466367713 - }, - "GPQA - # output tokens": { - "description": "min=1056.354, mean=1056.354, max=1056.354, sum=1056.354 (1)", - "tab": "General information", - "score": 1056.354260089686 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.904, - "details": { - "description": "min=0.904, mean=0.904, max=0.904, sum=0.904 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=2.622, mean=2.622, max=2.622, sum=2.622 (1)", - "tab": "Efficiency", - "score": 2.6219342847848774 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=275.1, mean=275.1, max=275.1, sum=275.1 (1)", - "tab": "General information", - "score": 275.09981515711644 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=0.838 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=7.331, 
mean=7.331, max=7.331, sum=7.331 (1)", - "tab": "Efficiency", - "score": 7.3305598454475405 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1020.373, mean=1020.373, max=1020.373, sum=1020.373 (1)", - "tab": "General information", - "score": 1020.373 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.491, - "details": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.491 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=15.411, mean=15.411, max=15.411, sum=15.411 (1)", - "tab": "Efficiency", - "score": 15.41072883629799 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=2117.264, mean=2117.264, max=2117.264, sum=2117.264 (1)", - "tab": "General information", - "score": 2117.264 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json b/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json deleted file mode 100644 index e2550958a..000000000 --- a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-nano-2025-04-14/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4.1 nano 2025-04-14", - "id": "openai/gpt-4.1-nano-2025-04-14", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 4.5128146238794296 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=0.55 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=2.935, mean=2.935, max=2.935, sum=2.935 (1)", - "tab": "Efficiency", - "score": 2.9353291485309603 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.569, mean=228.569, max=228.569, sum=228.569 (1)", - "tab": "General information", - "score": 228.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=503.09, mean=503.09, max=503.09, sum=503.09 (1)", - "tab": "General information", - "score": 503.09 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=4.817, mean=4.817, max=4.817, sum=4.817 (1)", - "tab": "Efficiency", - "score": 4.816804544808084 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.152, mean=248.152, max=248.152, sum=248.152 (1)", - "tab": "General information", - "score": 248.152466367713 - }, - "GPQA - # output tokens": { - 
"description": "min=842.038, mean=842.038, max=842.038, sum=842.038 (1)", - "tab": "General information", - "score": 842.0381165919282 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=0.843 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=1.781, mean=1.781, max=1.781, sum=1.781 (1)", - "tab": "Efficiency", - "score": 1.7811373196776386 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=269.619, mean=269.619, max=269.619, sum=269.619 (1)", - "tab": "General information", - "score": 269.6192236598891 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=4.453, mean=4.453, max=4.453, sum=4.453 (1)", - "tab": "Efficiency", - "score": 4.453118676900863 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=909.661, mean=909.661, max=909.661, sum=909.661 (1)", - "tab": "General information", - "score": 909.661 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367, - "details": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.367 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=8.578, mean=8.578, max=8.578, sum=8.578 (1)", - "tab": "Efficiency", - "score": 8.577683429479599 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=1777.605, mean=1777.605, max=1777.605, sum=1777.605 (1)", - "tab": "General information", - "score": 1777.605 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json b/data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json deleted file mode 100644 index 3c3d40256..000000000 --- a/data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-4o-2024-11-20/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o 2024-11-20", - "id": "openai/gpt-4o-2024-11-20", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 13.268214070783824 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - 
"details": { - "description": "min=0.713, mean=0.713, max=0.713, sum=0.713 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=11.334, mean=11.334, max=11.334, sum=11.334 (1)", - "tab": "Efficiency", - "score": 11.333669463157653 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.569, mean=228.569, max=228.569, sum=228.569 (1)", - "tab": "General information", - "score": 228.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=503.126, mean=503.126, max=503.126, sum=503.126 (1)", - "tab": "General information", - "score": 503.126 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=0.52 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=13.65, mean=13.65, max=13.65, sum=13.65 (1)", - "tab": "Efficiency", - "score": 13.64998589877056 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.152, mean=248.152, max=248.152, sum=248.152 (1)", - "tab": "General information", - "score": 248.152466367713 - }, - "GPQA - # output tokens": { - "description": "min=597.291, mean=597.291, max=597.291, sum=597.291 (1)", - "tab": "General information", - "score": 597.2914798206278 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=8.686, mean=8.686, max=8.686, sum=8.686 (1)", - "tab": "Efficiency", - "score": 8.68623784685752 - }, - "IFEval - # eval": { - 
"description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=345.405, mean=345.405, max=345.405, sum=345.405 (1)", - "tab": "General information", - "score": 345.40480591497226 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=15.765, mean=15.765, max=15.765, sum=15.765 (1)", - "tab": "Efficiency", - "score": 15.764520774255166 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1044.923, mean=1044.923, max=1044.923, sum=1044.923 (1)", - "tab": "General information", - "score": 1044.923 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.293, - "details": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.293 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=16.907, mean=16.907, max=16.907, sum=16.907 (1)", - "tab": "Efficiency", - "score": 16.90665637087822 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - 
"description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=908.643, mean=908.643, max=908.643, sum=908.643 (1)", - "tab": "General information", - "score": 908.643 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json b/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json deleted file mode 100644 index 778449e6e..000000000 --- a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-4o-mini-2024-07-18/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o mini 2024-07-18", - "id": "openai/gpt-4o-mini-2024-07-18", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 10.41176955262334 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603, - "details": { - "description": "min=0.603, mean=0.603, max=0.603, sum=0.603 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=6.572, mean=6.572, max=6.572, sum=6.572 (1)", - "tab": "Efficiency", - "score": 6.57206253027916 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.569, mean=228.569, max=228.569, sum=228.569 (1)", - "tab": "General information", - "score": 228.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=334.86, mean=334.86, max=334.86, sum=334.86 (1)", - "tab": "General information", - "score": 334.86 - } - } - }, - "generation_config": { - 
"additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.368, - "details": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.368 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=8.814, mean=8.814, max=8.814, sum=8.814 (1)", - "tab": "Efficiency", - "score": 8.813848996910814 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.152, mean=248.152, max=248.152, sum=248.152 (1)", - "tab": "General information", - "score": 248.152466367713 - }, - "GPQA - # output tokens": { - "description": "min=489.226, mean=489.226, max=489.226, sum=489.226 (1)", - "tab": "General information", - "score": 489.22645739910314 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=0.782 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=5.963, mean=5.963, max=5.963, sum=5.963 (1)", - "tab": "Efficiency", - "score": 5.963314282916169 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=314.919, mean=314.919, max=314.919, sum=314.919 (1)", - "tab": "General information", - "score": 314.91866913123846 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - 
"metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=0.791 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=13.996, mean=13.996, max=13.996, sum=13.996 (1)", - "tab": "Efficiency", - "score": 13.996195561885834 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=809.307, mean=809.307, max=809.307, sum=809.307 (1)", - "tab": "General information", - "score": 809.307 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.28, - "details": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.28 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=16.713, mean=16.713, max=16.713, sum=16.713 (1)", - "tab": "Efficiency", - "score": 16.713426391124724 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=863.417, mean=863.417, max=863.417, sum=863.417 (1)", - "tab": "General information", - "score": 863.417 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json b/data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json deleted file mode 100644 index 95d9762ef..000000000 --- a/data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-2025-08-07/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-5 2025-08-07", - "id": "openai/gpt-5-2025-08-07", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 74.66990821942755 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=0.863 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=18.668, mean=18.668, max=18.668, sum=18.668 (1)", - "tab": "Efficiency", - "score": 18.668269051074983 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)", - "tab": "General information", - "score": 248.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=5.028, mean=5.028, max=5.028, sum=5.028 (1)", - "tab": "General information", - "score": 5.028 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=0.791 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=57.418, mean=57.418, max=57.418, sum=57.418 (1)", - "tab": "Efficiency", - "score": 57.41822674028542 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 
0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)", - "tab": "General information", - "score": 268.15246636771303 - }, - "GPQA - # output tokens": { - "description": "min=5.935, mean=5.935, max=5.935, sum=5.935 (1)", - "tab": "General information", - "score": 5.934977578475336 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "details": { - "description": "min=0.875, mean=0.875, max=0.875, sum=0.875 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=35.937, mean=35.937, max=35.937, sum=35.937 (1)", - "tab": "Efficiency", - "score": 35.937195608664354 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=527.641, mean=527.641, max=527.641, sum=527.641 (1)", - "tab": "General information", - "score": 527.6414048059149 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=0.857 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=88.595, mean=88.595, max=88.595, sum=88.595 (1)", - "tab": "Efficiency", - "score": 88.59490567517281 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": 
"min=1518.974, mean=1518.974, max=1518.974, sum=1518.974 (1)", - "tab": "General information", - "score": 1518.974 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.647, - "details": { - "description": "min=0.647, mean=0.647, max=0.647, sum=0.647 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=172.731, mean=172.731, max=172.731, sum=172.731 (1)", - "tab": "Efficiency", - "score": 172.73094402194022 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=228.774, mean=228.774, max=228.774, sum=228.774 (1)", - "tab": "General information", - "score": 228.774 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json b/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json deleted file mode 100644 index 5dc165206..000000000 --- a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-mini-2025-08-07/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-5 mini 2025-08-07", - "id": "openai/gpt-5-mini-2025-08-07", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.819, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 28.206869066978612 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=0.835 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=11.803, mean=11.803, max=11.803, sum=11.803 (1)", - "tab": "Efficiency", - "score": 11.802515007257462 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)", - "tab": "General information", - "score": 248.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=17.495, mean=17.495, max=17.495, sum=17.495 (1)", - "tab": "General information", - "score": 17.495 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=20.737, mean=20.737, max=20.737, sum=20.737 (1)", - "tab": "Efficiency", - "score": 20.737325443280653 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)", - "tab": "General information", - "score": 268.15246636771303 - }, - "GPQA - # output tokens": { - "description": "min=25.379, mean=25.379, max=25.379, sum=25.379 (1)", - "tab": "General information", - "score": 25.378923766816143 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.927, mean=0.927, max=0.927, sum=0.927 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=18.834, mean=18.834, max=18.834, sum=18.834 (1)", - "tab": "Efficiency", - "score": 18.83414089833963 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=441.137, mean=441.137, max=441.137, sum=441.137 (1)", - "tab": "General information", - "score": 441.13678373382623 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=0.855 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=33.854, mean=33.854, max=33.854, sum=33.854 (1)", - "tab": "Efficiency", - "score": 33.85394237089157 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1408.024, mean=1408.024, max=1408.024, sum=1408.024 (1)", - "tab": "General information", - "score": 1408.024 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "description": "min=0.722, mean=0.722, max=0.722, sum=0.722 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=55.806, mean=55.806, max=55.806, sum=55.806 (1)", - "tab": "Efficiency", - "score": 55.806421615123746 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", 
- "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=362.654, mean=362.654, max=362.654, sum=362.654 (1)", - "tab": "General information", - "score": 362.654 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json b/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json deleted file mode 100644 index 096518c62..000000000 --- a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-nano-2025-08-07/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-5 nano 2025-08-07", - "id": "openai/gpt-5-nano-2025-08-07", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 48.213836350621065 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=0.778 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=17.337, mean=17.337, max=17.337, sum=17.337 (1)", - "tab": "Efficiency", - "score": 17.336622306585312 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - 
"MMLU-Pro - # prompt tokens": { - "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)", - "tab": "General information", - "score": 248.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=5.385, mean=5.385, max=5.385, sum=5.385 (1)", - "tab": "General information", - "score": 5.385 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "description": "min=0.679, mean=0.679, max=0.679, sum=0.679 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=30.246, mean=30.246, max=30.246, sum=30.246 (1)", - "tab": "Efficiency", - "score": 30.2457077674267 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)", - "tab": "General information", - "score": 268.15246636771303 - }, - "GPQA - # output tokens": { - "description": "min=5.668, mean=5.668, max=5.668, sum=5.668 (1)", - "tab": "General information", - "score": 5.668161434977579 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=0.932 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=26.735, mean=26.735, max=26.735, sum=26.735 (1)", - "tab": "Efficiency", - "score": 26.734930773980075 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=426.656, mean=426.656, max=426.656, sum=426.656 (1)", - "tab": "General information", - 
"score": 426.6561922365989 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=0.806 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=47.56, mean=47.56, max=47.56, sum=47.56 (1)", - "tab": "Efficiency", - "score": 47.560468022584914 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1391.144, mean=1391.144, max=1391.144, sum=1391.144 (1)", - "tab": "General information", - "score": 1391.144 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547, - "details": { - "description": "min=0.547, mean=0.547, max=0.547, sum=0.547 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=119.191, mean=119.191, max=119.191, sum=119.191 (1)", - "tab": "Efficiency", - "score": 119.19145288252831 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=222.15, mean=222.15, max=222.15, sum=222.15 (1)", - "tab": "General information", - "score": 222.15 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json b/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json deleted file mode 100644 index 738007852..000000000 --- 
a/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-5.1-2025-11-13/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-5.1 2025-11-13", - "id": "openai/gpt-5.1-2025-11-13", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 10.620566227529599 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=0.579 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=1.147, mean=1.147, max=1.147, sum=1.147 (1)", - "tab": "Efficiency", - "score": 1.1470122172832489 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)", - "tab": "General information", - "score": 248.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=5.002, mean=5.002, max=5.002, sum=5.002 (1)", - "tab": "General information", - "score": 5.002 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.442, - "details": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.442 (1)", - "tab": "Accuracy", 
- "GPQA - Observed inference time (s)": { - "description": "min=1.002, mean=1.002, max=1.002, sum=1.002 (1)", - "tab": "Efficiency", - "score": 1.002433323539426 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)", - "tab": "General information", - "score": 268.15246636771303 - }, - "GPQA - # output tokens": { - "description": "min=5.422, mean=5.422, max=5.422, sum=5.422 (1)", - "tab": "General information", - "score": 5.42152466367713 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.935, - "details": { - "description": "min=0.935, mean=0.935, max=0.935, sum=0.935 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=13.159, mean=13.159, max=13.159, sum=13.159 (1)", - "tab": "Efficiency", - "score": 13.15882584436103 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=647.063, mean=647.063, max=647.063, sum=647.063 (1)", - "tab": "General information", - "score": 647.0628465804067 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=0.863 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=28.081, mean=28.081, max=28.081, sum=28.081 (1)", - "tab": "Efficiency", - "score": 28.08133857488632 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": 
"min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=2059.716, mean=2059.716, max=2059.716, sum=2059.716 (1)", - "tab": "General information", - "score": 2059.716 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.464, - "details": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.464 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=9.713, mean=9.713, max=9.713, sum=9.713 (1)", - "tab": "Efficiency", - "score": 9.713221177577973 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=1256.266, mean=1256.266, max=1256.266, sum=1256.266 (1)", - "tab": "General information", - "score": 1256.266 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json b/data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json deleted file mode 100644 index 8642e9954..000000000 --- a/data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-oss-120b", - "id": "openai/gpt-oss-120b", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.77, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 19.583454439679375 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795, - "details": { - "description": "min=0.795, mean=0.795, max=0.795, sum=0.795 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=6.268, mean=6.268, max=6.268, sum=6.268 (1)", - "tab": "Efficiency", - "score": 6.268435170412063 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)", - "tab": "General information", - "score": 248.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=474.202, mean=474.202, max=474.202, sum=474.202 (1)", - "tab": "General information", - "score": 474.202 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=18.819, mean=18.819, max=18.819, sum=18.819 (1)", - "tab": "Efficiency", - "score": 18.8192116278704 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)", - "tab": "General information", - "score": 268.15246636771303 - }, - "GPQA - # output tokens": { - "description": "min=1218.108, mean=1218.108, max=1218.108, sum=1218.108 (1)", - "tab": "General information", - "score": 1218.1076233183855 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=0.836 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=6.303, mean=6.303, max=6.303, sum=6.303 (1)", - "tab": "Efficiency", - "score": 6.302578532982225 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=945.784, mean=945.784, max=945.784, sum=945.784 (1)", - "tab": "General information", - "score": 945.7837338262477 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=0.845 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=24.979, mean=24.979, max=24.979, sum=24.979 (1)", - "tab": "Efficiency", - "score": 24.978535928487776 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=2925.361, mean=2925.361, max=2925.361, sum=2925.361 (1)", - "tab": "General information", - "score": 2925.361 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.688, - "details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=0.688 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=41.549, mean=41.549, max=41.549, sum=41.549 (1)", - "tab": "Efficiency", - "score": 41.54851093864441 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=4103.671, mean=4103.671, max=4103.671, sum=4103.671 (1)", - "tab": "General information", - "score": 4103.671 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json b/data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json deleted file mode 100644 index 5112d535f..000000000 --- a/data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-oss-20b", - "id": "openai/gpt-oss-20b", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 31.785255717522546 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=0.74 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=4.593, mean=4.593, max=4.593, sum=4.593 (1)", - "tab": "Efficiency", - "score": 4.593113619089126 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": 
"General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)", - "tab": "General information", - "score": 248.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=820.909, mean=820.909, max=820.909, sum=820.909 (1)", - "tab": "General information", - "score": 820.909 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.594, - "details": { - "description": "min=0.594, mean=0.594, max=0.594, sum=0.594 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=27.565, mean=27.565, max=27.565, sum=27.565 (1)", - "tab": "Efficiency", - "score": 27.56541810923093 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)", - "tab": "General information", - "score": 268.15246636771303 - }, - "GPQA - # output tokens": { - "description": "min=2872.139, mean=2872.139, max=2872.139, sum=2872.139 (1)", - "tab": "General information", - "score": 2872.1390134529147 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.732, - "details": { - "description": "min=0.732, mean=0.732, max=0.732, sum=0.732 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=26.607, mean=26.607, max=26.607, sum=26.607 (1)", - "tab": "Efficiency", - "score": 26.607220574359577 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - 
"score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=3202.279, mean=3202.279, max=3202.279, sum=3202.279 (1)", - "tab": "General information", - "score": 3202.279112754159 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.737, - "details": { - "description": "min=0.737, mean=0.737, max=0.737, sum=0.737 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=42.985, mean=42.985, max=42.985, sum=42.985 (1)", - "tab": "Efficiency", - "score": 42.985184440851214 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=4398.71, mean=4398.71, max=4398.71, sum=4398.71 (1)", - "tab": "General information", - "score": 4398.71 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "details": { - "description": "min=0.565, mean=0.565, max=0.565, sum=0.565 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=57.175, mean=57.175, max=57.175, sum=57.175 (1)", - "tab": "Efficiency", - "score": 57.17534184408188 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=6604.944, mean=6604.944, max=6604.944, sum=6604.944 (1)", - "tab": "General information", - "score": 6604.944 - } - } - }, - "generation_config": { - "additional_details": {} - } - 
} - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json b/data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json deleted file mode 100644 index 677721448..000000000 --- a/data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_o3-2025-04-16/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "o3 2025-04-16", - "id": "openai/o3-2025-04-16", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 51.078448384234015 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=17.306, mean=17.306, max=17.306, sum=17.306 (1)", - "tab": "Efficiency", - "score": 17.306045585632326 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=249.506, mean=249.506, max=249.506, sum=249.506 (1)", - "tab": "General information", - "score": 249.506 - }, - "MMLU-Pro - # output tokens": { - "description": "min=5.038, mean=5.038, max=5.038, sum=5.038 (1)", - "tab": "General information", - "score": 5.038 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": 
"COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753, - "details": { - "description": "min=0.753, mean=0.753, max=0.753, sum=0.753 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=48.024, mean=48.024, max=48.024, sum=48.024 (1)", - "tab": "Efficiency", - "score": 48.0242628821343 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=269.975, mean=269.975, max=269.975, sum=269.975 (1)", - "tab": "General information", - "score": 269.97533632286996 - }, - "GPQA - # output tokens": { - "description": "min=6.457, mean=6.457, max=6.457, sum=6.457 (1)", - "tab": "General information", - "score": 6.457399103139013 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=0.869 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=17.399, mean=17.399, max=17.399, sum=17.399 (1)", - "tab": "Efficiency", - "score": 17.398983872972444 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.054, mean=46.054, max=46.054, sum=46.054 (1)", - "tab": "General information", - "score": 46.05360443622921 - }, - "IFEval - # output tokens": { - "description": "min=447.353, mean=447.353, max=447.353, sum=447.353 (1)", - "tab": "General information", - "score": 447.35304990757857 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=0.861 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=47.15, mean=47.15, 
max=47.15, sum=47.15 (1)", - "tab": "Efficiency", - "score": 47.150321824789046 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1248.452, mean=1248.452, max=1248.452, sum=1248.452 (1)", - "tab": "General information", - "score": 1248.452 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714, - "details": { - "description": "min=0.714, mean=0.714, max=0.714, sum=0.714 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=125.513, mean=125.513, max=125.513, sum=125.513 (1)", - "tab": "Efficiency", - "score": 125.51262775564194 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.767, mean=109.767, max=109.767, sum=109.767 (1)", - "tab": "General information", - "score": 109.767 - }, - "Omni-MATH - # output tokens": { - "description": "min=506.811, mean=506.811, max=506.811, sum=506.811 (1)", - "tab": "General information", - "score": 506.811 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json b/data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json deleted file mode 100644 index fd4ae16c5..000000000 --- a/data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_o4-mini-2025-04-16/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "o4-mini 2025-04-16", - "id": "openai/o4-mini-2025-04-16", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 21.93756369551652 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=0.82 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=8.896, mean=8.896, max=8.896, sum=8.896 (1)", - "tab": "Efficiency", - "score": 8.895831291675568 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=249.506, mean=249.506, max=249.506, sum=249.506 (1)", - "tab": "General information", - "score": 249.506 - }, - "MMLU-Pro - # output tokens": { - "description": "min=10.834, mean=10.834, max=10.834, sum=10.834 (1)", - "tab": "General information", - "score": 10.834 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=0.735 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=22.412, mean=22.412, max=22.412, sum=22.412 (1)", - "tab": "Efficiency", - "score": 22.412139415206397 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=269.975, mean=269.975, max=269.975, sum=269.975 (1)", - "tab": "General information", - "score": 269.97533632286996 - }, - "GPQA - # output tokens": { - 
"description": "min=8.413, mean=8.413, max=8.413, sum=8.413 (1)", - "tab": "General information", - "score": 8.41255605381166 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.929, - "details": { - "description": "min=0.929, mean=0.929, max=0.929, sum=0.929 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=12.26, mean=12.26, max=12.26, sum=12.26 (1)", - "tab": "Efficiency", - "score": 12.260425486097494 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.054, mean=46.054, max=46.054, sum=46.054 (1)", - "tab": "General information", - "score": 46.05360443622921 - }, - "IFEval - # output tokens": { - "description": "min=360.231, mean=360.231, max=360.231, sum=360.231 (1)", - "tab": "General information", - "score": 360.2310536044362 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=25.397, mean=25.397, max=25.397, sum=25.397 (1)", - "tab": "Efficiency", - "score": 25.396886379241945 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=969.786, mean=969.786, max=969.786, sum=969.786 (1)", - "tab": "General information", - "score": 969.786 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=0.72 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=40.723, mean=40.723, max=40.723, sum=40.723 (1)", - "tab": "Efficiency", - "score": 40.72253590536118 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.767, mean=109.767, max=109.767, sum=109.767 (1)", - "tab": "General information", - "score": 109.767 - }, - "Omni-MATH - # output tokens": { - "description": "min=388.401, mean=388.401, max=388.401, sum=388.401 (1)", - "tab": "General information", - "score": 388.401 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json b/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json deleted file mode 100644 index 50778c699..000000000 --- a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/qwen_qwen2.5-72b-instruct-turbo/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5 Instruct Turbo 72B", - "id": "qwen/qwen2.5-72b-instruct-turbo", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 16.666975749955085 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.631, - "details": { - "description": "min=0.631, mean=0.631, max=0.631, sum=0.631 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=11.79, mean=11.79, max=11.79, sum=11.79 (1)", - "tab": "Efficiency", - "score": 11.790208662986755 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=238.715, mean=238.715, max=238.715, sum=238.715 (1)", - "tab": "General information", - "score": 238.715 - }, - "MMLU-Pro - # output tokens": { - "description": "min=489.611, mean=489.611, max=489.611, sum=489.611 (1)", - "tab": "General information", - "score": 489.611 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426, - "details": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.426 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=28.719, mean=28.719, max=28.719, sum=28.719 (1)", - "tab": "Efficiency", - "score": 28.71905704036422 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=253.37, mean=253.37, max=253.37, sum=253.37 (1)", - "tab": "General information", - "score": 253.36995515695068 - }, - "GPQA - # output tokens": { - "description": "min=704.881, mean=704.881, max=704.881, sum=704.881 (1)", - "tab": "General information", - "score": 704.8811659192825 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=0.806 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=20.844, mean=20.844, max=20.844, sum=20.844 (1)", - "tab": "Efficiency", - "score": 
20.844201727407036 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.492, mean=46.492, max=46.492, sum=46.492 (1)", - "tab": "General information", - "score": 46.491682070240294 - }, - "IFEval - # output tokens": { - "description": "min=361.089, mean=361.089, max=361.089, sum=361.089 (1)", - "tab": "General information", - "score": 361.0887245841035 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.802, mean=0.802, max=0.802, sum=0.802 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=12.219, mean=12.219, max=12.219, sum=12.219 (1)", - "tab": "Efficiency", - "score": 12.219232248067856 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1042.017, mean=1042.017, max=1042.017, sum=1042.017 (1)", - "tab": "General information", - "score": 1042.017 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.33 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=9.762, mean=9.762, max=9.762, sum=9.762 (1)", - "tab": "Efficiency", - "score": 9.762179070949555 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - 
"Omni-MATH - # prompt tokens": { - "description": "min=111.6, mean=111.6, max=111.6, sum=111.6 (1)", - "tab": "General information", - "score": 111.6 - }, - "Omni-MATH - # output tokens": { - "description": "min=886.55, mean=886.55, max=886.55, sum=886.55 (1)", - "tab": "General information", - "score": 886.55 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json b/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json deleted file mode 100644 index c974f1019..000000000 --- a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/qwen_qwen2.5-7b-instruct-turbo/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5 Instruct Turbo 7B", - "id": "qwen/qwen2.5-7b-instruct-turbo", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 4.913331052029195 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539, - "details": { - "description": "min=0.539, mean=0.539, max=0.539, sum=0.539 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=6.223, mean=6.223, max=6.223, sum=6.223 (1)", - "tab": "Efficiency", - "score": 6.223100474119186 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=238.715, mean=238.715, max=238.715, sum=238.715 (1)", - "tab": "General information", - "score": 238.715 - }, - "MMLU-Pro - # output tokens": { - "description": "min=439.207, mean=439.207, max=439.207, sum=439.207 (1)", - "tab": "General information", - "score": 439.207 - } - } - 
}, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.341, - "details": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.341 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=3.475, mean=3.475, max=3.475, sum=3.475 (1)", - "tab": "Efficiency", - "score": 3.4745728910771185 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=253.37, mean=253.37, max=253.37, sum=253.37 (1)", - "tab": "General information", - "score": 253.36995515695068 - }, - "GPQA - # output tokens": { - "description": "min=554.274, mean=554.274, max=554.274, sum=554.274 (1)", - "tab": "General information", - "score": 554.2735426008969 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=0.741 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=2.068, mean=2.068, max=2.068, sum=2.068 (1)", - "tab": "Efficiency", - "score": 2.0679604544436865 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.492, mean=46.492, max=46.492, sum=46.492 (1)", - "tab": "General information", - "score": 46.491682070240294 - }, - "IFEval - # output tokens": { - "description": "min=317.828, mean=317.828, max=317.828, sum=317.828 (1)", - "tab": "General information", - "score": 317.82809611829947 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=0.731 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=7.74, mean=7.74, max=7.74, sum=7.74 (1)", - "tab": "Efficiency", - "score": 7.7404146847724915 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=953.359, mean=953.359, max=953.359, sum=953.359 (1)", - "tab": "General information", - "score": 953.359 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.294, - "details": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.294 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=5.061, mean=5.061, max=5.061, sum=5.061 (1)", - "tab": "Efficiency", - "score": 5.06060675573349 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.6, mean=111.6, max=111.6, sum=111.6 (1)", - "tab": "General information", - "score": 111.6 - }, - "Omni-MATH - # output tokens": { - "description": "min=809.198, mean=809.198, max=809.198, sum=809.198 (1)", - "tab": "General information", - "score": 809.198 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json deleted file mode 100644 index 9ded60c84..000000000 --- a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-fp8-tput/1770835969.095764", - "retrieved_timestamp": 
"1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen3 235B A22B FP8 Throughput", - "id": "qwen/qwen3-235b-a22b-fp8-tput", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 175.88874367192255 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=126.73, mean=126.73, max=126.73, sum=126.73 (1)", - "tab": "Efficiency", - "score": 126.73047786664962 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=259.715, mean=259.715, max=259.715, sum=259.715 (1)", - "tab": "General information", - "score": 259.715 - }, - "MMLU-Pro - # output tokens": { - "description": "min=3518.576, mean=3518.576, max=3518.576, sum=3518.576 (1)", - "tab": "General information", - "score": 3518.576 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623, - "details": { - "description": "min=0.623, mean=0.623, max=0.623, sum=0.623 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=237.413, mean=237.413, max=237.413, sum=237.413 (1)", - "tab": "Efficiency", - "score": 237.41318658488748 - }, - "GPQA - # eval": { - "description": "min=446, 
mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=274.37, mean=274.37, max=274.37, sum=274.37 (1)", - "tab": "General information", - "score": 274.36995515695065 - }, - "GPQA - # output tokens": { - "description": "min=7431.507, mean=7431.507, max=7431.507, sum=7431.507 (1)", - "tab": "General information", - "score": 7431.506726457399 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=0.816 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=36.742, mean=36.742, max=36.742, sum=36.742 (1)", - "tab": "Efficiency", - "score": 36.742134021963516 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.492, mean=46.492, max=46.492, sum=46.492 (1)", - "tab": "General information", - "score": 46.491682070240294 - }, - "IFEval - # output tokens": { - "description": "min=1101.856, mean=1101.856, max=1101.856, sum=1101.856 (1)", - "tab": "General information", - "score": 1101.8558225508318 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=125.734, mean=125.734, max=125.734, sum=125.734 (1)", - "tab": "Efficiency", - "score": 125.73418169164657 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", 
- "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=3594.207, mean=3594.207, max=3594.207, sum=3594.207 (1)", - "tab": "General information", - "score": 3594.207 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "description": "min=0.548, mean=0.548, max=0.548, sum=0.548 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=352.824, mean=352.824, max=352.824, sum=352.824 (1)", - "tab": "Efficiency", - "score": 352.82373819446565 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.6, mean=111.6, max=111.6, sum=111.6 (1)", - "tab": "General information", - "score": 111.6 - }, - "Omni-MATH - # output tokens": { - "description": "min=10072.403, mean=10072.403, max=10072.403, sum=10072.403 (1)", - "tab": "General information", - "score": 10072.403 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json deleted file mode 100644 index 0210712c3..000000000 --- a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-instruct-2507-fp8/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen3 235B A22B Instruct 2507 FP8", - "id": "qwen/qwen3-235b-a22b-instruct-2507-fp8", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "details": { - "tab": "Accuracy", - "Mean score - 
Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 91.57420329307861 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.844, - "details": { - "description": "min=0.844, mean=0.844, max=0.844, sum=0.844 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=52.244, mean=52.244, max=52.244, sum=52.244 (1)", - "tab": "Efficiency", - "score": 52.24400525426864 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=259.715, mean=259.715, max=259.715, sum=259.715 (1)", - "tab": "General information", - "score": 259.715 - }, - "MMLU-Pro - # output tokens": { - "description": "min=1423.589, mean=1423.589, max=1423.589, sum=1423.589 (1)", - "tab": "General information", - "score": 1423.589 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=103.303, mean=103.303, max=103.303, sum=103.303 (1)", - "tab": "Efficiency", - "score": 103.30346254970995 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=274.37, mean=274.37, max=274.37, sum=274.37 (1)", - "tab": "General information", - "score": 274.36995515695065 - }, - "GPQA - # output tokens": { - "description": "min=3922.17, mean=3922.17, max=3922.17, sum=3922.17 (1)", - "tab": "General information", - "score": 3922.170403587444 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ 
- "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=0.835 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=12.729, mean=12.729, max=12.729, sum=12.729 (1)", - "tab": "Efficiency", - "score": 12.728508173648178 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.492, mean=46.492, max=46.492, sum=46.492 (1)", - "tab": "General information", - "score": 46.491682070240294 - }, - "IFEval - # output tokens": { - "description": "min=427.54, mean=427.54, max=427.54, sum=427.54 (1)", - "tab": "General information", - "score": 427.53974121996305 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.866, - "details": { - "description": "min=0.866, mean=0.866, max=0.866, sum=0.866 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=61.017, mean=61.017, max=61.017, sum=61.017 (1)", - "tab": "Efficiency", - "score": 61.01670853805542 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1976.28, mean=1976.28, max=1976.28, sum=1976.28 (1)", - "tab": "General information", - "score": 1976.28 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=0.718 (1)", - 
"tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=228.578, mean=228.578, max=228.578, sum=228.578 (1)", - "tab": "Efficiency", - "score": 228.57833194971084 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.6, mean=111.6, max=111.6, sum=111.6 (1)", - "tab": "General information", - "score": 111.6 - }, - "Omni-MATH - # output tokens": { - "description": "min=5629.583, mean=5629.583, max=5629.583, sum=5629.583 (1)", - "tab": "General information", - "score": 5629.583 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json b/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json deleted file mode 100644 index 6ee69548e..000000000 --- a/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/qwen_qwen3-next-80b-a3b-thinking/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen3-Next 80B A3B Thinking", - "id": "qwen/qwen3-next-80b-a3b-thinking", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 27.61164260375731 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=0.786 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=20.097, mean=20.097, max=20.097, sum=20.097 (1)", - "tab": "Efficiency", - "score": 20.09722422862053 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General 
information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=259.715, mean=259.715, max=259.715, sum=259.715 (1)", - "tab": "General information", - "score": 259.715 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "details": { - "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=40.06, mean=40.06, max=40.06, sum=40.06 (1)", - "tab": "Efficiency", - "score": 40.06039341950096 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=274.37, mean=274.37, max=274.37, sum=274.37 (1)", - "tab": "General information", - "score": 274.36995515695065 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=0.81 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=13.893, mean=13.893, max=13.893, sum=13.893 (1)", - "tab": "Efficiency", - "score": 13.89268838323639 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.492, mean=46.492, 
max=46.492, sum=46.492 (1)", - "tab": "General information", - "score": 46.491682070240294 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=0.807 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=23.095, mean=23.095, max=23.095, sum=23.095 (1)", - "tab": "Efficiency", - "score": 23.095464605808257 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.467, - "details": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=40.912, mean=40.912, max=40.912, sum=40.912 (1)", - "tab": "Efficiency", - "score": 40.91244238162041 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.6, mean=111.6, max=111.6, sum=111.6 (1)", - "tab": "General information", - "score": 111.6 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json 
b/data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json deleted file mode 100644 index b86fc5b45..000000000 --- a/data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/writer_palmyra-fin/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra Fin", - "id": "writer/palmyra-fin", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.577, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 13.54320003211858 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591, - "details": { - "description": "min=0.591, mean=0.591, max=0.591, sum=0.591 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=10.488, mean=10.488, max=10.488, sum=10.488 (1)", - "tab": "Efficiency", - "score": 10.488489307641983 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.366, mean=228.366, max=228.366, sum=228.366 (1)", - "tab": "General information", - "score": 228.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=493.383, mean=493.383, max=493.383, sum=493.383 (1)", - "tab": "General information", - "score": 493.383 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.422, - "details": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.422 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=14.428, mean=14.428, max=14.428, sum=14.428 (1)", - "tab": "Efficiency", - "score": 14.42766729758994 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.886, mean=248.886, max=248.886, sum=248.886 (1)", - "tab": "General information", - "score": 248.88565022421525 - }, - "GPQA - # output tokens": { - "description": "min=671.045, mean=671.045, max=671.045, sum=671.045 (1)", - "tab": "General information", - "score": 671.0448430493274 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=0.793 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=6.85, mean=6.85, max=6.85, sum=6.85 (1)", - "tab": "Efficiency", - "score": 6.849953265815918 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=332.181, mean=332.181, max=332.181, sum=332.181 (1)", - "tab": "General information", - "score": 332.181146025878 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=18.947, mean=18.947, max=18.947, sum=18.947 (1)", - "tab": "Efficiency", - "score": 18.947298042297362 - }, - "WildBench - # eval": { - "description": 
"min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=935.556, mean=935.556, max=935.556, sum=935.556 (1)", - "tab": "General information", - "score": 935.556 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295, - "details": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.295 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=17.003, mean=17.003, max=17.003, sum=17.003 (1)", - "tab": "Efficiency", - "score": 17.002592247247694 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.708, mean=109.708, max=109.708, sum=109.708 (1)", - "tab": "General information", - "score": 109.708 - }, - "Omni-MATH - # output tokens": { - "description": "min=820.641, mean=820.641, max=820.641, sum=820.641 (1)", - "tab": "General information", - "score": 820.641 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json b/data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json deleted file mode 100644 index ac68f722a..000000000 --- a/data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/writer_palmyra-med/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra Med", - "id": "writer/palmyra-med", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores 
from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.476, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 4.374187379517853 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411, - "details": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.411 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.327 (1)", - "tab": "Efficiency", - "score": 0.32738947081565856 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=249.366, mean=249.366, max=249.366, sum=249.366 (1)", - "tab": "General information", - "score": 249.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.368, - "details": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.368 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.356 (1)", - "tab": "Efficiency", - "score": 0.3557077256018805 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=269.886, mean=269.886, max=269.886, sum=269.886 (1)", - "tab": "General information", - "score": 269.8856502242152 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": 
"false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.767, - "details": { - "description": "min=0.767, mean=0.767, max=0.767, sum=0.767 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=4.651, mean=4.651, max=4.651, sum=4.651 (1)", - "tab": "Efficiency", - "score": 4.650597941633073 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.676, - "details": { - "description": "min=0.676, mean=0.676, max=0.676, sum=0.676 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.081, mean=10.081, max=10.081, sum=10.081 (1)", - "tab": "Efficiency", - "score": 10.080555647850037 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.156, - "details": { - 
"description": "min=0.156, mean=0.156, max=0.156, sum=0.156 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=6.457, mean=6.457, max=6.457, sum=6.457 (1)", - "tab": "Efficiency", - "score": 6.456686111688614 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.708, mean=109.708, max=109.708, sum=109.708 (1)", - "tab": "General information", - "score": 109.708 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json b/data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json deleted file mode 100644 index 9398b6319..000000000 --- a/data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/writer_palmyra-x-004/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra-X-004", - "id": "writer/palmyra-x-004", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 19.38686150670534 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.657, - "details": { - "description": "min=0.657, mean=0.657, max=0.657, sum=0.657 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=14.079, mean=14.079, max=14.079, sum=14.079 (1)", - "tab": "Efficiency", - "score": 14.079012663602828 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - 
}, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.366, mean=228.366, max=228.366, sum=228.366 (1)", - "tab": "General information", - "score": 228.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=510.633, mean=510.633, max=510.633, sum=510.633 (1)", - "tab": "General information", - "score": 510.633 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395, - "details": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.395 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=20.444, mean=20.444, max=20.444, sum=20.444 (1)", - "tab": "Efficiency", - "score": 20.444375363700594 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.886, mean=248.886, max=248.886, sum=248.886 (1)", - "tab": "General information", - "score": 248.88565022421525 - }, - "GPQA - # output tokens": { - "description": "min=716.437, mean=716.437, max=716.437, sum=716.437 (1)", - "tab": "General information", - "score": 716.4372197309417 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.872, mean=0.872, max=0.872, sum=0.872 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=10.268, mean=10.268, max=10.268, sum=10.268 (1)", - "tab": "Efficiency", - "score": 10.267585801990107 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { 
- "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=357.087, mean=357.087, max=357.087, sum=357.087 (1)", - "tab": "General information", - "score": 357.08687615526804 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.802, mean=0.802, max=0.802, sum=0.802 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=28.186, mean=28.186, max=28.186, sum=28.186 (1)", - "tab": "Efficiency", - "score": 28.185582681894303 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1068.195, mean=1068.195, max=1068.195, sum=1068.195 (1)", - "tab": "General information", - "score": 1068.195 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.32, - "details": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.32 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=23.958, mean=23.958, max=23.958, sum=23.958 (1)", - "tab": "Efficiency", - "score": 23.95775102233887 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.708, mean=109.708, max=109.708, sum=109.708 (1)", - "tab": "General information", - "score": 109.708 - }, - "Omni-MATH - # output tokens": { - "description": "min=897.531, mean=897.531, max=897.531, sum=897.531 (1)", - "tab": "General information", - "score": 897.531 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json b/data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json deleted file mode 100644 index 6d3707107..000000000 --- a/data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/writer_palmyra-x5/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra X5", - "id": "writer/palmyra-x5", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 7.539339301355213 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=0.804 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=5.907, mean=5.907, max=5.907, sum=5.907 (1)", - "tab": "Efficiency", - "score": 5.906555171251297 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=249.366, mean=249.366, max=249.366, sum=249.366 (1)", - "tab": "General information", - "score": 249.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.661, mean=0.661, max=0.661, sum=0.661 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=9.251, mean=9.251, max=9.251, sum=9.251 (1)", - "tab": "Efficiency", - "score": 9.251234515365464 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=269.886, mean=269.886, max=269.886, sum=269.886 (1)", - "tab": "General information", - "score": 269.8856502242152 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823, - "details": { - "description": "min=0.823, mean=0.823, max=0.823, sum=0.823 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.337, mean=3.337, max=3.337, sum=3.337 (1)", - "tab": "Efficiency", - "score": 3.3367519599012 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=0.78 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=8.281, mean=8.281, max=8.281, sum=8.281 (1)", - "tab": "Efficiency", - "score": 8.280673936367036 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - 
"tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414, - "details": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.414 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=10.921, mean=10.921, max=10.921, sum=10.921 (1)", - "tab": "Efficiency", - "score": 10.921480923891068 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.708, mean=109.708, max=109.708, sum=109.708 (1)", - "tab": "General information", - "score": 109.708 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json b/data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json deleted file mode 100644 index 54503d043..000000000 --- a/data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/xai_grok-3-beta/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Grok 3 Beta", - "id": "xai/grok-3-beta", - "developer": "xai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.727, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 21.832675643266274 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=0.788 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=24.646, mean=24.646, max=24.646, sum=24.646 (1)", - "tab": "Efficiency", - "score": 24.646376408576966 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0.013, mean=0.013, max=0.013, sum=0.013 (1)", - "tab": "General information", - "score": 0.013 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=223.237, mean=223.237, max=223.237, sum=223.237 (1)", - "tab": "General information", - "score": 223.237 - }, - "MMLU-Pro - # output tokens": { - "description": "min=1669.743, mean=1669.743, max=1669.743, sum=1669.743 (1)", - "tab": "General information", - "score": 1669.743 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.65, mean=0.65, max=0.65, sum=0.65 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=30.888, mean=30.888, max=30.888, sum=30.888 (1)", - "tab": "Efficiency", - "score": 30.88756059317311 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "General information", - "score": 0.020179372197309416 - }, - "GPQA - # prompt tokens": { - "description": "min=233.054, mean=233.054, max=233.054, sum=233.054 (1)", - "tab": "General information", - "score": 233.05381165919283 - }, - "GPQA - # output tokens": { - "description": "min=2771.594, mean=2771.594, max=2771.594, sum=2771.594 (1)", - "tab": "General information", - "score": 2771.5941704035877 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": 
"false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.884, mean=0.884, max=0.884, sum=0.884 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=5.792, mean=5.792, max=5.792, sum=5.792 (1)", - "tab": "Efficiency", - "score": 5.791596473475261 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.192, mean=45.192, max=45.192, sum=45.192 (1)", - "tab": "General information", - "score": 45.19223659889094 - }, - "IFEval - # output tokens": { - "description": "min=404.85, mean=404.85, max=404.85, sum=404.85 (1)", - "tab": "General information", - "score": 404.8502772643253 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=0.849 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=16.937, mean=16.937, max=16.937, sum=16.937 (1)", - "tab": "Efficiency", - "score": 16.93687919616699 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1419.576, mean=1419.576, max=1419.576, sum=1419.576 (1)", - "tab": "General information", - "score": 1419.576 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.464, - "details": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.464 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=30.901, mean=30.901, max=30.901, sum=30.901 (1)", - "tab": "Efficiency", - "score": 30.90096554493904 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=104.089, mean=104.089, max=104.089, sum=104.089 (1)", - "tab": "General information", - "score": 104.089 - }, - "Omni-MATH - # output tokens": { - "description": "min=3296.733, mean=3296.733, max=3296.733, sum=3296.733 (1)", - "tab": "General information", - "score": 3296.733 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json b/data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json deleted file mode 100644 index a083c0183..000000000 --- a/data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/xai_grok-3-mini-beta/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Grok 3 mini Beta", - "id": "xai/grok-3-mini-beta", - "developer": "xai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 12.070258432341626 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.799, - "details": { - "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=7.153, mean=7.153, max=7.153, sum=7.153 (1)", - "tab": "Efficiency", - "score": 7.153050385713577 - }, - "MMLU-Pro - # eval": { 
- "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0.013, mean=0.013, max=0.013, sum=0.013 (1)", - "tab": "General information", - "score": 0.013 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=223.237, mean=223.237, max=223.237, sum=223.237 (1)", - "tab": "General information", - "score": 223.237 - }, - "MMLU-Pro - # output tokens": { - "description": "min=59.839, mean=59.839, max=59.839, sum=59.839 (1)", - "tab": "General information", - "score": 59.839 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=0.675 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=14.215, mean=14.215, max=14.215, sum=14.215 (1)", - "tab": "Efficiency", - "score": 14.215015458419185 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "General information", - "score": 0.020179372197309416 - }, - "GPQA - # prompt tokens": { - "description": "min=233.054, mean=233.054, max=233.054, sum=233.054 (1)", - "tab": "General information", - "score": 233.05381165919283 - }, - "GPQA - # output tokens": { - "description": "min=125.807, mean=125.807, max=125.807, sum=125.807 (1)", - "tab": "General information", - "score": 125.80717488789237 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.951, - "details": { - "description": "min=0.951, mean=0.951, max=0.951, sum=0.951 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=7.187, mean=7.187, max=7.187, sum=7.187 (1)", - "tab": "Efficiency", - "score": 7.187224511077797 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - 
"IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.192, mean=45.192, max=45.192, sum=45.192 (1)", - "tab": "General information", - "score": 45.19223659889094 - }, - "IFEval - # output tokens": { - "description": "min=347.104, mean=347.104, max=347.104, sum=347.104 (1)", - "tab": "General information", - "score": 347.10351201478744 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.651, - "details": { - "description": "min=0.651, mean=0.651, max=0.651, sum=0.651 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.787, mean=10.787, max=10.787, sum=10.787 (1)", - "tab": "Efficiency", - "score": 10.787254344463348 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=864.463, mean=864.463, max=864.463, sum=864.463 (1)", - "tab": "General information", - "score": 864.463 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318, - "details": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.318 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=21.009, mean=21.009, max=21.009, sum=21.009 (1)", - "tab": "Efficiency", - "score": 21.008747462034226 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=104.089, mean=104.089, max=104.089, sum=104.089 (1)", - "tab": "General information", - "score": 104.089 - }, - "Omni-MATH - # output tokens": { - "description": "min=183.116, mean=183.116, max=183.116, sum=183.116 (1)", 
- "tab": "General information", - "score": 183.116 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json b/data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json deleted file mode 100644 index a25562cb1..000000000 --- a/data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/xai_grok-4-0709/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Grok 4 0709", - "id": "xai/grok-4-0709", - "developer": "xai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 128.04182146459848 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=0.851 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=93.583, mean=93.583, max=93.583, sum=93.583 (1)", - "tab": "Efficiency", - "score": 93.58286614966393 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0.013, mean=0.013, max=0.013, sum=0.013 (1)", - "tab": "General information", - "score": 0.013 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=244.237, mean=244.237, max=244.237, sum=244.237 (1)", - "tab": "General information", - "score": 244.237 - }, - "MMLU-Pro - # output tokens": { - "description": "min=4.789, mean=4.789, max=4.789, sum=4.789 (1)", - "tab": "General information", - "score": 4.789 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=223.967, mean=223.967, max=223.967, sum=223.967 (1)", - "tab": "Efficiency", - "score": 223.96746500778625 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "General information", - "score": 0.020179372197309416 - }, - "GPQA - # prompt tokens": { - "description": "min=254.007, mean=254.007, max=254.007, sum=254.007 (1)", - "tab": "General information", - "score": 254.0067264573991 - }, - "GPQA - # output tokens": { - "description": "min=5.841, mean=5.841, max=5.841, sum=5.841 (1)", - "tab": "General information", - "score": 5.8408071748878925 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=0.949 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=31.966, mean=31.966, max=31.966, sum=31.966 (1)", - "tab": "Efficiency", - "score": 31.966069252786266 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.192, mean=45.192, max=45.192, sum=45.192 (1)", - "tab": "General information", - "score": 45.19223659889094 - }, - "IFEval - # output tokens": { - "description": "min=376.298, mean=376.298, max=376.298, sum=376.298 (1)", - "tab": "General information", - "score": 376.29759704251387 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.797, - "details": { - "description": "min=0.797, mean=0.797, max=0.797, sum=0.797 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=115.441, mean=115.441, max=115.441, sum=115.441 (1)", - "tab": "Efficiency", - "score": 115.44128810715675 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1553.96, mean=1553.96, max=1553.96, sum=1553.96 (1)", - "tab": "General information", - "score": 1553.96 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603, - "details": { - "description": "min=0.603, mean=0.603, max=0.603, sum=0.603 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=175.251, mean=175.251, max=175.251, sum=175.251 (1)", - "tab": "Efficiency", - "score": 175.2514188055992 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=104.089, mean=104.089, max=104.089, sum=104.089 (1)", - "tab": "General information", - "score": 104.089 - }, - "Omni-MATH - # output tokens": { - "description": "min=104.419, mean=104.419, max=104.419, sum=104.419 (1)", - "tab": "General information", - "score": 104.419 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json b/data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json deleted file mode 100644 index 43a98dd63..000000000 --- a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GLM-4.5-Air-FP8", - "id": "zai-org/glm-4.5-air-fp8", - "developer": "zai-org", - 
"inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 36.15586140714108 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.762, mean=0.762, max=0.762, sum=0.762 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=30.422, mean=30.422, max=30.422, sum=30.422 (1)", - "tab": "Efficiency", - "score": 30.421801055192947 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=250.402, mean=250.402, max=250.402, sum=250.402 (1)", - "tab": "General information", - "score": 250.402 - }, - "MMLU-Pro - # output tokens": { - "description": "min=4423.528, mean=4423.528, max=4423.528, sum=4423.528 (1)", - "tab": "General information", - "score": 4423.528 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.594, - "details": { - "description": "min=0.594, mean=0.594, max=0.594, sum=0.594 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=54.963, mean=54.963, max=54.963, sum=54.963 (1)", - "tab": "Efficiency", - "score": 54.96293809649121 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - 
"GPQA - # prompt tokens": { - "description": "min=269.978, mean=269.978, max=269.978, sum=269.978 (1)", - "tab": "General information", - "score": 269.9775784753363 - }, - "GPQA - # output tokens": { - "description": "min=8628.161, mean=8628.161, max=8628.161, sum=8628.161 (1)", - "tab": "General information", - "score": 8628.16143497758 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, sum=0.812 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=8.027, mean=8.027, max=8.027, sum=8.027 (1)", - "tab": "Efficiency", - "score": 8.026858968787625 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.026, mean=46.026, max=46.026, sum=46.026 (1)", - "tab": "General information", - "score": 46.02587800369686 - }, - "IFEval - # output tokens": { - "description": "min=1330.573, mean=1330.573, max=1330.573, sum=1330.573 (1)", - "tab": "General information", - "score": 1330.573012939002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=0.789 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=25.055, mean=25.055, max=25.055, sum=25.055 (1)", - "tab": "Efficiency", - "score": 25.055315640687944 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=4196.241, mean=4196.241, max=4196.241, sum=4196.241 (1)", - "tab": "General information", - "score": 4196.241 - } - } - }, - 
"generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391, - "details": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.391 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=62.312, mean=62.312, max=62.312, sum=62.312 (1)", - "tab": "Efficiency", - "score": 62.31239327454567 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.807, mean=109.807, max=109.807, sum=109.807 (1)", - "tab": "General information", - "score": 109.807 - }, - "Omni-MATH - # output tokens": { - "description": "min=11088.014, mean=11088.014, max=11088.014, sum=11088.014 (1)", - "tab": "General information", - "score": 11088.014 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json b/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json deleted file mode 100644 index 152223193..000000000 --- a/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/Anthropic-LM-v4-s3-52B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anthropic-LM v4-s3 52B", - "id": "Anthropic-LM-v4-s3-52B", - "developer": "unknown", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.8178973356392711 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7935577862997218 - }, - "Mean win rate - Efficiency": { - "description": null, - 
"tab": "Efficiency", - "score": 0.13822916666666668 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5930298633071189 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.648748165414832 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5306599832915623 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.481, - "details": { - "description": "min=0.25, mean=0.481, max=0.78, sum=7.22 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.063, mean=0.144, max=0.262, sum=2.165 (15)", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.17, mean=0.434, max=0.76, sum=6.513 (15)", - "tab": "Robustness", - "score": 0.43421052631578944 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.211, mean=0.447, max=0.74, sum=6.702 (15)", - "tab": "Fairness", - "score": 0.4467836257309941 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.556, mean=0.578, max=0.605, sum=8.664 (15)", - "tab": "Efficiency", - "score": 0.5775741999040572 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.814, mean=0.815, max=0.816, sum=2.446 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.035, mean=0.038, max=0.041, sum=0.114 (3)", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.751, mean=0.756, max=0.76, sum=2.269 (3)", - "tab": "Robustness", - "score": 0.7563333333333334 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.778, 
mean=0.782, max=0.788, sum=2.345 (3)", - "tab": "Fairness", - "score": 0.7816666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.566, mean=0.637, max=0.75, sum=1.912 (3)", - "tab": "Efficiency", - "score": 0.6371923081597224 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1.004, mean=1.004, max=1.004, sum=3.012 (3)", - "tab": "General information", - "score": 1.004 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.692, mean=0.728, max=0.748, sum=2.185 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.052, mean=0.09, max=0.14, sum=0.27 (3)", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.622, mean=0.663, max=0.693, sum=1.99 (3)", - "tab": "Robustness", - "score": 0.6634443166549867 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.614, mean=0.646, max=0.667, sum=1.939 (3)", - "tab": "Fairness", - "score": 0.6464650190039823 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=1.628, mean=1.722, max=1.839, sum=5.167 (3)", - "tab": "Efficiency", - "score": 1.7223421043622853 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3504.577, mean=3803.911, max=3972.577, sum=11411.732 (3)", - "tab": "General information", - "score": 3803.910798122066 - }, - "NarrativeQA - # output tokens": { - 
"description": "min=4.572, mean=6.952, max=8.434, sum=20.856 (3)", - "tab": "General information", - "score": 6.9521126760563385 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.333, mean=0.39, max=0.419, sum=1.169 (3)", - "tab": "Bias", - "score": 0.38950617283950617 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.19, mean=0.208, max=0.218, sum=0.624 (3)", - "tab": "Bias", - "score": 0.20792828096614854 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.013, max=0.014, sum=0.039 (3)", - "tab": "Toxicity", - "score": 0.013145539906103287 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.682, mean=0.686, max=0.693, sum=2.059 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.108, mean=0.121, max=0.128, sum=0.362 (3)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.048, mean=0.067, max=0.088, sum=0.2 (3)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.239, mean=0.245, max=0.248, sum=0.734 (3)", - "tab": "Robustness", - "score": 0.24480135198778494 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.608, mean=0.632, max=0.646, sum=1.897 (3)", - "tab": "Robustness", - "score": 0.6323821508652113 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.228, mean=0.239, max=0.244, sum=0.716 (3)", - "tab": "Fairness", - "score": 0.23855278160903723 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.639, mean=0.642, max=0.646, sum=1.927 (3)", - "tab": "Fairness", - "score": 0.6422159112855447 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.751, mean=0.777, max=0.821, sum=2.331 (3)", - "tab": "Efficiency", - "score": 0.7770150703124993 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=1.036, mean=1.102, max=1.15, sum=3.305 (3)", - "tab": "Efficiency", - "score": 1.1015715911458346 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.032, mean=5.47, max=6.183, sum=16.409 (3)", - "tab": "General information", - "score": 5.469666666666666 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.964, mean=4.964, max=4.965, sum=14.893 (3)", - "tab": "General information", - "score": 4.964333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.021 (3)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1381.066, mean=1592.701, max=1704.681, sum=4778.103 (3)", - "tab": "General information", - "score": 1592.701 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.429, mean=5.659, max=6.028, sum=16.976 (3)", - "tab": "General information", - "score": 5.658666666666666 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.358, mean=0.386, max=0.439, sum=1.158 (3)", - "tab": "Bias", - "score": 0.38616369646117926 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0, mean=0.148, max=0.237, sum=0.443 (3)", - "tab": "Bias", - "score": 0.1475748194014448 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.367, mean=0.429, max=0.5, sum=1.287 (3)", - "tab": "Bias", - "score": 0.4288888888888889 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.459, mean=0.48, max=0.498, sum=1.441 (3)", - "tab": "Bias", - "score": 0.48032222577096423 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.024, mean=0.043, max=0.079, sum=0.129 (3)", - "tab": "Bias", - "score": 0.043024227234753555 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { 
- "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431, - "details": { - "description": "min=0.41, mean=0.431, max=0.443, sum=1.294 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.034, mean=0.039, max=0.048, sum=0.116 (3)", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.303, mean=0.313, max=0.324, sum=0.938 (3)", - "tab": "Robustness", - "score": 0.31252831855461766 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.338, mean=0.356, max=0.365, sum=1.067 (3)", - "tab": "Fairness", - "score": 0.35555313427706087 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=3.472, mean=3.694, max=4.123, sum=11.082 (3)", - "tab": "Efficiency", - "score": 3.6939938854166683 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=4676.788, mean=5199.788, max=5842.788, sum=15599.364 (3)", - "tab": "General information", - "score": 5199.788 - }, - "QuAC - # output tokens": { - "description": "min=32.106, mean=35.484, max=40.222, sum=106.452 (3)", - "tab": "General information", - "score": 35.484 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.604, mean=0.609, max=0.614, sum=1.827 (3)", - "tab": "Bias", - "score": 0.6088490550046614 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.405, mean=0.419, max=0.441, sum=1.257 (3)", - "tab": "Bias", - "score": 0.4190730790466706 - }, - "QuAC - Representation (race)": { - "description": "min=0.283, mean=0.321, max=0.341, sum=0.964 (3)", - "tab": "Bias", - "score": 0.32117266495855845 - }, - "QuAC - Representation (gender)": { - "description": "min=0.246, mean=0.248, max=0.249, sum=0.743 (3)", - "tab": "Bias", - "score": 0.24753349327018945 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.002, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=0.807 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.32 (1)", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.766, mean=0.766, max=0.766, sum=0.766 (1)", - "tab": "Robustness", - "score": 0.766 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.695, mean=0.695, max=0.695, sum=0.695 (1)", - 
"tab": "Fairness", - "score": 0.695 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.549, mean=0.549, max=0.549, sum=0.549 (1)", - "tab": "Efficiency", - "score": 0.5491151875000004 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=1.306, mean=1.306, max=1.306, sum=1.306 (1)", - "tab": "General information", - "score": 1.306 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.558, - "details": { - "description": "min=0.558, mean=0.558, max=0.558, sum=0.558 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.244, mean=0.244, max=0.244, sum=0.244 (1)", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.472 (1)", - "tab": "Robustness", - "score": 0.472 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.482 (1)", - "tab": "Fairness", - "score": 0.482 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.447, mean=0.447, max=0.447, sum=0.447 (1)", - "tab": "Efficiency", - "score": 0.4465652265625003 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.132 (1)", - "tab": "General information", - "score": 0.132 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.368, - "details": { - "description": "min=0.298, mean=0.368, max=0.408, sum=1.472 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.052, mean=0.127, max=0.196, sum=0.507 (4)", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.263, mean=0.326, max=0.388, sum=1.304 (4)", - "tab": "Robustness", - "score": 0.3260703363914373 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.229, mean=0.3, max=0.388, sum=1.202 (4)", - "tab": "Fairness", - "score": 0.3004587155963303 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.503, mean=0.568, max=0.603, sum=2.273 (4)", - "tab": "Efficiency", - "score": 0.5683649633565078 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=85.121, mean=404.621, max=529.121, sum=1618.483 (4)", - "tab": "General information", - "score": 404.62079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "min=0.625, mean=0.642, max=0.66, sum=1.925 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.28, mean=0.308, max=0.326, sum=0.925 (3)", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.589, mean=0.592, max=0.594, sum=1.776 (3)", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.305, mean=0.345, max=0.369, sum=1.036 (3)", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.589, mean=0.609, max=0.63, sum=1.828 (3)", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.55, mean=0.578, max=0.599, sum=1.733 (3)", - "tab": "Efficiency", - "score": 0.5778111061197916 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.547, mean=0.587, max=0.608, sum=1.76 (3)", - "tab": "Efficiency", - "score": 0.5865037397044573 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) 
- # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1.005, max=1.014, sum=3.014 (3)", - "tab": "General information", - "score": 1.0046666666666668 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.154, - "details": { - "description": "min=0.142, mean=0.154, max=0.17, sum=0.927 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=3.898, mean=4.076, max=4.414, sum=24.459 (6)", - "tab": "Efficiency", - "score": 4.076441398798879 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=54.895, mean=58.035, max=64.039, sum=348.21 (6)", - "tab": "General information", - "score": 58.035050071530755 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.6, mean=0.616, max=0.642, sum=3.694 (6)", - "tab": "Bias", - "score": 0.6157343144185249 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.4, mean=0.412, max=0.426, sum=2.474 (6)", - "tab": "Bias", - "score": 0.41239374128525014 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.241, mean=0.252, max=0.26, sum=1.514 (6)", - "tab": "Bias", - "score": 0.2523476523476524 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.075, mean=0.093, max=0.102, sum=0.555 (6)", - "tab": "Bias", - "score": 0.09258312556525572 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.473, mean=0.492, max=0.515, sum=1.477 (3)", - "tab": "Summarization metrics", - "score": 0.4923968635744633 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.385, mean=4.692, max=4.898, sum=28.151 (6)", - "tab": "Summarization metrics", - "score": 4.691904356057608 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.315, mean=0.326, max=0.342, sum=0.979 (3)", - "tab": "Summarization metrics", - "score": 0.32642089401655566 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.953, mean=0.96, max=0.968, sum=5.762 (6)", - "tab": "Summarization metrics", - "score": 0.9602766718208816 - }, - "CNN/DailyMail - Density": { - "description": "min=9.043, mean=10.832, max=14.179, sum=64.991 (6)", - "tab": "Summarization metrics", - "score": 10.831883037736205 - }, - "CNN/DailyMail - Compression": { - "description": "min=10.561, mean=11.89, max=12.628, sum=71.339 (6)", - "tab": "Summarization metrics", - "score": 11.889831050263881 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Summarization metrics", - "score": 0.6666666666666666 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=4, mean=4, max=4, sum=8 (2)", - "tab": "Summarization metrics", - "score": 4.0 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=2.667, mean=2.667, 
max=2.667, sum=5.333 (2)", - "tab": "Summarization metrics", - "score": 2.6666666666666665 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.134, - "details": { - "description": "min=0.131, mean=0.134, max=0.137, sum=0.804 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=2.357, mean=2.408, max=2.45, sum=14.45 (6)", - "tab": "Efficiency", - "score": 2.408301637575076 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.735, max=1539.402, sum=9064.409 (6)", - "tab": "General information", - "score": 1510.734877734878 - }, - "XSUM - # output tokens": { - "description": "min=28.284, mean=28.94, max=29.546, sum=173.637 (6)", - "tab": "General information", - "score": 28.93951093951094 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.415, mean=0.439, max=0.454, sum=2.637 (6)", - "tab": "Bias", - "score": 0.43949621664675426 - }, - "XSUM - Representation (race)": { - "description": "min=0.497, mean=0.541, max=0.59, sum=3.246 (6)", - "tab": "Bias", - "score": 0.54094360657117 - }, - "XSUM - Representation (gender)": { - "description": "min=0.189, mean=0.207, max=0.22, sum=1.244 (6)", - "tab": "Bias", - "score": 0.20735056882648284 - }, - "XSUM - Toxic fraction": { - "description": "min=0.002, mean=0.004, max=0.006, sum=0.023 (6)", - "tab": "Toxicity", - "score": 0.0038610038610038615 - }, - "XSUM - SummaC": { - "description": "min=-0.278, mean=-0.271, max=-0.263, sum=-0.812 (3)", - "tab": "Summarization metrics", - "score": -0.2708329675740717 - }, - "XSUM - QAFactEval": { - "description": "min=2.934, mean=3.066, max=3.179, sum=18.394 (6)", - "tab": "Summarization metrics", - "score": 3.0656965498353155 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.434, mean=0.437, max=0.441, sum=1.311 (3)", - "tab": "Summarization metrics", - "score": 0.4370376831136327 - }, - "XSUM - Coverage": { - "description": "min=0.806, mean=0.808, max=0.811, sum=4.849 (6)", - "tab": "Summarization metrics", - "score": 0.8082245669950062 - }, - "XSUM - Density": { - "description": "min=2.656, mean=2.691, max=2.726, sum=16.146 (6)", - "tab": "Summarization metrics", - "score": 2.6910357109145138 - }, - "XSUM - Compression": { - "description": "min=14.828, mean=15.182, max=15.567, sum=91.094 (6)", - "tab": "Summarization metrics", - "score": 15.182390855675616 - }, - "XSUM - HumanEval-faithfulness": { - "description": 
"min=0.667, mean=0.778, max=0.889, sum=4.667 (6)", - "tab": "Summarization metrics", - "score": 0.7777777777777777 - }, - "XSUM - HumanEval-relevance": { - "description": "min=4.333, mean=4.398, max=4.444, sum=26.389 (6)", - "tab": "Summarization metrics", - "score": 4.398148148148148 - }, - "XSUM - HumanEval-coherence": { - "description": "min=4.889, mean=4.898, max=4.917, sum=29.389 (6)", - "tab": "Summarization metrics", - "score": 4.898148148148149 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.924, mean=0.934, max=0.948, sum=2.802 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.01, mean=0.015, max=0.024, sum=0.045 (3)", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.921, mean=0.928, max=0.94, sum=2.783 (3)", - "tab": "Robustness", - "score": 0.9276666666666666 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.918, mean=0.925, max=0.936, sum=2.775 (3)", - "tab": "Fairness", - "score": 0.9249999999999999 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.714, mean=0.79, max=0.897, sum=2.37 (3)", - "tab": "Efficiency", - "score": 0.7899130366753467 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1155.212, mean=1422.545, max=1836.212, sum=4267.636 (3)", - "tab": "General information", - "score": 1422.5453333333335 - }, - "IMDB - # output tokens": { - "description": "min=1.002, mean=1.014, max=1.02, sum=3.042 (3)", - "tab": "General information", - "score": 1.014 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "details": { - "description": "min=0.182, mean=0.61, max=0.939, sum=32.915 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.048, mean=0.179, max=0.449, sum=9.655 (54)", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.105, mean=0.514, max=0.854, sum=27.755 (54)", - "tab": "Robustness", - "score": 0.5139820592784173 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.105, mean=0.512, max=0.939, sum=27.636 (54)", - "tab": "Fairness", - "score": 0.5117722022150621 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.555, mean=0.594, max=0.756, sum=32.071 (54)", - "tab": "Efficiency", - "score": 0.5939081200798796 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.699, - "details": { - "description": "min=0.225, mean=0.699, max=0.95, sum=23.075 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.071, mean=0.212, max=0.648, sum=7.002 (33)", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.225, mean=0.6, max=0.95, sum=19.8 (33)", - "tab": "Robustness", - "score": 0.6000000000000001 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.225, mean=0.67, max=0.95, sum=22.1 (33)", - "tab": "Fairness", - "score": 0.6696969696969697 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.583, 
mean=0.883, max=2.075, sum=29.139 (33)", - "tab": "Efficiency", - "score": 0.8829963013928345 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=5, mean=5, max=5, sum=165 (33)", - "tab": "General information", - "score": 5.0 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=1279.572, max=6599.65, sum=42225.875 (33)", - "tab": "General information", - "score": 1279.5719696969697 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=2.986, max=5.3, sum=98.55 (33)", - "tab": "General information", - "score": 2.9863636363636363 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json b/data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json deleted file mode 100644 index 6a9a41b41..000000000 --- a/data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_J1-Grande-v1-17B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "J1-Grande v1 17B", - "id": "ai21/J1-Grande-v1-17B", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.433, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6221919576066971 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.4225080073800875 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.4539316449216338 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.31716008771929827 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean 
win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5580147362700336 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6300489633822968 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6689640768588138 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.2, mean=0.27, max=0.35, sum=4.047 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.063, mean=0.114, max=0.154, sum=1.708 (15)", - "tab": "Calibration", - "score": 0.11389257817699022 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.15, mean=0.225, max=0.27, sum=3.377 (15)", - "tab": "Robustness", - "score": 0.22511111111111112 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.158, mean=0.232, max=0.29, sum=3.474 (15)", - "tab": "Fairness", - "score": 0.23159064327485382 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.381, mean=0.411, max=0.466, sum=6.166 (15)", - "tab": "Efficiency", - "score": 0.41104061293859656 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=5951.098 (15)", - "tab": "General information", - "score": 396.73985964912276 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "description": "min=0.712, mean=0.722, max=0.733, sum=2.165 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.139, mean=0.154, max=0.169, sum=0.462 (3)", - "tab": "Calibration", - "score": 0.15409092997354776 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.632, mean=0.643, max=0.658, sum=1.929 (3)", - "tab": "Robustness", - "score": 0.6429999999999999 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.656, mean=0.678, max=0.695, sum=2.035 (3)", - "tab": "Fairness", - "score": 0.6783333333333333 - }, - "BoolQ - Denoised inference time (s)": { - 
"description": "min=0.47, mean=0.535, max=0.624, sum=1.606 (3)", - "tab": "Efficiency", - "score": 0.5352501416015627 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=506.985, mean=694.652, max=952.985, sum=2083.955 (3)", - "tab": "General information", - "score": 694.6516666666666 - }, - "BoolQ - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672, - "details": { - "description": "min=0.664, mean=0.672, max=0.68, sum=2.016 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.039, mean=0.047, max=0.062, sum=0.141 (3)", - "tab": "Calibration", - "score": 0.04705310707412085 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.409, mean=0.477, max=0.522, sum=1.432 (3)", - "tab": "Robustness", - "score": 0.47749086119263257 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.526, mean=0.547, max=0.563, sum=1.641 (3)", - "tab": "Fairness", - "score": 0.5469545337986748 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.892, mean=0.923, max=0.955, sum=2.769 (3)", - "tab": "Efficiency", - "score": 0.9228662338615026 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.166, mean=2.639, max=3.225, sum=7.918 (3)", - "tab": "General information", - "score": 2.63943661971831 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1598.614, mean=1692.218, max=1777.299, sum=5076.654 (3)", - "tab": "General information", - "score": 1692.2178403755868 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.324, mean=4.528, max=4.701, sum=13.583 (3)", - "tab": "General information", - "score": 
4.527699530516432 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.15, mean=0.164, max=0.18, sum=0.491 (3)", - "tab": "Bias", - "score": 0.1636261091893518 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.014, max=0.017, sum=0.042 (3)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.561, mean=0.578, max=0.59, sum=1.734 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.027, mean=0.029, max=0.03, sum=0.087 (3)", - "tab": "Calibration", - "score": 0.028955351873343083 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.073, mean=0.081, max=0.097, sum=0.243 (3)", - "tab": "Calibration", - "score": 0.08114120238748938 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.164, mean=0.17, max=0.175, sum=0.511 (3)", - "tab": "Robustness", - "score": 0.17025794044565556 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.449, mean=0.478, max=0.494, sum=1.433 (3)", - "tab": "Robustness", - "score": 0.4776074011626843 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.185, mean=0.187, max=0.189, sum=0.562 (3)", - "tab": "Fairness", - "score": 0.1872477522460834 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.501, mean=0.521, max=0.534, sum=1.563 (3)", - "tab": "Fairness", - "score": 0.5209919156580172 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.437, mean=0.466, max=0.494, sum=1.399 (3)", - "tab": "Efficiency", - "score": 0.46640491796874967 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.774, mean=0.873, max=0.927, sum=2.618 (3)", - "tab": "Efficiency", - "score": 0.8728225097656246 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt 
tokens": { - "description": "min=94.377, mean=99.377, max=102.377, sum=298.131 (3)", - "tab": "General information", - "score": 99.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.791, mean=5.971, max=7.18, sum=17.913 (3)", - "tab": "General information", - "score": 5.971 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.568, mean=4.666, max=4.734, sum=13.999 (3)", - "tab": "General information", - "score": 4.666333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.114 (3)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1136.933, mean=1418.457, max=1595.508, sum=4255.37 (3)", - "tab": "General information", - "score": 1418.4566666666667 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.302, mean=6.538, max=6.976, sum=19.615 (3)", - "tab": "General information", - "score": 6.538333333333333 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.473, mean=0.521, max=0.556, sum=1.564 (3)", - "tab": "Bias", - "score": 0.5214747518446415 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0, mean=0.033, max=0.1, sum=0.1 (3)", - "tab": "Bias", - "score": 0.033333333333333326 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=1.038 (3)", - "tab": "Bias", - "score": 0.3461538461538461 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.45, mean=0.488, max=0.521, sum=1.463 (3)", - "tab": "Bias", - "score": 0.48764942579375564 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.111, mean=0.113, max=0.118, sum=0.34 (3)", - "tab": "Bias", - "score": 0.11339991677070331 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.362, - "details": { - "description": "min=0.355, mean=0.362, max=0.372, sum=1.087 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.019, mean=0.036, max=0.06, sum=0.107 (3)", - "tab": "Calibration", - "score": 0.03571925908384949 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.215, mean=0.219, max=0.227, sum=0.658 (3)", - "tab": "Robustness", - "score": 0.21921244416502939 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.266, mean=0.274, max=0.282, sum=0.821 (3)", - "tab": "Fairness", - "score": 0.27362985580399246 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.302, mean=1.413, max=1.478, sum=4.24 (3)", - "tab": "Efficiency", - "score": 1.4134776341145843 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=1.788, mean=1.829, max=1.88, sum=5.486 (3)", - "tab": "General information", - "score": 1.8286666666666667 - }, - "QuAC - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "General information", - "score": 0.001 - }, - "QuAC - # prompt tokens": { - "description": "min=1645.856, mean=1698.711, max=1730.814, sum=5096.134 (3)", - "tab": "General information", - "score": 1698.7113333333334 - }, - "QuAC - # output tokens": { - "description": "min=22.154, mean=27.786, max=31.692, sum=83.357 (3)", - "tab": "General information", - "score": 27.785666666666668 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.58, mean=0.6, max=0.639, sum=1.799 (3)", - "tab": "Bias", - "score": 0.5996635891593876 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.415, mean=0.428, max=0.44, sum=1.283 (3)", - "tab": "Bias", - "score": 0.42780085419627883 - }, - "QuAC - Representation (race)": { - "description": "min=0.298, mean=0.34, max=0.378, sum=1.019 (3)", - "tab": "Bias", - "score": 0.3397817992618246 - }, - "QuAC - Representation (gender)": { - "description": "min=0.237, mean=0.242, max=0.25, sum=0.727 (3)", - "tab": "Bias", - "score": 0.24231770708576347 - }, - "QuAC - Toxic fraction": { - "description": "min=0.004, mean=0.004, max=0.004, sum=0.012 (3)", - "tab": "Toxicity", - "score": 0.004 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.739, mean=0.739, max=0.739, sum=0.739 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.213, mean=0.213, max=0.213, sum=0.213 (1)", - "tab": "Calibration", - "score": 0.21338082493857388 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.695, mean=0.695, max=0.695, sum=0.695 (1)", - "tab": "Robustness", - "score": 0.695 - }, - "HellaSwag - EM (Fairness)": 
{ - "description": "min=0.58, mean=0.58, max=0.58, sum=0.58 (1)", - "tab": "Fairness", - "score": 0.58 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.33 (1)", - "tab": "Efficiency", - "score": 0.3304377109375 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=62.466, mean=62.466, max=62.466, sum=62.466 (1)", - "tab": "General information", - "score": 62.466 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=0.52 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.258, mean=0.258, max=0.258, sum=0.258 (1)", - "tab": "Calibration", - "score": 0.25849314658751343 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.424 (1)", - "tab": "Robustness", - "score": 0.424 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.472 (1)", - "tab": "Fairness", - "score": 0.472 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 (1)", - "tab": "Efficiency", - "score": 0.280719578125 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=4.348, mean=4.348, max=4.348, sum=4.348 (1)", - "tab": "General information", - "score": 4.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.193, - "details": { - "description": "min=0.171, mean=0.193, max=0.217, sum=0.58 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.064, mean=0.091, max=0.109, sum=0.273 (3)", - "tab": "Calibration", - "score": 0.09083831911084679 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.116, mean=0.142, max=0.159, sum=0.425 (3)", - "tab": "Robustness", - "score": 0.1416921508664628 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.138, mean=0.163, max=0.182, sum=0.489 (3)", - "tab": "Fairness", - "score": 0.16309887869520898 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.384, mean=0.396, max=0.403, sum=1.189 (3)", - "tab": "Efficiency", - "score": 0.39626294915902127 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=317.682, mean=355.015, max=375.682, sum=1065.046 (3)", - "tab": "General information", - "score": 355.0152905198777 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.341, - "details": { - "description": "min=0.31, mean=0.341, max=0.389, sum=1.022 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.105, mean=0.121, max=0.133, sum=0.362 (3)", - "tab": "Robustness", - "score": 0.12069748677248683 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.27, mean=0.297, max=0.328, sum=0.89 (3)", - "tab": "Robustness", - "score": 0.29680328755123014 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.126, mean=0.138, max=0.155, sum=0.414 (3)", - "tab": "Fairness", - "score": 0.1378972222222222 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.296, mean=0.328, max=0.372, sum=0.985 (3)", - "tab": "Fairness", - "score": 0.3284974893691146 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.415, mean=0.428, max=0.44, sum=1.283 (3)", - "tab": "Efficiency", - "score": 0.4278073636067708 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.412, mean=0.424, max=0.437, sum=1.272 (3)", - "tab": "Efficiency", - "score": 0.42392066375968995 - }, - "MS MARCO (regular) - # eval": { - "description": 
"min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=349.303, mean=385.636, max=423.303, sum=1156.909 (3)", - "tab": "General information", - "score": 385.63633333333337 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=2.004, mean=2.011, max=2.023, sum=6.034 (3)", - "tab": "General information", - "score": 2.0113333333333334 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=337.047, mean=373.38, max=411.047, sum=1120.14 (3)", - "tab": "General information", - "score": 373.3798449612403 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=2.023, mean=2.023, max=2.023, sum=6.07 (3)", - "tab": "General information", - "score": 2.0232558139534884 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.143, - "details": { - "description": "min=0.127, mean=0.143, max=0.163, sum=0.859 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.956, mean=2.074, max=2.263, sum=12.445 (6)", - "tab": "Efficiency", - "score": 2.074164002425339 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1203.032, mean=1213.032, max=1224.032, sum=7278.193 (6)", - "tab": "General information", - "score": 1213.0321888412018 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=61.569, mean=67.049, max=76.034, sum=402.296 (6)", - "tab": "General information", - "score": 67.04935622317596 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.608, mean=0.633, max=0.647, sum=3.801 (6)", - "tab": "Bias", - "score": 0.6334968330766649 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.39, mean=0.4, max=0.407, sum=2.398 (6)", - "tab": "Bias", - "score": 0.39959768497778553 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.263, mean=0.351, max=0.399, sum=2.104 (6)", - "tab": "Bias", - "score": 0.3506178570090534 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.115, mean=0.13, max=0.14, sum=0.782 (6)", - "tab": "Bias", - "score": 0.1303299541894603 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.514, mean=0.539, max=0.586, sum=1.617 (3)", - "tab": "Summarization metrics", - "score": 0.5391092885196874 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.706, mean=4.81, max=4.896, sum=28.859 (6)", - "tab": "Summarization metrics", - "score": 4.809910581145076 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.247, mean=0.275, max=0.302, sum=0.824 (3)", - "tab": "Summarization metrics", - "score": 0.2747429286177279 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.966, mean=0.973, max=0.984, sum=5.84 (6)", - "tab": "Summarization metrics", - "score": 0.9733042514029583 - }, - "CNN/DailyMail - Density": { - "description": "min=31.118, mean=41.027, max=60.066, sum=246.163 (6)", - "tab": "Summarization metrics", - "score": 41.02711755812993 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.092, mean=9.888, max=11.258, sum=59.326 (6)", - "tab": "Summarization metrics", - "score": 9.887609814491976 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail 
- HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.122, - "details": { - "description": "min=0.118, mean=0.122, max=0.127, sum=0.733 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.055, mean=1.07, max=1.082, sum=6.42 (6)", - "tab": "Efficiency", - "score": 1.0700079645773009 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1099.388, mean=1133.388, max=1172.388, sum=6800.328 (6)", - "tab": "General information", - "score": 1133.388030888031 - }, - "XSUM - # output tokens": { - "description": "min=19.975, mean=20.468, max=21.141, sum=122.807 (6)", - "tab": "General information", - "score": 20.467824967824967 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.417, mean=0.442, max=0.485, sum=2.652 (6)", - "tab": "Bias", - "score": 0.44203142536475876 - }, - "XSUM - Representation (race)": { - "description": "min=0.439, mean=0.557, max=0.667, sum=3.34 (6)", - "tab": "Bias", - "score": 0.5566296694116243 - }, - "XSUM - Representation (gender)": { - "description": "min=0.149, mean=0.171, max=0.211, sum=1.025 (6)", - "tab": "Bias", - "score": 0.17086307216738958 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.282, mean=-0.272, max=-0.264, sum=-0.815 (3)", - "tab": "Summarization metrics", - "score": -0.2715132814883572 - }, - "XSUM - QAFactEval": { - "description": "min=3.221, mean=3.447, max=3.575, sum=20.68 (6)", - "tab": "Summarization metrics", - "score": 3.446713620425662 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.424, mean=0.429, max=0.434, sum=1.287 (3)", - "tab": "Summarization metrics", - "score": 0.4288941077256343 - }, - "XSUM - Coverage": { - "description": "min=0.78, mean=0.783, max=0.785, sum=4.696 (6)", - "tab": "Summarization metrics", - "score": 0.7826042118856411 - }, - "XSUM - Density": { - "description": "min=2.514, mean=2.64, max=2.767, sum=15.838 (6)", - "tab": "Summarization metrics", - "score": 2.6397086455700927 - }, - "XSUM - Compression": { - 
"description": "min=18.382, mean=19.012, max=19.445, sum=114.069 (6)", - "tab": "Summarization metrics", - "score": 19.011567725134377 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.953, - "details": { - "description": "min=0.947, mean=0.953, max=0.957, sum=2.859 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.152, mean=0.158, max=0.166, sum=0.473 (3)", - "tab": "Calibration", - "score": 0.15775206410447826 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.932, mean=0.941, max=0.948, sum=2.822 (3)", - "tab": "Robustness", - "score": 0.9406666666666667 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.94, mean=0.946, max=0.95, sum=2.839 (3)", - "tab": "Fairness", - "score": 0.9463333333333331 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.59, mean=0.732, max=0.881, sum=2.197 (3)", - "tab": "Efficiency", - "score": 0.7321998525390631 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.915, mean=4.972, max=5, sum=14.915 (3)", - "tab": "General information", - "score": 4.971666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=853.851, mean=1281.577, max=1725.03, sum=3844.732 (3)", - "tab": "General information", - "score": 1281.5773333333334 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529, - "details": { - "description": "min=0.014, mean=0.529, max=0.991, sum=28.55 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.228, mean=0.408, max=0.593, sum=22.008 (54)", - "tab": "Calibration", - "score": 0.4075612338805137 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.014, mean=0.417, max=0.938, sum=22.51 (54)", - "tab": "Robustness", - "score": 0.41686056018907397 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.014, mean=0.482, max=0.962, sum=26.023 (54)", - "tab": "Fairness", - "score": 0.4819034071645267 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.418, mean=0.482, max=0.621, sum=26.002 (54)", - "tab": "Efficiency", - "score": 0.48152748003997736 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=271.927, mean=532.602, max=942.498, sum=28760.487 (54)", - "tab": "General information", - "score": 532.6016121330534 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.658, - "details": { - "description": "min=0.2, mean=0.658, max=0.975, sum=21.7 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.113, mean=0.244, max=0.466, sum=8.048 (33)", - "tab": "Calibration", - "score": 0.24386423436086976 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.513, max=0.775, sum=16.925 (33)", - "tab": "Robustness", - "score": 0.5128787878787878 - }, - "RAFT - EM (Fairness)": { - 
"description": "min=0.175, mean=0.636, max=0.975, sum=21 (33)", - "tab": "Fairness", - "score": 0.6363636363636364 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.401, mean=0.59, max=0.888, sum=19.483 (33)", - "tab": "Efficiency", - "score": 0.5903971827651516 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.95, mean=4.658, max=5, sum=153.7 (33)", - "tab": "General information", - "score": 4.657575757575757 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=212.25, mean=712.248, max=1745.25, sum=23504.175 (33)", - "tab": "General information", - "score": 712.2477272727273 - }, - "RAFT - # output tokens": { - "description": "min=1.95, mean=3.59, max=6.575, sum=118.475 (33)", - "tab": "General information", - "score": 3.590151515151515 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json b/data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json deleted file mode 100644 index 30c92ab94..000000000 --- a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_J1-Grande-v2-beta-17B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "J1-Grande v2 beta 17B", - "id": "ai21/J1-Grande-v2-beta-17B", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6340622537431048 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.7106770870953296 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6771299149497148 - 
}, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5919924787763542 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5063399563399563 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6776315789473685 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445, - "details": { - "description": "min=0.23, mean=0.445, max=0.8, sum=6.677 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.067, mean=0.139, max=0.205, sum=2.09 (15)", - "tab": "Calibration", - "score": 0.13930239849591303 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.2, mean=0.392, max=0.73, sum=5.887 (15)", - "tab": "Robustness", - "score": 0.39245614035087717 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.19, mean=0.409, max=0.77, sum=6.142 (15)", - "tab": "Fairness", - "score": 0.4094619883040936 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=5951.098 (15)", - "tab": "General information", - "score": 396.73985964912276 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.799, mean=0.812, max=0.823, sum=2.437 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.155, mean=0.167, max=0.185, sum=0.5 (3)", - "tab": "Calibration", - "score": 0.16655399552246586 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.669, mean=0.692, max=0.714, sum=2.077 (3)", - "tab": "Robustness", - "score": 0.6923333333333334 - }, - 
"BoolQ - EM (Fairness)": { - "description": "min=0.751, mean=0.764, max=0.784, sum=2.291 (3)", - "tab": "Fairness", - "score": 0.7636666666666668 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=506.985, mean=694.652, max=952.985, sum=2083.955 (3)", - "tab": "General information", - "score": 694.6516666666666 - }, - "BoolQ - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "details": { - "description": "min=0.712, mean=0.725, max=0.736, sum=2.176 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.034, mean=0.041, max=0.05, sum=0.122 (3)", - "tab": "Calibration", - "score": 0.040831012535009516 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.484, mean=0.565, max=0.616, sum=1.694 (3)", - "tab": "Robustness", - "score": 0.5646966401263148 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.622, mean=0.647, max=0.665, sum=1.941 (3)", - "tab": "Fairness", - "score": 0.6470593497686433 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.166, mean=2.639, max=3.225, sum=7.918 (3)", - "tab": "General information", - "score": 2.63943661971831 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1598.614, mean=1692.218, max=1777.299, sum=5076.654 (3)", - "tab": "General information", - "score": 1692.2178403755868 - }, - 
"NarrativeQA - # output tokens": { - "description": "min=4.194, mean=4.6, max=5.011, sum=13.8 (3)", - "tab": "General information", - "score": 4.6 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.25, mean=0.3, max=0.4, sum=0.9 (3)", - "tab": "Bias", - "score": 0.3 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.156, mean=0.179, max=0.205, sum=0.536 (3)", - "tab": "Bias", - "score": 0.1787801116945903 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.014, max=0.017, sum=0.042 (3)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.622, mean=0.625, max=0.628, sum=1.874 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.031, mean=0.036, max=0.043, sum=0.107 (3)", - "tab": "Calibration", - "score": 0.035782131071618734 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.051, mean=0.065, max=0.075, sum=0.196 (3)", - "tab": "Calibration", - "score": 0.06520649617008285 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.23, mean=0.235, max=0.241, sum=0.705 (3)", - "tab": "Robustness", - "score": 0.2349124459413927 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.556, mean=0.56, max=0.568, sum=1.681 (3)", - "tab": "Robustness", - "score": 0.5603824984507094 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.269, mean=0.27, max=0.27, sum=0.81 (3)", - "tab": "Fairness", - "score": 0.269872960171523 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.562, mean=0.571, max=0.578, sum=1.714 (3)", - "tab": "Fairness", - "score": 0.5712438797598854 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General 
information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=94.377, mean=99.377, max=102.377, sum=298.131 (3)", - "tab": "General information", - "score": 99.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.471, mean=5.282, max=6.145, sum=15.846 (3)", - "tab": "General information", - "score": 5.282 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.568, mean=4.666, max=4.734, sum=13.999 (3)", - "tab": "General information", - "score": 4.666333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.114 (3)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1136.933, mean=1418.457, max=1595.508, sum=4255.37 (3)", - "tab": "General information", - "score": 1418.4566666666667 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.132, mean=5.27, max=5.521, sum=15.809 (3)", - "tab": "General information", - "score": 5.269666666666667 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.288, mean=0.392, max=0.491, sum=1.177 (3)", - "tab": "Bias", - "score": 0.3923268084547134 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.026, mean=0.174, max=0.318, sum=0.522 (3)", - "tab": "Bias", - "score": 0.17397232083140401 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.1, mean=0.167, max=0.3, sum=0.5 (3)", - "tab": "Bias", - "score": 0.16666666666666666 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.478, mean=0.488, max=0.498, sum=1.465 (3)", - "tab": "Bias", - "score": 0.48822694742885336 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.369, mean=0.381, max=0.394, sum=1.143 (3)", - "tab": "Bias", - "score": 0.38112988257848074 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392, - "details": { - "description": "min=0.375, mean=0.392, max=0.411, sum=1.177 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.031, mean=0.04, max=0.051, sum=0.121 (3)", - "tab": "Calibration", - "score": 0.04046561186462396 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.232, mean=0.251, max=0.261, sum=0.752 (3)", - "tab": "Robustness", - "score": 0.2506588392587418 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.297, mean=0.308, max=0.319, sum=0.923 (3)", - "tab": "Fairness", - "score": 0.30759220119907554 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=1.788, mean=1.829, max=1.88, sum=5.486 (3)", - "tab": "General information", - "score": 1.8286666666666667 - }, - "QuAC - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "General information", - "score": 0.001 - }, - "QuAC - # prompt tokens": { - "description": "min=1645.856, mean=1698.711, max=1730.814, sum=5096.134 (3)", - "tab": "General information", - "score": 1698.7113333333334 - }, - "QuAC - # output tokens": { - "description": "min=19.318, mean=23.053, max=25.3, sum=69.158 (3)", - "tab": "General information", - "score": 23.052666666666667 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.583, mean=0.628, max=0.66, sum=1.884 (3)", - "tab": "Bias", - "score": 0.6279609279609281 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.396, mean=0.411, max=0.426, sum=1.232 (3)", - "tab": "Bias", - "score": 0.41081218336807646 - }, - "QuAC - Representation (race)": { - "description": "min=0.302, mean=0.327, max=0.359, sum=0.981 (3)", - "tab": "Bias", - "score": 0.3270316371542728 - }, - "QuAC - Representation (gender)": { - "description": "min=0.198, mean=0.225, max=0.241, sum=0.676 (3)", - "tab": "Bias", - "score": 0.22518777152451866 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.004, sum=0.01 (3)", - "tab": "Toxicity", - "score": 0.0033333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.764, - "details": { - "description": "min=0.764, mean=0.764, max=0.764, sum=0.764 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.226, mean=0.226, max=0.226, sum=0.226 (1)", - "tab": "Calibration", - "score": 0.2263163700416937 - }, - "HellaSwag - EM (Robustness)": { - 
"description": "min=0.732, mean=0.732, max=0.732, sum=0.732 (1)", - "tab": "Robustness", - "score": 0.732 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.623, mean=0.623, max=0.623, sum=0.623 (1)", - "tab": "Fairness", - "score": 0.623 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=62.466, mean=62.466, max=62.466, sum=62.466 (1)", - "tab": "General information", - "score": 62.466 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=0.56 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.215, mean=0.215, max=0.215, sum=0.215 (1)", - "tab": "Calibration", - "score": 0.21479287621696264 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.474 (1)", - "tab": "Robustness", - "score": 0.474 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.478 (1)", - "tab": "Fairness", - "score": 0.478 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=4.348, mean=4.348, max=4.348, sum=4.348 (1)", - "tab": "General information", - "score": 4.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306, - "details": { - "description": "min=0.266, mean=0.306, max=0.333, sum=0.917 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.101, mean=0.123, max=0.157, sum=0.37 (3)", - "tab": "Calibration", - "score": 0.1233746034244333 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.216, mean=0.252, max=0.294, sum=0.755 (3)", - "tab": "Robustness", - "score": 0.25178389398572887 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.216, mean=0.242, max=0.271, sum=0.725 (3)", - "tab": "Fairness", - "score": 0.24159021406727832 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=317.682, mean=355.015, max=375.682, sum=1065.046 (3)", - "tab": "General information", - "score": 355.0152905198777 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.401, mean=0.46, max=0.51, sum=1.38 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.207, mean=0.222, max=0.244, sum=0.666 (3)", - "tab": "Robustness", - "score": 0.22205343915343892 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.361, mean=0.407, max=0.448, sum=1.222 (3)", - "tab": "Robustness", - "score": 0.40738421631598776 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.23, mean=0.253, max=0.284, sum=0.76 (3)", - "tab": "Fairness", - "score": 0.25326719576719553 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.371, mean=0.435, max=0.486, sum=1.304 (3)", - "tab": "Fairness", - "score": 0.4346805929346467 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", 
- "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=349.303, mean=385.636, max=423.303, sum=1156.909 (3)", - "tab": "General information", - "score": 385.63633333333337 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=2.001, mean=2.009, max=2.02, sum=6.026 (3)", - "tab": "General information", - "score": 2.0086666666666666 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=337.047, mean=373.38, max=411.047, sum=1120.14 (3)", - "tab": "General information", - "score": 373.3798449612403 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=2.023, mean=2.023, max=2.023, sum=6.07 (3)", - "tab": "General information", - "score": 2.0232558139534884 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.146, - "details": { - "description": "min=0.14, mean=0.146, max=0.152, sum=0.875 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1203.032, mean=1213.032, max=1224.032, sum=7278.193 (6)", - "tab": "General information", - "score": 1213.0321888412018 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=48.575, mean=53.215, max=56.485, sum=319.288 (6)", - "tab": "General information", - "score": 53.21459227467812 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.605, mean=0.615, max=0.633, sum=3.691 (6)", - "tab": "Bias", - "score": 0.615138154027043 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.39, mean=0.401, max=0.416, sum=2.409 (6)", - "tab": "Bias", - "score": 0.4014349780782224 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.278, mean=0.293, max=0.321, sum=1.76 (6)", - "tab": "Bias", - "score": 0.2933799533799534 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.077, mean=0.099, max=0.123, sum=0.596 (6)", - "tab": "Bias", - "score": 0.09929925405618005 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0.002, mean=0.004, max=0.006, sum=0.026 (6)", - "tab": "Toxicity", - "score": 0.004291845493562232 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.533, mean=0.552, max=0.585, sum=1.655 (3)", - "tab": "Summarization metrics", - "score": 0.5516800688123055 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.273, mean=0.29, max=0.308, sum=0.871 (3)", - "tab": "Summarization metrics", - "score": 0.2904019284209938 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.965, mean=0.973, max=0.983, sum=5.838 (6)", - "tab": "Summarization metrics", - "score": 0.9729724626233943 - }, - "CNN/DailyMail - Density": { - "description": "min=18.643, mean=24.032, max=31.138, sum=144.19 (6)", - "tab": "Summarization metrics", - "score": 24.0317341420422 - }, - "CNN/DailyMail - Compression": { - "description": "min=10.389, mean=11.659, max=13.368, sum=69.956 (6)", - "tab": "Summarization metrics", - "score": 11.65941362001026 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - 
"description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.152, - "details": { - "description": "min=0.149, mean=0.152, max=0.157, sum=0.911 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1099.388, mean=1133.388, max=1172.388, sum=6800.328 (6)", - "tab": "General information", - "score": 1133.388030888031 - }, - "XSUM - # output tokens": { - "description": "min=21.805, mean=22.092, max=22.577, sum=132.552 (6)", - "tab": "General information", - "score": 22.09202059202059 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.45, mean=0.465, max=0.474, sum=2.791 (6)", - "tab": "Bias", - "score": 0.46523352396514167 - }, - "XSUM - Representation (race)": { - "description": "min=0.494, mean=0.522, max=0.536, sum=3.133 (6)", - "tab": "Bias", - "score": 0.5222388805597201 - }, - "XSUM - Representation (gender)": { - "description": "min=0.201, mean=0.214, max=0.224, sum=1.284 (6)", - "tab": "Bias", - "score": 0.21406383130768433 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.008 (6)", - "tab": "Toxicity", - "score": 0.001287001287001287 - }, - "XSUM - SummaC": { - "description": "min=-0.298, mean=-0.282, max=-0.27, sum=-0.845 (3)", - "tab": "Summarization metrics", - "score": -0.2817185772994412 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.45, mean=0.454, max=0.458, sum=1.362 (3)", - "tab": "Summarization metrics", - "score": 0.4538733417652499 - }, - "XSUM - Coverage": { - "description": "min=0.782, mean=0.786, max=0.79, sum=4.714 (6)", - "tab": "Summarization metrics", - "score": 0.7856975370843048 - }, - "XSUM - Density": { - "description": "min=2.624, mean=2.816, max=3.113, sum=16.895 (6)", - "tab": "Summarization metrics", - "score": 2.815909720295231 - }, - "XSUM - Compression": { - "description": "min=16.323, mean=16.857, max=17.149, sum=101.14 (6)", 
- "tab": "Summarization metrics", - "score": 16.856596376166145 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.957, - "details": { - "description": "min=0.947, mean=0.957, max=0.964, sum=2.872 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.119, mean=0.136, max=0.165, sum=0.407 (3)", - "tab": "Calibration", - "score": 0.13573735378803647 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.931, mean=0.947, max=0.955, sum=2.841 (3)", - "tab": "Robustness", - "score": 0.9470000000000001 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.935, mean=0.95, max=0.959, sum=2.851 (3)", - "tab": "Fairness", - "score": 0.9503333333333334 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.915, mean=4.972, max=5, sum=14.915 (3)", - "tab": "General information", - "score": 4.971666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=853.851, mean=1281.577, max=1725.03, sum=3844.732 (3)", - "tab": "General information", - "score": 1281.5773333333334 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.546, - "details": { - "description": "min=0.008, mean=0.546, max=1, sum=29.501 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.131, mean=0.376, max=0.649, sum=20.307 (54)", - "tab": "Calibration", - "score": 0.37604932471578795 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.495, max=0.995, sum=26.738 (54)", - "tab": "Robustness", - "score": 0.49514299676627055 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.005, mean=0.404, max=0.901, sum=21.814 (54)", - "tab": "Fairness", - "score": 0.40396201739558046 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=271.927, mean=532.602, max=942.498, sum=28760.487 (54)", - "tab": "General information", - "score": 532.6016121330534 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "description": "min=0.225, mean=0.679, max=0.95, sum=22.4 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.095, mean=0.234, max=0.473, sum=7.733 (33)", - "tab": "Calibration", - "score": 0.23434348116913628 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.555, max=0.925, sum=18.3 (33)", - "tab": "Robustness", - "score": 0.5545454545454547 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.2, mean=0.637, max=0.95, 
sum=21.025 (33)", - "tab": "Fairness", - "score": 0.6371212121212121 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.95, mean=4.658, max=5, sum=153.7 (33)", - "tab": "General information", - "score": 4.657575757575757 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=212.25, mean=712.248, max=1745.25, sum=23504.175 (33)", - "tab": "General information", - "score": 712.2477272727273 - }, - "RAFT - # output tokens": { - "description": "min=1.95, mean=3.574, max=6.575, sum=117.95 (33)", - "tab": "General information", - "score": 3.5742424242424238 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json b/data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json deleted file mode 100644 index df8111bcc..000000000 --- a/data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_J1-Jumbo-v1-178B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "J1-Jumbo v1 178B", - "id": "ai21/J1-Jumbo-v1-178B", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6662512419912975 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.4518627645991383 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.48803949109844547 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 
0.2218311403508772 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5485082680240319 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6042735042735042 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5867794486215538 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.259, - "details": { - "description": "min=0.19, mean=0.259, max=0.35, sum=3.891 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.074, mean=0.131, max=0.172, sum=1.96 (15)", - "tab": "Calibration", - "score": 0.13067986008352367 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.15, mean=0.221, max=0.31, sum=3.313 (15)", - "tab": "Robustness", - "score": 0.22085380116959066 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.17, mean=0.236, max=0.33, sum=3.545 (15)", - "tab": "Fairness", - "score": 0.23635087719298245 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.419, mean=0.457, max=0.511, sum=6.851 (15)", - "tab": "Efficiency", - "score": 0.4567342927631581 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=5951.098 (15)", - "tab": "General information", - "score": 396.73985964912276 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.776, - "details": { - "description": "min=0.766, mean=0.776, max=0.786, sum=2.327 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.205, mean=0.215, max=0.223, sum=0.646 (3)", - "tab": "Calibration", - "score": 0.21546167732589497 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.635, mean=0.65, max=0.659, sum=1.949 (3)", - "tab": "Robustness", - "score": 0.6496666666666667 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.693, 
mean=0.709, max=0.73, sum=2.128 (3)", - "tab": "Fairness", - "score": 0.7093333333333334 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.55, mean=0.62, max=0.727, sum=1.859 (3)", - "tab": "Efficiency", - "score": 0.6195252891710069 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=506.985, mean=694.652, max=952.985, sum=2083.955 (3)", - "tab": "General information", - "score": 694.6516666666666 - }, - "BoolQ - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.695, - "details": { - "description": "min=0.689, mean=0.695, max=0.698, sum=2.085 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.028, mean=0.034, max=0.042, sum=0.101 (3)", - "tab": "Calibration", - "score": 0.033635629206676086 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.448, mean=0.523, max=0.573, sum=1.57 (3)", - "tab": "Robustness", - "score": 0.5232968431666949 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.566, mean=0.581, max=0.592, sum=1.743 (3)", - "tab": "Fairness", - "score": 0.5811269391716133 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=1.085, mean=1.126, max=1.167, sum=3.379 (3)", - "tab": "Efficiency", - "score": 1.1261881626564945 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.166, mean=2.639, max=3.225, sum=7.918 (3)", - "tab": "General information", - "score": 2.63943661971831 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1598.614, mean=1692.218, max=1777.299, sum=5076.654 (3)", - "tab": "General information", - "score": 1692.2178403755868 - }, - "NarrativeQA - # output 
tokens": { - "description": "min=4.434, mean=4.514, max=4.617, sum=13.541 (3)", - "tab": "General information", - "score": 4.513615023474178 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.375, mean=0.438, max=0.5, sum=0.875 (2)", - "tab": "Bias", - "score": 0.4375 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.196, mean=0.214, max=0.225, sum=0.641 (3)", - "tab": "Bias", - "score": 0.21357560568086884 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.014, max=0.014, sum=0.042 (3)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595, - "details": { - "description": "min=0.593, mean=0.595, max=0.598, sum=1.786 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.029, mean=0.035, max=0.042, sum=0.106 (3)", - "tab": "Calibration", - "score": 0.035434924784030764 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.058, mean=0.065, max=0.069, sum=0.195 (3)", - "tab": "Calibration", - "score": 0.06491976505236641 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.177, mean=0.179, max=0.183, sum=0.537 (3)", - "tab": "Robustness", - "score": 0.17889901825749613 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.487, mean=0.503, max=0.515, sum=1.509 (3)", - "tab": "Robustness", - "score": 0.5031073713472458 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.227, mean=0.235, max=0.239, sum=0.704 (3)", - "tab": "Fairness", - "score": 0.23456155611286555 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.62 (3)", - "tab": "Fairness", - "score": 0.5399104355251988 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.466, mean=0.493, max=0.536, sum=1.478 (3)", - "tab": "Efficiency", - "score": 0.492596863281249 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.931, mean=1.06, max=1.147, sum=3.179 (3)", - "tab": "Efficiency", - "score": 1.0597537076822923 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=94.377, mean=99.377, max=102.377, sum=298.131 (3)", - "tab": "General information", - "score": 99.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.012, mean=5.602, max=6.608, sum=16.806 (3)", - "tab": "General information", - "score": 5.602 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.568, mean=4.666, max=4.734, sum=13.999 (3)", - "tab": "General information", - "score": 4.666333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.114 (3)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1136.933, mean=1418.457, max=1595.508, sum=4255.37 (3)", - "tab": "General information", - "score": 1418.4566666666667 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.418, mean=5.682, max=5.988, sum=17.046 (3)", - "tab": "General information", - "score": 5.6819999999999995 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.255, mean=0.333, max=0.386, sum=1.0 (3)", - "tab": "Bias", - "score": 0.3331804837187507 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.125, mean=0.175, max=0.2, sum=0.525 (3)", - "tab": "Bias", - "score": 0.17500000000000002 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.38, mean=0.46, max=0.5, sum=1.38 (3)", - "tab": "Bias", - "score": 0.4601449275362319 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.451, mean=0.478, max=0.506, sum=1.433 (3)", - "tab": "Bias", - "score": 0.47760288745821544 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.011, mean=0.041, max=0.063, sum=0.122 (3)", - "tab": "Bias", - "score": 0.04050846488217801 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358, - "details": { - "description": "min=0.348, mean=0.358, max=0.372, sum=1.075 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.042, mean=0.043, max=0.045, sum=0.13 (3)", - "tab": "Calibration", - "score": 0.04341080368618692 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.216, mean=0.222, max=0.232, sum=0.667 (3)", - "tab": "Robustness", - "score": 0.22242500588714678 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.263, mean=0.268, max=0.275, sum=0.805 (3)", - "tab": "Fairness", - "score": 0.2682228394530809 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.898, mean=2.064, max=2.149, sum=6.193 (3)", - "tab": "Efficiency", - "score": 2.0642993667534726 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=1.788, mean=1.829, max=1.88, sum=5.486 (3)", - "tab": "General information", - "score": 1.8286666666666667 - }, - "QuAC - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "General information", - "score": 0.001 - }, - "QuAC - # prompt tokens": { - "description": "min=1645.856, mean=1698.711, max=1730.814, sum=5096.134 (3)", - "tab": "General information", - "score": 1698.7113333333334 - }, - "QuAC - # output tokens": { - "description": "min=22.621, mean=26.784, max=29.261, sum=80.351 (3)", - "tab": "General information", - "score": 26.783666666666665 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.594, mean=0.604, max=0.613, sum=1.811 (3)", - "tab": "Bias", - "score": 0.6038019374416433 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.417, mean=0.42, max=0.425, sum=1.26 (3)", - "tab": "Bias", - "score": 0.4200049682548366 - }, - "QuAC - Representation (race)": { - "description": "min=0.287, mean=0.329, max=0.362, sum=0.988 (3)", - "tab": "Bias", - "score": 0.3293434102054505 - }, - "QuAC - Representation (gender)": { - "description": "min=0.231, mean=0.242, max=0.257, sum=0.725 (3)", - "tab": "Bias", - "score": 0.2415041378322658 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.009 (3)", - "tab": "Toxicity", - "score": 0.0030000000000000005 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.765, - "details": { - "description": "min=0.765, mean=0.765, max=0.765, sum=0.765 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.217, mean=0.217, max=0.217, sum=0.217 (1)", - "tab": "Calibration", - "score": 0.21741807730831492 - }, - "HellaSwag - EM 
(Robustness)": { - "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)", - "tab": "Robustness", - "score": 0.726 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.614, mean=0.614, max=0.614, sum=0.614 (1)", - "tab": "Fairness", - "score": 0.614 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.284 (1)", - "tab": "Efficiency", - "score": 0.2835968515624999 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=62.466, mean=62.466, max=62.466, sum=62.466 (1)", - "tab": "General information", - "score": 62.466 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534, - "details": { - "description": "min=0.534, mean=0.534, max=0.534, sum=0.534 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.25 (1)", - "tab": "Calibration", - "score": 0.25015305244306557 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.43 (1)", - "tab": "Robustness", - "score": 0.43 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.466 (1)", - "tab": "Fairness", - "score": 0.466 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.259 (1)", - "tab": "Efficiency", - "score": 0.2588512968749986 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=4.348, mean=4.348, max=4.348, sum=4.348 (1)", - "tab": "General information", - "score": 4.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.175, - "details": { - "description": "min=0.157, mean=0.175, max=0.187, sum=0.524 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.099, mean=0.113, max=0.123, sum=0.339 (3)", - "tab": "Calibration", - "score": 0.11285677982128534 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.13, mean=0.154, max=0.176, sum=0.462 (3)", - "tab": "Robustness", - "score": 0.15392456676860347 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.142, mean=0.156, max=0.168, sum=0.468 (3)", - "tab": "Fairness", - "score": 0.15596330275229356 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.423, mean=0.443, max=0.454, sum=1.328 (3)", - "tab": "Efficiency", - "score": 0.44282831613149837 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=317.682, mean=355.015, max=375.682, sum=1065.046 (3)", - "tab": "General information", - "score": 355.0152905198777 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363, - "details": { - "description": "min=0.316, mean=0.363, max=0.406, sum=1.089 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.131, mean=0.144, max=0.157, sum=0.433 (3)", - "tab": "Robustness", - "score": 0.14417447089947086 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.276, mean=0.307, max=0.347, sum=0.921 (3)", - "tab": "Robustness", - "score": 0.3070790784160127 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.151, mean=0.18, max=0.202, sum=0.54 (3)", - "tab": "Fairness", - "score": 0.17989272486772476 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.308, mean=0.348, max=0.386, sum=1.044 (3)", - "tab": "Fairness", - "score": 0.34798299201075195 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.482, mean=0.501, max=0.52, sum=1.502 (3)", - "tab": "Efficiency", - "score": 0.500707514648438 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": 
"min=0.477, mean=0.496, max=0.516, sum=1.489 (3)", - "tab": "Efficiency", - "score": 0.4963945009689923 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=349.303, mean=385.636, max=423.303, sum=1156.909 (3)", - "tab": "General information", - "score": 385.63633333333337 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=2, mean=2.001, max=2.004, sum=6.004 (3)", - "tab": "General information", - "score": 2.001333333333333 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=337.047, mean=373.38, max=411.047, sum=1120.14 (3)", - "tab": "General information", - "score": 373.3798449612403 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=2.047, mean=2.047, max=2.047, sum=6.14 (3)", - "tab": "General information", - "score": 2.046511627906977 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { 
- "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144, - "details": { - "description": "min=0.137, mean=0.144, max=0.157, sum=0.861 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=3.558, mean=3.777, max=3.91, sum=22.664 (6)", - "tab": "Efficiency", - "score": 3.777328921804216 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1203.032, mean=1213.032, max=1224.032, sum=7278.193 (6)", - "tab": "General information", - "score": 1213.0321888412018 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=67.139, mean=72.469, max=75.648, sum=434.815 (6)", - "tab": "General information", - "score": 72.46924177396282 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.62, mean=0.63, max=0.647, sum=3.781 (6)", - "tab": "Bias", - "score": 0.6302246589223909 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.382, mean=0.386, max=0.393, sum=2.314 (6)", - "tab": "Bias", - "score": 0.385603383216647 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.288, mean=0.325, max=0.362, sum=1.95 (6)", - "tab": "Bias", - "score": 0.3250193306482005 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.13, mean=0.131, max=0.132, sum=0.788 (6)", - "tab": "Bias", - "score": 0.13141527227323743 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.013 (6)", - "tab": "Toxicity", - "score": 0.002145922746781116 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.491, mean=0.515, max=0.544, sum=1.545 (3)", - "tab": "Summarization metrics", - "score": 0.5151288171631818 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.661, mean=4.697, max=4.725, sum=28.182 (6)", - "tab": "Summarization metrics", - "score": 4.696964335081241 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.264, mean=0.278, max=0.301, sum=0.834 (3)", - "tab": "Summarization metrics", - "score": 0.27790265116917295 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.965, mean=0.976, max=0.984, sum=5.856 (6)", - "tab": "Summarization metrics", - "score": 0.97598626364496 - }, - "CNN/DailyMail - Density": { - "description": "min=40.605, mean=53.93, max=67.411, sum=323.578 (6)", - "tab": "Summarization metrics", - "score": 53.929605831357485 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.981, mean=9.579, max=10.219, sum=57.476 (6)", - "tab": "Summarization metrics", - "score": 9.579310239916042 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", 
- "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.129, - "details": { - "description": "min=0.128, mean=0.129, max=0.131, sum=0.776 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.615, mean=1.629, max=1.648, sum=9.776 (6)", - "tab": "Efficiency", - "score": 1.6292920332441818 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1099.388, mean=1133.388, max=1172.388, sum=6800.328 (6)", - "tab": "General information", - "score": 1133.388030888031 - }, - "XSUM - # output tokens": { - "description": "min=21.958, mean=22.013, max=22.106, sum=132.077 (6)", - "tab": "General information", - "score": 22.012870012870014 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.0 (6)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.46, mean=0.472, max=0.483, sum=2.834 (6)", - "tab": "Bias", - "score": 0.4724007038712921 - }, - "XSUM - Representation (race)": { - "description": "min=0.467, mean=0.48, max=0.505, sum=2.877 (6)", - "tab": "Bias", - "score": 0.47956989247311826 - }, - "XSUM - Representation (gender)": { - "description": "min=0.154, mean=0.186, max=0.216, sum=1.116 (6)", - "tab": "Bias", - "score": 0.18604199883585584 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.294, mean=-0.287, max=-0.282, sum=-0.861 (3)", - "tab": "Summarization metrics", - "score": -0.2868511554050323 - }, - "XSUM - QAFactEval": { - "description": "min=2.48, mean=3.182, max=3.598, sum=19.091 (6)", - "tab": "Summarization metrics", - "score": 3.1818935586249126 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.432, mean=0.435, max=0.438, sum=1.305 (3)", - "tab": "Summarization metrics", - "score": 0.43511885902101227 - }, - "XSUM - Coverage": { - "description": "min=0.775, mean=0.784, max=0.792, sum=4.704 (6)", - "tab": "Summarization metrics", - "score": 0.7840584721092689 - }, - "XSUM - Density": { - "description": "min=2.514, mean=2.63, max=2.802, sum=15.779 (6)", - "tab": 
"Summarization metrics", - "score": 2.6298709619480816 - }, - "XSUM - Compression": { - "description": "min=16.767, mean=16.862, max=16.987, sum=101.17 (6)", - "tab": "Summarization metrics", - "score": 16.861740741647864 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.943, - "details": { - "description": "min=0.934, mean=0.943, max=0.951, sum=2.83 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.06, mean=0.064, max=0.072, sum=0.191 (3)", - "tab": "Calibration", - "score": 0.06375881576094916 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.917, mean=0.923, max=0.934, sum=2.768 (3)", - "tab": "Robustness", - "score": 0.9226666666666666 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.922, mean=0.932, max=0.941, sum=2.797 (3)", - "tab": "Fairness", - "score": 0.9323333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.682, mean=0.852, max=1.035, sum=2.555 (3)", - "tab": "Efficiency", - "score": 0.8516515608723956 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.915, mean=4.972, max=5, sum=14.915 (3)", - "tab": "General information", - "score": 4.971666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=853.851, mean=1281.577, max=1725.03, sum=3844.732 (3)", - "tab": "General information", - "score": 1281.5773333333334 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553, - "details": { - "description": "min=0.03, mean=0.553, max=0.968, sum=29.863 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.048, mean=0.27, max=0.587, sum=14.569 (54)", - "tab": "Calibration", - "score": 0.26979933840430187 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.027, mean=0.271, max=0.732, sum=14.649 (54)", - "tab": "Robustness", - "score": 0.2712865813183887 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.006, mean=0.478, max=0.958, sum=25.823 (54)", - "tab": "Fairness", - "score": 0.4782106548652487 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.43, mean=0.552, max=0.724, sum=29.829 (54)", - "tab": "Efficiency", - "score": 0.5523870780537201 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=271.927, mean=532.602, max=942.498, sum=28760.487 (54)", - "tab": "General information", - "score": 532.6016121330534 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.681, - "details": { - "description": "min=0.225, mean=0.681, max=0.975, sum=22.475 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.103, mean=0.228, max=0.595, sum=7.528 (33)", - "tab": "Calibration", - "score": 0.2281177870147751 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.555, max=0.875, sum=18.3 (33)", - 
"tab": "Robustness", - "score": 0.5545454545454546 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.125, mean=0.623, max=0.975, sum=20.55 (33)", - "tab": "Fairness", - "score": 0.6227272727272728 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.423, mean=0.687, max=1.043, sum=22.661 (33)", - "tab": "Efficiency", - "score": 0.6866916923137625 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.95, mean=4.658, max=5, sum=153.7 (33)", - "tab": "General information", - "score": 4.657575757575757 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=212.25, mean=712.248, max=1745.25, sum=23504.175 (33)", - "tab": "General information", - "score": 712.2477272727273 - }, - "RAFT - # output tokens": { - "description": "min=1.95, mean=3.634, max=6.925, sum=119.925 (33)", - "tab": "General information", - "score": 3.6340909090909084 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json b/data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json deleted file mode 100644 index 5c8560533..000000000 --- a/data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_J1-Large-v1-7.5B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "J1-Large v1 7.5B", - "id": "ai21/J1-Large-v1-7.5B", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.285, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6383920923698907 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.29777282413544925 - }, - "Mean win rate - Fairness": { - 
"description": null, - "tab": "Fairness", - "score": 0.27467778791471786 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.38930372807017544 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5487461676083087 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6599416016082683 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6502297410192147 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.241, - "details": { - "description": "min=0.2, mean=0.241, max=0.298, sum=3.617 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.051, mean=0.123, max=0.181, sum=1.842 (15)", - "tab": "Calibration", - "score": 0.12277396117394333 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.16, mean=0.2, max=0.272, sum=3.002 (15)", - "tab": "Robustness", - "score": 0.20011695906432747 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.16, mean=0.204, max=0.23, sum=3.059 (15)", - "tab": "Fairness", - "score": 0.2039415204678363 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.348, mean=0.377, max=0.422, sum=5.648 (15)", - "tab": "Efficiency", - "score": 0.3765351217105263 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=5951.098 (15)", - "tab": "General information", - "score": 396.73985964912276 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.652, mean=0.683, max=0.709, sum=2.05 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.085, mean=0.106, max=0.133, sum=0.319 (3)", - "tab": "Calibration", - "score": 0.10621693084730484 - }, - "BoolQ - EM (Robustness)": { - 
"description": "min=0.539, mean=0.567, max=0.603, sum=1.701 (3)", - "tab": "Robustness", - "score": 0.5670000000000001 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.591, mean=0.622, max=0.651, sum=1.867 (3)", - "tab": "Fairness", - "score": 0.6223333333333333 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.43, mean=0.485, max=0.566, sum=1.455 (3)", - "tab": "Efficiency", - "score": 0.48513916883680525 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=506.985, mean=694.652, max=952.985, sum=2083.955 (3)", - "tab": "General information", - "score": 694.6516666666666 - }, - "BoolQ - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623, - "details": { - "description": "min=0.612, mean=0.623, max=0.634, sum=1.87 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.042, mean=0.046, max=0.048, sum=0.137 (3)", - "tab": "Calibration", - "score": 0.04554705251298522 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.341, mean=0.4, max=0.438, sum=1.201 (3)", - "tab": "Robustness", - "score": 0.4003895179156612 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.496, mean=0.513, max=0.524, sum=1.538 (3)", - "tab": "Fairness", - "score": 0.5126679432053903 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.768, mean=0.797, max=0.829, sum=2.391 (3)", - "tab": "Efficiency", - "score": 0.7971074946205007 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.166, mean=2.639, max=3.225, sum=7.918 (3)", - "tab": "General information", - "score": 2.63943661971831 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt 
tokens": { - "description": "min=1598.614, mean=1692.218, max=1777.299, sum=5076.654 (3)", - "tab": "General information", - "score": 1692.2178403755868 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.797, mean=5.09, max=5.518, sum=15.27 (3)", - "tab": "General information", - "score": 5.090140845070422 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.17, mean=0.203, max=0.223, sum=0.609 (3)", - "tab": "Bias", - "score": 0.20304247377415918 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.013, max=0.014, sum=0.039 (3)", - "tab": "Toxicity", - "score": 0.013145539906103287 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532, - "details": { - "description": "min=0.5, mean=0.532, max=0.571, sum=1.597 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.013, mean=0.015, max=0.017, sum=0.046 (3)", - "tab": "Calibration", - "score": 0.01549922748171477 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.075, mean=0.086, max=0.093, sum=0.258 (3)", - "tab": "Calibration", - "score": 0.08597598507389619 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.092, mean=0.098, max=0.106, sum=0.293 (3)", - "tab": "Robustness", - "score": 0.097632746101742 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.361, mean=0.41, max=0.455, sum=1.23 (3)", - "tab": "Robustness", - "score": 0.4099829032840138 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.14, mean=0.146, max=0.151, sum=0.439 (3)", - "tab": "Fairness", - "score": 0.14648226412007787 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.44, mean=0.47, max=0.508, sum=1.409 (3)", - "tab": "Fairness", - "score": 0.4695231845662433 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.355, mean=0.372, max=0.396, sum=1.117 (3)", - "tab": "Efficiency", - "score": 0.3722484414062495 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.66, mean=0.733, max=0.784, sum=2.198 (3)", - "tab": "Efficiency", - "score": 0.7326816432291658 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": 
"min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=94.377, mean=99.377, max=102.377, sum=298.131 (3)", - "tab": "General information", - "score": 99.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.868, mean=7.876, max=9.311, sum=23.628 (3)", - "tab": "General information", - "score": 7.876 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.568, mean=4.666, max=4.734, sum=13.999 (3)", - "tab": "General information", - "score": 4.666333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.114 (3)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1136.933, mean=1418.457, max=1595.508, sum=4255.37 (3)", - "tab": "General information", - "score": 1418.4566666666667 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.487, mean=5.946, max=6.338, sum=17.838 (3)", - "tab": "General information", - "score": 5.946000000000001 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.214, mean=0.405, max=0.5, sum=1.214 (3)", - "tab": "Bias", - "score": 0.4047619047619048 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.302, mean=0.362, max=0.45, sum=1.085 (3)", - "tab": "Bias", - "score": 0.36169748540882557 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.088, mean=0.216, max=0.371, sum=0.647 (3)", - "tab": "Bias", - "score": 0.21556767868437698 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.327, mean=0.394, max=0.457, sum=1.182 (3)", - "tab": "Bias", - "score": 0.39383347574877653 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.106, mean=0.109, max=0.113, sum=0.328 (3)", - "tab": "Bias", - "score": 0.10941198128319474 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.002, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328, - "details": { - "description": "min=0.322, mean=0.328, max=0.336, sum=0.983 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.016, mean=0.024, max=0.033, sum=0.073 (3)", - "tab": "Calibration", - "score": 0.02431531680637249 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.186, mean=0.197, max=0.209, sum=0.591 (3)", - "tab": "Robustness", - "score": 0.19699898429353593 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.227, mean=0.241, max=0.256, sum=0.722 (3)", - "tab": "Fairness", - "score": 0.24062000532402938 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.105, mean=1.16, max=1.191, sum=3.48 (3)", - "tab": "Efficiency", - "score": 1.159840737413194 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=1.788, mean=1.829, max=1.88, sum=5.486 (3)", - "tab": "General information", - "score": 1.8286666666666667 - }, - "QuAC - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "General information", - "score": 0.001 - }, - "QuAC - # prompt tokens": { - "description": "min=1645.856, mean=1698.711, max=1730.814, sum=5096.134 (3)", - "tab": "General information", - "score": 1698.7113333333334 - }, - "QuAC - # output tokens": { - "description": "min=23.833, mean=27.642, max=30.067, sum=82.927 (3)", - "tab": "General information", - "score": 27.64233333333333 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.632, mean=0.647, max=0.667, sum=1.942 (3)", - "tab": "Bias", - "score": 0.6472747525379104 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.407, mean=0.428, max=0.446, sum=1.284 (3)", - "tab": "Bias", - "score": 0.42785601825865643 - }, - "QuAC - Representation (race)": { - "description": "min=0.226, mean=0.3, max=0.351, sum=0.9 (3)", - "tab": "Bias", - "score": 0.2998485806834953 - }, - "QuAC - Representation (gender)": { - "description": "min=0.235, mean=0.249, max=0.271, sum=0.748 (3)", - "tab": "Bias", - "score": 0.24941347459181362 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.008 (3)", - "tab": "Toxicity", - "score": 0.0026666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=0.7 (1)", - "tab": "Accuracy", - "HellaSwag - 
ECE (10-bin)": { - "description": "min=0.192, mean=0.192, max=0.192, sum=0.192 (1)", - "tab": "Calibration", - "score": 0.19173198668049052 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.646, mean=0.646, max=0.646, sum=0.646 (1)", - "tab": "Robustness", - "score": 0.646 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.528, mean=0.528, max=0.528, sum=0.528 (1)", - "tab": "Fairness", - "score": 0.528 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.253 (1)", - "tab": "Efficiency", - "score": 0.25286050781250013 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=62.466, mean=62.466, max=62.466, sum=62.466 (1)", - "tab": "General information", - "score": 62.466 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514, - "details": { - "description": "min=0.514, mean=0.514, max=0.514, sum=0.514 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.25 (1)", - "tab": "Calibration", - "score": 0.24986668171933007 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)", - "tab": "Robustness", - "score": 0.412 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.444 (1)", - "tab": "Fairness", - "score": 0.444 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.238 (1)", - "tab": "Efficiency", - "score": 0.2381039843749996 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=4.348, mean=4.348, max=4.348, sum=4.348 (1)", - "tab": "General information", - "score": 4.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.197, - "details": { - "description": "min=0.19, mean=0.197, max=0.2, sum=0.59 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.105, mean=0.112, max=0.121, sum=0.337 (3)", - "tab": "Calibration", - "score": 0.11232689963932652 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.138, mean=0.155, max=0.168, sum=0.465 (3)", - "tab": "Robustness", - "score": 0.15494393476044852 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.159, mean=0.174, max=0.182, sum=0.521 (3)", - "tab": "Fairness", - "score": 0.17380224260958207 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.351, mean=0.365, max=0.372, sum=1.094 (3)", - "tab": "Efficiency", - "score": 0.36458362003058115 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=317.682, mean=355.015, max=375.682, sum=1065.046 (3)", - "tab": "General information", - "score": 355.0152905198777 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292, - "details": { - "description": "min=0.266, mean=0.292, max=0.338, sum=0.877 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.089, mean=0.105, max=0.128, sum=0.315 (3)", - "tab": "Robustness", - "score": 0.10499510582010585 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.231, mean=0.248, max=0.274, sum=0.743 (3)", - "tab": "Robustness", - "score": 0.24769351383898738 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.096, mean=0.117, max=0.143, sum=0.351 (3)", - "tab": "Fairness", - "score": 0.11706984126984123 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.258, mean=0.28, max=0.322, sum=0.841 (3)", - "tab": "Fairness", - "score": 0.2804651230679189 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.379, 
mean=0.393, max=0.406, sum=1.178 (3)", - "tab": "Efficiency", - "score": 0.3926667591145831 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.376, mean=0.389, max=0.402, sum=1.167 (3)", - "tab": "Efficiency", - "score": 0.3890438468992247 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=349.303, mean=385.636, max=423.303, sum=1156.909 (3)", - "tab": "General information", - "score": 385.63633333333337 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=2.011, mean=2.072, max=2.163, sum=6.217 (3)", - "tab": "General information", - "score": 2.0723333333333334 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=337.047, mean=373.38, max=411.047, sum=1120.14 (3)", - "tab": "General information", - "score": 373.3798449612403 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=2.093, mean=2.116, max=2.163, sum=6.349 (3)", - "tab": "General information", - "score": 2.116279069767442 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching 
metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.134, - "details": { - "description": "min=0.123, mean=0.134, max=0.147, sum=0.802 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.832, mean=2.011, max=2.216, sum=12.069 (6)", - "tab": "Efficiency", - "score": 2.011487112821144 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1203.032, mean=1213.032, max=1224.032, sum=7278.193 (6)", - "tab": "General information", - "score": 1213.0321888412018 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=78.521, mean=89.614, max=102.401, sum=537.682 (6)", - "tab": "General information", - "score": 89.61373390557941 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.602, mean=0.632, max=0.648, sum=3.791 (6)", - "tab": "Bias", - "score": 0.6318145834093977 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.385, mean=0.391, max=0.396, sum=2.349 (6)", - "tab": "Bias", - "score": 0.3914278177516011 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.257, mean=0.302, max=0.354, sum=1.811 (6)", - "tab": "Bias", - "score": 0.3019033965877131 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.135, mean=0.142, max=0.152, sum=0.851 (6)", - "tab": "Bias", - "score": 0.14183552076259287 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.004, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.488, mean=0.512, max=0.535, sum=1.537 (3)", - "tab": "Summarization metrics", - "score": 0.5121705493530246 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.664, mean=4.716, max=4.749, sum=28.295 (6)", - "tab": "Summarization metrics", - "score": 4.715823146970394 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.229, mean=0.248, max=0.272, sum=0.745 (3)", - "tab": "Summarization metrics", - "score": 0.2482954175661162 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.971, mean=0.977, max=0.985, sum=5.861 (6)", - "tab": "Summarization metrics", - "score": 0.9768840440430324 - }, - "CNN/DailyMail - Density": { - "description": "min=55.528, mean=71.654, max=97.831, sum=429.924 (6)", - "tab": "Summarization metrics", - "score": 71.65405587945487 - }, - "CNN/DailyMail - Compression": { - "description": "min=5.872, mean=7.632, max=9.373, 
sum=45.79 (6)", - "tab": "Summarization metrics", - "score": 7.631709472598792 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102, - "details": { - "description": "min=0.095, mean=0.102, max=0.107, sum=0.612 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.896, mean=0.903, max=0.91, sum=5.418 (6)", - "tab": "Efficiency", - "score": 0.9030293349990619 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1099.388, mean=1133.388, max=1172.388, sum=6800.328 (6)", - "tab": "General information", - "score": 1133.388030888031 - }, - "XSUM - # output tokens": { - "description": "min=20.832, mean=21.299, max=21.809, sum=127.792 (6)", - "tab": "General information", - "score": 21.2985842985843 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.0 (6)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.397, mean=0.424, max=0.451, sum=2.547 (6)", - "tab": "Bias", - "score": 0.42449478248089356 - }, - "XSUM - Representation (race)": { - "description": "min=0.387, mean=0.426, max=0.467, sum=2.554 (6)", - "tab": "Bias", - "score": 0.4255855855855855 - }, - "XSUM - Representation (gender)": { - "description": "min=0.151, mean=0.172, max=0.189, sum=1.031 (6)", - "tab": "Bias", - "score": 0.1717873516720604 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.26, mean=-0.239, max=-0.222, sum=-0.716 (3)", - "tab": "Summarization metrics", - "score": -0.23866760351278402 - }, - "XSUM - QAFactEval": { - "description": "min=3.354, mean=3.675, max=4.009, sum=22.047 (6)", - "tab": "Summarization metrics", - "score": 3.674546888395078 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.393, mean=0.4, max=0.405, sum=1.2 (3)", - "tab": "Summarization metrics", - "score": 0.40004604044843806 - }, - "XSUM - Coverage": { - "description": "min=0.804, mean=0.808, max=0.813, sum=4.85 (6)", - "tab": "Summarization 
metrics", - "score": 0.8084128334077892 - }, - "XSUM - Density": { - "description": "min=3.618, mean=3.757, max=3.939, sum=22.541 (6)", - "tab": "Summarization metrics", - "score": 3.7567632334705046 - }, - "XSUM - Compression": { - "description": "min=17.523, mean=18.133, max=18.761, sum=108.8 (6)", - "tab": "Summarization metrics", - "score": 18.133322572088453 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.956, - "details": { - "description": "min=0.951, mean=0.956, max=0.962, sum=2.869 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.196, mean=0.213, max=0.234, sum=0.639 (3)", - "tab": "Calibration", - "score": 0.21314336064172376 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.927, mean=0.932, max=0.936, sum=2.796 (3)", - "tab": "Robustness", - "score": 0.932 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.939, mean=0.946, max=0.951, sum=2.839 (3)", - "tab": "Fairness", - "score": 0.9463333333333334 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.531, mean=0.637, max=0.757, sum=1.911 (3)", - "tab": "Efficiency", - "score": 0.6371184251302079 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.915, mean=4.972, max=5, sum=14.915 (3)", - "tab": "General information", - "score": 4.971666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=853.851, mean=1281.577, max=1725.03, sum=3844.732 (3)", - "tab": "General information", - "score": 1281.5773333333334 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": 
"Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532, - "details": { - "description": "min=0, mean=0.532, max=0.996, sum=28.713 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.073, mean=0.377, max=0.573, sum=20.347 (54)", - "tab": "Calibration", - "score": 0.37680252478263027 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.444, max=0.984, sum=23.966 (54)", - "tab": "Robustness", - "score": 0.4438230435194026 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.447, max=0.962, sum=24.127 (54)", - "tab": "Fairness", - "score": 0.4468037461427085 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.338, mean=0.434, max=0.564, sum=23.454 (54)", - "tab": "Efficiency", - "score": 0.43432643222557377 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=271.927, mean=532.602, max=942.498, sum=28760.487 (54)", - "tab": "General information", - "score": 532.6016121330534 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.545, - "details": { - "description": "min=0.15, mean=0.545, max=0.95, sum=18 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.134, mean=0.269, max=0.513, sum=8.875 (33)", - "tab": "Calibration", - "score": 
0.2689468403025133 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.443, max=0.95, sum=14.625 (33)", - "tab": "Robustness", - "score": 0.4431818181818182 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.15, mean=0.511, max=0.95, sum=16.85 (33)", - "tab": "Fairness", - "score": 0.5106060606060605 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.312, mean=0.499, max=0.763, sum=16.476 (33)", - "tab": "Efficiency", - "score": 0.4992617404513889 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.95, mean=4.658, max=5, sum=153.7 (33)", - "tab": "General information", - "score": 4.657575757575757 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=212.25, mean=712.248, max=1745.25, sum=23504.175 (33)", - "tab": "General information", - "score": 712.2477272727273 - }, - "RAFT - # output tokens": { - "description": "min=1.975, mean=3.499, max=7.025, sum=115.475 (33)", - "tab": "General information", - "score": 3.4992424242424245 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json b/data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json deleted file mode 100644 index 4f288f894..000000000 --- a/data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Grande-17B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jurassic-2 Grande 17B", - "id": "ai21/Jurassic-2-Grande-17B", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6300647548566143 - }, - "Mean 
win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.7641047680536001 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7037362526239056 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.561885097395068 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.3875874125874126 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6710526315789473 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475, - "details": { - "description": "min=0.24, mean=0.475, max=0.81, sum=7.13 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.076, mean=0.134, max=0.172, sum=2.006 (15)", - "tab": "Calibration", - "score": 0.13373539597087636 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.22, mean=0.411, max=0.68, sum=6.168 (15)", - "tab": "Robustness", - "score": 0.41120467836257313 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.23, mean=0.433, max=0.73, sum=6.498 (15)", - "tab": "Fairness", - "score": 0.43321637426900583 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=5951.098 (15)", - "tab": "General information", - "score": 396.73985964912276 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.816, mean=0.826, max=0.832, sum=2.478 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.179, mean=0.209, max=0.243, sum=0.627 (3)", - 
"tab": "Calibration", - "score": 0.20883844550071148 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.714, mean=0.729, max=0.743, sum=2.187 (3)", - "tab": "Robustness", - "score": 0.729 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.758, mean=0.78, max=0.791, sum=2.34 (3)", - "tab": "Fairness", - "score": 0.7799999999999999 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=506.985, mean=694.652, max=952.985, sum=2083.955 (3)", - "tab": "General information", - "score": 694.6516666666666 - }, - "BoolQ - # output tokens": { - "description": "min=2.002, mean=2.002, max=2.002, sum=6.006 (3)", - "tab": "General information", - "score": 2.002 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.737, - "details": { - "description": "min=0.732, mean=0.737, max=0.744, sum=2.21 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.107, mean=0.126, max=0.158, sum=0.377 (3)", - "tab": "Calibration", - "score": 0.12569343029680938 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.49, mean=0.583, max=0.65, sum=1.75 (3)", - "tab": "Robustness", - "score": 0.5834381641862693 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.638, mean=0.645, max=0.651, sum=1.935 (3)", - "tab": "Fairness", - "score": 0.6449807868174807 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.166, mean=2.639, max=3.225, sum=7.918 (3)", - "tab": "General information", - "score": 2.63943661971831 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General 
information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1598.614, mean=1692.218, max=1777.299, sum=5076.654 (3)", - "tab": "General information", - "score": 1692.2178403755868 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.039, mean=5.261, max=5.473, sum=15.783 (3)", - "tab": "General information", - "score": 5.261032863849765 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.4, mean=0.448, max=0.5, sum=1.344 (3)", - "tab": "Bias", - "score": 0.4481481481481482 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.185, mean=0.196, max=0.205, sum=0.587 (3)", - "tab": "Bias", - "score": 0.19550967146595563 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.02, max=0.023, sum=0.059 (3)", - "tab": "Toxicity", - "score": 0.019718309859154928 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.639, - "details": { - "description": "min=0.627, mean=0.639, max=0.649, sum=1.918 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.016, mean=0.018, max=0.019, sum=0.054 (3)", - "tab": "Calibration", - "score": 0.01803156970695322 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.053, mean=0.063, max=0.072, sum=0.188 (3)", - "tab": "Calibration", - "score": 0.06257440554546793 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.277, mean=0.285, max=0.29, sum=0.854 (3)", - "tab": "Robustness", - "score": 0.28458982309414393 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.555, mean=0.564, max=0.568, sum=1.691 (3)", - "tab": "Robustness", - "score": 0.5635162273229849 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.276, mean=0.283, max=0.288, sum=0.85 (3)", - "tab": "Fairness", - "score": 0.2832503879785802 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.569, mean=0.584, max=0.592, sum=1.752 (3)", - "tab": "Fairness", - "score": 0.5839142853000876 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - 
"NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=94.377, mean=99.377, max=102.377, sum=298.131 (3)", - "tab": "General information", - "score": 99.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.466, mean=6.315, max=6.864, sum=18.944 (3)", - "tab": "General information", - "score": 6.314666666666667 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.568, mean=4.666, max=4.734, sum=13.999 (3)", - "tab": "General information", - "score": 4.666333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.114 (3)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1136.933, mean=1418.457, max=1595.508, sum=4255.37 (3)", - "tab": "General information", - "score": 1418.4566666666667 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.441, mean=5.676, max=6.069, sum=17.029 (3)", - "tab": "General information", - "score": 5.676333333333333 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.431, mean=0.507, max=0.569, sum=1.52 (3)", - "tab": "Bias", - "score": 0.5067443890625439 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.1, mean=0.176, max=0.273, sum=0.527 (3)", - "tab": "Bias", - "score": 0.1755244755244755 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.431, mean=0.465, max=0.498, sum=1.395 (3)", - "tab": "Bias", - "score": 0.46507125832968527 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.005, mean=0.03, max=0.053, sum=0.089 (3)", - "tab": "Bias", - "score": 0.02952187967385538 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.0, 
max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418, - "details": { - "description": "min=0.412, mean=0.418, max=0.429, sum=1.255 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.027, mean=0.035, max=0.04, sum=0.105 (3)", - "tab": "Calibration", - "score": 0.03491339390127312 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.271, mean=0.276, max=0.281, sum=0.827 (3)", - "tab": "Robustness", - "score": 0.27557303329747496 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.335, mean=0.34, max=0.35, sum=1.02 (3)", - "tab": "Fairness", - "score": 0.34002521409765923 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=1.788, mean=1.829, max=1.88, sum=5.486 (3)", - "tab": "General information", - "score": 1.8286666666666667 - }, - "QuAC - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "General information", - "score": 0.001 - }, - "QuAC - # prompt tokens": { - "description": "min=1645.856, mean=1698.711, max=1730.814, sum=5096.134 (3)", - "tab": "General information", - "score": 1698.7113333333334 - }, - "QuAC - # output tokens": { - "description": "min=22.04, mean=24.469, max=26.73, sum=73.408 (3)", - "tab": "General information", - "score": 24.469333333333335 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.625, mean=0.64, max=0.651, sum=1.919 (3)", - "tab": "Bias", - "score": 0.6395502645502645 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.389, mean=0.422, max=0.455, sum=1.267 (3)", - "tab": "Bias", - "score": 0.4224807266199369 - }, - "QuAC - Representation (race)": { - "description": "min=0.183, mean=0.23, max=0.263, sum=0.689 (3)", - "tab": "Bias", - "score": 0.22977891012599364 - }, - "QuAC - Representation (gender)": { - "description": "min=0.223, mean=0.224, max=0.225, sum=0.673 (3)", - "tab": "Bias", - "score": 0.22430144583085757 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.009 (3)", - "tab": "Toxicity", - "score": 0.0030000000000000005 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.781, - "details": { - "description": 
"min=0.781, mean=0.781, max=0.781, sum=0.781 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.755, mean=0.755, max=0.755, sum=0.755 (1)", - "tab": "Robustness", - "score": 0.755 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.632, mean=0.632, max=0.632, sum=0.632 (1)", - "tab": "Fairness", - "score": 0.632 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=62.466, mean=62.466, max=62.466, sum=62.466 (1)", - "tab": "General information", - "score": 62.466 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542, - "details": { - "description": "min=0.542, mean=0.542, max=0.542, sum=0.542 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.474 (1)", - "tab": "Robustness", - "score": 0.474 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.466 (1)", - "tab": "Fairness", - "score": 0.466 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=4.348, mean=4.348, max=4.348, sum=4.348 (1)", - "tab": "General information", - "score": 4.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - 
}, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.348, - "details": { - "description": "min=0.287, mean=0.348, max=0.384, sum=1.043 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.073, mean=0.097, max=0.142, sum=0.291 (3)", - "tab": "Calibration", - "score": 0.09707246189445913 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.245, mean=0.293, max=0.326, sum=0.878 (3)", - "tab": "Robustness", - "score": 0.29255861365953106 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.242, mean=0.29, max=0.32, sum=0.87 (3)", - "tab": "Fairness", - "score": 0.2900101936799185 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=317.682, mean=355.015, max=375.682, sum=1065.046 (3)", - "tab": "General information", - "score": 355.0152905198777 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514, - "details": { - "description": "min=0.473, mean=0.514, max=0.577, sum=1.543 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.18, mean=0.227, max=0.253, sum=0.681 (3)", - "tab": "Robustness", - "score": 0.22687976190476158 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.406, mean=0.423, max=0.451, sum=1.269 (3)", - "tab": "Robustness", - "score": 0.42305953691791237 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.206, mean=0.243, max=0.271, sum=0.728 (3)", - "tab": "Fairness", - "score": 0.242712169312169 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.438, mean=0.471, max=0.522, sum=1.413 (3)", - "tab": "Fairness", - "score": 0.47089412794287994 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no 
matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=349.303, mean=385.636, max=423.303, sum=1156.909 (3)", - "tab": "General information", - "score": 385.63633333333337 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=2.003, mean=2.006, max=2.008, sum=6.017 (3)", - "tab": "General information", - "score": 2.005666666666667 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=337.047, mean=373.38, max=411.047, sum=1120.14 (3)", - "tab": "General information", - "score": 373.3798449612403 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=2.023, mean=2.023, max=2.023, sum=6.07 (3)", - "tab": "General information", - "score": 2.0232558139534884 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144, - "details": { - "description": "min=0.131, mean=0.144, max=0.153, sum=0.865 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1203.032, mean=1213.032, max=1224.032, sum=7278.193 (6)", - "tab": "General information", - "score": 1213.0321888412018 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=48.987, mean=55.762, max=59.891, sum=334.571 (6)", - "tab": "General information", - "score": 55.76180257510729 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.619, mean=0.636, max=0.667, sum=3.817 (6)", - "tab": "Bias", - "score": 0.6361416361416362 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.386, mean=0.402, max=0.424, sum=2.411 (6)", - "tab": "Bias", - "score": 0.4017992121362035 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.338, mean=0.359, max=0.379, sum=2.152 (6)", - "tab": "Bias", - "score": 0.3586894722560466 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.099, mean=0.117, max=0.128, sum=0.701 (6)", - "tab": "Bias", - "score": 0.11681135928174619 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.017 (6)", - "tab": "Toxicity", - "score": 0.002861230329041488 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.469, mean=0.503, max=0.535, sum=1.51 (3)", - "tab": "Summarization metrics", - "score": 0.5032610058862116 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.281, mean=0.299, max=0.308, sum=0.896 (3)", - "tab": "Summarization metrics", - "score": 0.2987736324577836 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.953, mean=0.96, max=0.965, sum=5.76 (6)", - "tab": "Summarization metrics", - "score": 0.9600651009447835 - }, - "CNN/DailyMail - Density": { - "description": "min=14.681, mean=22.305, max=27.564, sum=133.827 (6)", - "tab": "Summarization metrics", - "score": 22.304503793993888 - }, - "CNN/DailyMail - Compression": { - "description": "min=10.404, mean=11.399, max=13.033, sum=68.393 (6)", - "tab": "Summarization metrics", - "score": 11.39877050033896 - }, - "CNN/DailyMail 
- HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.167, - "details": { - "description": "min=0.164, mean=0.167, max=0.173, sum=1.005 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1099.388, mean=1133.388, max=1172.388, sum=6800.328 (6)", - "tab": "General information", - "score": 1133.388030888031 - }, - "XSUM - # output tokens": { - "description": "min=21.463, mean=21.75, max=22.241, sum=130.502 (6)", - "tab": "General information", - "score": 21.75032175032175 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.445, mean=0.456, max=0.463, sum=2.736 (6)", - "tab": "Bias", - "score": 0.4559853927203065 - }, - "XSUM - Representation (race)": { - "description": "min=0.362, mean=0.466, max=0.532, sum=2.798 (6)", - "tab": "Bias", - "score": 0.4664089053990878 - }, - "XSUM - Representation (gender)": { - "description": "min=0.192, mean=0.207, max=0.233, sum=1.24 (6)", - "tab": "Bias", - "score": 0.2066101848280066 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.0006435006435006435 - }, - "XSUM - SummaC": { - "description": "min=-0.31, mean=-0.289, max=-0.268, sum=-0.868 (3)", - "tab": "Summarization metrics", - "score": -0.2893415716573027 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.47, mean=0.475, max=0.48, sum=1.424 (3)", - "tab": "Summarization metrics", - "score": 0.474663326872436 - }, - "XSUM - Coverage": { - "description": "min=0.761, mean=0.766, max=0.771, sum=4.596 (6)", - "tab": "Summarization metrics", - "score": 0.7660021617230298 - }, - "XSUM - Density": { - "description": "min=2.196, mean=2.36, 
max=2.464, sum=14.158 (6)", - "tab": "Summarization metrics", - "score": 2.359653576011524 - }, - "XSUM - Compression": { - "description": "min=16.605, mean=17.045, max=17.3, sum=102.267 (6)", - "tab": "Summarization metrics", - "score": 17.044545661784866 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "description": "min=0.926, mean=0.938, max=0.954, sum=2.814 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.088, mean=0.111, max=0.153, sum=0.333 (3)", - "tab": "Calibration", - "score": 0.11088831926219649 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.915, mean=0.928, max=0.949, sum=2.784 (3)", - "tab": "Robustness", - "score": 0.9279999999999999 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.92, mean=0.931, max=0.951, sum=2.792 (3)", - "tab": "Fairness", - "score": 0.9306666666666666 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.915, mean=4.972, max=5, sum=14.915 (3)", - "tab": "General information", - "score": 4.971666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=853.851, mean=1281.577, max=1725.03, sum=3844.732 (3)", - "tab": "General information", - "score": 1281.5773333333334 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547, - "details": { - "description": "min=0.011, mean=0.547, max=0.998, sum=29.525 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.097, mean=0.381, max=0.605, sum=20.56 (54)", - "tab": "Calibration", - "score": 0.38073513412444826 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.488, max=0.986, sum=26.326 (54)", - "tab": "Robustness", - "score": 0.4875180109221431 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.445, max=0.973, sum=24.007 (54)", - "tab": "Fairness", - "score": 0.44457169485758724 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=271.927, mean=532.602, max=942.498, sum=28760.487 (54)", - "tab": "General information", - "score": 532.6016121330534 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.712, - "details": { - "description": "min=0.225, mean=0.712, max=0.975, sum=23.5 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.119, mean=0.232, max=0.581, sum=7.664 (33)", - "tab": "Calibration", - "score": 0.23222744852932867 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.618, max=0.875, sum=20.4 (33)", - 
"tab": "Robustness", - "score": 0.6181818181818182 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.2, mean=0.689, max=0.975, sum=22.725 (33)", - "tab": "Fairness", - "score": 0.6886363636363637 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.95, mean=4.658, max=5, sum=153.7 (33)", - "tab": "General information", - "score": 4.657575757575757 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=212.25, mean=712.248, max=1745.25, sum=23504.175 (33)", - "tab": "General information", - "score": 712.2477272727273 - }, - "RAFT - # output tokens": { - "description": "min=1.95, mean=3.644, max=6.3, sum=120.25 (33)", - "tab": "General information", - "score": 3.643939393939394 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json b/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json deleted file mode 100644 index 6d0308b9f..000000000 --- a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Jumbo-178B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jurassic-2 Jumbo 178B", - "id": "ai21/Jurassic-2-Jumbo-178B", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6597594819611471 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.7910296229539834 - }, - "Mean win rate - Fairness": { - 
"description": null, - "tab": "Fairness", - "score": 0.8360206534288848 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5968189835436076 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5064102564102564 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6447368421052632 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.23, mean=0.48, max=0.83, sum=7.207 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.056, mean=0.137, max=0.248, sum=2.059 (15)", - "tab": "Calibration", - "score": 0.13723997934779486 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.17, mean=0.417, max=0.75, sum=6.251 (15)", - "tab": "Robustness", - "score": 0.41671345029239765 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.21, mean=0.45, max=0.78, sum=6.75 (15)", - "tab": "Fairness", - "score": 0.44997660818713453 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=5951.098 (15)", - "tab": "General information", - "score": 396.73985964912276 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.818, mean=0.829, max=0.838, sum=2.487 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.163, mean=0.175, max=0.198, sum=0.526 (3)", - "tab": "Calibration", - "score": 0.17545319159294462 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.72, mean=0.729, max=0.736, 
sum=2.188 (3)", - "tab": "Robustness", - "score": 0.7293333333333333 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.78, mean=0.792, max=0.798, sum=2.375 (3)", - "tab": "Fairness", - "score": 0.7916666666666666 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=506.985, mean=694.652, max=952.985, sum=2083.955 (3)", - "tab": "General information", - "score": 694.6516666666666 - }, - "BoolQ - # output tokens": { - "description": "min=2, mean=2.002, max=2.003, sum=6.005 (3)", - "tab": "General information", - "score": 2.0016666666666665 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.715, mean=0.733, max=0.757, sum=2.2 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.068, mean=0.073, max=0.076, sum=0.219 (3)", - "tab": "Calibration", - "score": 0.07310994320832209 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.627, mean=0.66, max=0.69, sum=1.98 (3)", - "tab": "Robustness", - "score": 0.6601600341725052 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.63, mean=0.658, max=0.69, sum=1.973 (3)", - "tab": "Fairness", - "score": 0.6577011654908803 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=2534.434, mean=2818.1, max=3027.434, sum=8454.301 (3)", - 
"tab": "General information", - "score": 2818.1004694835683 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.879, mean=6.406, max=7.755, sum=19.217 (3)", - "tab": "General information", - "score": 6.405633802816901 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.385, mean=0.43, max=0.5, sum=1.29 (3)", - "tab": "Bias", - "score": 0.4298611111111111 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.5, max=0.667, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.171, mean=0.183, max=0.192, sum=0.55 (3)", - "tab": "Bias", - "score": 0.18345814920903128 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.017, max=0.02, sum=0.051 (3)", - "tab": "Toxicity", - "score": 0.016901408450704227 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669, - "details": { - "description": "min=0.65, mean=0.669, max=0.681, sum=2.007 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.018, mean=0.018, max=0.019, sum=0.054 (3)", - "tab": "Calibration", - "score": 0.018133452831606698 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.071, mean=0.073, max=0.076, sum=0.22 (3)", - "tab": "Calibration", - "score": 0.07345259187429393 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.31, mean=0.315, max=0.318, sum=0.945 (3)", - "tab": "Robustness", - "score": 0.3150688575152197 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.576, mean=0.599, max=0.616, sum=1.796 (3)", - "tab": "Robustness", - "score": 0.5985032886794094 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.326, mean=0.327, max=0.328, sum=0.982 (3)", - "tab": "Fairness", - "score": 0.32739768950953246 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.601, mean=0.62, max=0.633, sum=1.86 (3)", - "tab": "Fairness", - "score": 0.6201543217700605 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=94.377, mean=99.377, max=102.377, sum=298.131 (3)", - "tab": "General information", - "score": 99.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.127, mean=5.365, max=5.79, sum=16.095 (3)", - "tab": "General information", - "score": 5.364999999999999 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.928, mean=4.93, max=4.932, sum=14.791 (3)", - "tab": "General information", - "score": 4.9303333333333335 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.012, mean=0.012, max=0.012, sum=0.036 (3)", - "tab": "General information", - "score": 0.012000000000000002 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1254.565, mean=1571.171, max=1771.274, sum=4713.512 (3)", - "tab": "General information", - "score": 1571.1706666666669 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=4.785, mean=5.113, max=5.399, sum=15.338 (3)", - "tab": "General information", - "score": 5.112666666666667 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.352, mean=0.376, max=0.405, sum=1.127 (3)", - "tab": "Bias", - "score": 0.3756261756261756 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.033, mean=0.095, max=0.136, sum=0.285 (3)", - "tab": "Bias", - "score": 0.09502719502719503 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.3, mean=0.413, max=0.5, sum=1.238 (3)", - "tab": "Bias", - "score": 0.41250000000000003 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.514, mean=0.541, max=0.561, sum=1.624 (3)", - "tab": "Bias", - "score": 0.5414311179017061 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.06, mean=0.107, max=0.132, sum=0.321 (3)", - "tab": "Bias", - "score": 0.10706952566601687 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435, - "details": { - "description": "min=0.426, mean=0.435, max=0.446, sum=1.305 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.032, mean=0.035, max=0.037, sum=0.104 (3)", - "tab": "Calibration", - "score": 0.03466023181877799 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.31, mean=0.314, max=0.316, sum=0.941 (3)", - "tab": "Robustness", - "score": 0.3135172870245195 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.333, mean=0.34, max=0.348, sum=1.02 (3)", - "tab": "Fairness", - "score": 0.34006270092560414 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=4.999, mean=5.0, max=5, sum=14.999 (3)", - "tab": "General information", - "score": 4.999666666666666 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=3587.32, mean=4018.779, max=4568.698, sum=12056.338 (3)", - "tab": "General information", - "score": 4018.7793333333334 - }, - "QuAC - # output tokens": { - "description": "min=21.621, mean=22.178, max=22.826, sum=66.533 (3)", - "tab": "General information", - "score": 22.177666666666664 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.615, mean=0.642, max=0.667, sum=1.925 (3)", - "tab": "Bias", - "score": 0.6416361416361417 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.425, mean=0.454, max=0.476, sum=1.363 (3)", - "tab": "Bias", - "score": 0.45448951168627727 - }, - "QuAC - Representation (race)": { - "description": "min=0.342, mean=0.359, max=0.375, sum=1.078 (3)", - "tab": "Bias", - "score": 0.35949126363389555 - }, - "QuAC - Representation (gender)": { - "description": "min=0.22, mean=0.232, max=0.241, sum=0.696 (3)", - "tab": "Bias", - "score": 0.23190752816365634 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=0.788 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.754, 
mean=0.754, max=0.754, sum=0.754 (1)", - "tab": "Robustness", - "score": 0.754 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.655, mean=0.655, max=0.655, sum=0.655 (1)", - "tab": "Fairness", - "score": 0.655 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=62.466, mean=62.466, max=62.466, sum=62.466 (1)", - "tab": "General information", - "score": 62.466 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.558, - "details": { - "description": "min=0.558, mean=0.558, max=0.558, sum=0.558 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.47 (1)", - "tab": "Robustness", - "score": 0.47 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.488, mean=0.488, max=0.488, sum=0.488 (1)", - "tab": "Fairness", - "score": 0.488 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=4.348, mean=4.348, max=4.348, sum=4.348 (1)", - "tab": "General information", - "score": 4.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437, - "details": { - "description": "min=0.367, mean=0.437, max=0.485, sum=1.312 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.049, mean=0.068, max=0.095, sum=0.203 (3)", - "tab": "Calibration", - "score": 0.06751578986419772 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.326, mean=0.39, max=0.43, sum=1.17 (3)", - "tab": "Robustness", - "score": 0.38990825688073394 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.289, mean=0.354, max=0.398, sum=1.063 (3)", - "tab": "Fairness", - "score": 0.35423037716615696 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=317.682, mean=355.015, max=375.682, sum=1065.046 (3)", - "tab": "General information", - "score": 355.0152905198777 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.62, mean=0.661, max=0.706, sum=1.982 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.333, mean=0.337, max=0.343, sum=1.012 (3)", - "tab": "Robustness", - "score": 0.3372691798941794 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.569, mean=0.607, max=0.639, sum=1.821 (3)", - "tab": "Robustness", - "score": 0.6069545244562901 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.339, mean=0.342, max=0.346, sum=1.027 (3)", - "tab": "Fairness", - "score": 0.34235396825396786 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.578, mean=0.62, max=0.66, sum=1.861 (3)", - "tab": "Fairness", - "score": 0.6202649047028815 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", 
- "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=349.303, mean=385.636, max=423.303, sum=1156.909 (3)", - "tab": "General information", - "score": 385.63633333333337 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=2, mean=2.001, max=2.003, sum=6.003 (3)", - "tab": "General information", - "score": 2.001 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=337.047, mean=373.38, max=411.047, sum=1120.14 (3)", - "tab": "General information", - "score": 373.3798449612403 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.149, - "details": { - "description": "min=0.142, mean=0.149, max=0.157, sum=0.892 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1203.032, mean=1213.032, max=1224.032, sum=7278.193 (6)", - "tab": "General information", - "score": 1213.0321888412018 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=47.208, mean=49.239, max=51.633, sum=295.433 (6)", - "tab": "General information", - "score": 49.238912732474965 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.593, mean=0.608, max=0.618, sum=3.649 (6)", - "tab": "Bias", - "score": 0.6082305358040653 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.396, mean=0.411, max=0.434, sum=2.467 (6)", - "tab": "Bias", - "score": 0.4111171483483329 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.177, mean=0.254, max=0.301, sum=1.526 (6)", - "tab": "Bias", - "score": 0.25438070908615346 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.064, mean=0.083, max=0.119, sum=0.497 (6)", - "tab": "Bias", - "score": 0.08290586755395449 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.442, mean=0.489, max=0.543, sum=1.468 (3)", - "tab": "Summarization metrics", - "score": 0.48944984939262354 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.299, mean=0.313, max=0.33, sum=0.94 (3)", - "tab": "Summarization metrics", - "score": 0.31320318480412634 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.952, mean=0.957, max=0.964, sum=5.745 (6)", - "tab": "Summarization metrics", - "score": 0.9574608785885589 - }, - "CNN/DailyMail - Density": { - "description": "min=12.535, mean=15.317, max=20.424, sum=91.904 (6)", - "tab": "Summarization metrics", - "score": 15.31737957113954 - }, - "CNN/DailyMail - Compression": { - "description": "min=11.81, mean=12.304, max=13.072, sum=73.827 (6)", - "tab": "Summarization metrics", - "score": 12.30449736723726 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - 
"description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182, - "details": { - "description": "min=0.177, mean=0.182, max=0.186, sum=1.09 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1099.388, mean=1133.388, max=1172.388, sum=6800.328 (6)", - "tab": "General information", - "score": 1133.388030888031 - }, - "XSUM - # output tokens": { - "description": "min=21.909, mean=22.142, max=22.392, sum=132.853 (6)", - "tab": "General information", - "score": 22.142213642213644 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.451, mean=0.466, max=0.478, sum=2.796 (6)", - "tab": "Bias", - "score": 0.4660306771417882 - }, - "XSUM - Representation (race)": { - "description": "min=0.362, mean=0.399, max=0.429, sum=2.397 (6)", - "tab": "Bias", - "score": 0.39943255885284873 - }, - "XSUM - Representation (gender)": { - "description": "min=0.189, mean=0.205, max=0.224, sum=1.232 (6)", - "tab": "Bias", - "score": 0.20538608377971754 - }, - "XSUM - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.019 (6)", - "tab": "Toxicity", - "score": 0.0032175032175032173 - }, - "XSUM - SummaC": { - "description": "min=-0.325, mean=-0.32, max=-0.314, sum=-0.96 (3)", - "tab": "Summarization metrics", - "score": -0.31997175372142944 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.484, mean=0.489, max=0.493, sum=1.468 (3)", - "tab": "Summarization metrics", - "score": 0.4894925021585029 - }, - "XSUM - Coverage": { - "description": "min=0.75, mean=0.755, max=0.761, sum=4.53 (6)", - "tab": "Summarization metrics", - "score": 0.7549647155240389 - }, - "XSUM - Density": { - "description": "min=1.852, mean=2.145, max=2.331, sum=12.869 (6)", - "tab": "Summarization metrics", - "score": 2.144865535443147 - }, - "XSUM - Compression": { - "description": "min=16.369, mean=16.589, max=16.81, sum=99.535 
(6)", - "tab": "Summarization metrics", - "score": 16.58922760069323 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "description": "min=0.936, mean=0.938, max=0.943, sum=2.815 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.157, mean=0.182, max=0.199, sum=0.546 (3)", - "tab": "Calibration", - "score": 0.18203122522171636 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.878, mean=0.896, max=0.916, sum=2.688 (3)", - "tab": "Robustness", - "score": 0.896 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.928, mean=0.933, max=0.937, sum=2.799 (3)", - "tab": "Fairness", - "score": 0.9329999999999999 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=853.851, mean=1288.518, max=1745.851, sum=3865.553 (3)", - "tab": "General information", - "score": 1288.5176666666669 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "description": "min=0.011, mean=0.57, max=1, sum=30.805 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.07, mean=0.314, max=0.578, sum=16.962 (54)", - "tab": "Calibration", - "score": 0.31411210820302815 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.009, mean=0.449, max=0.979, sum=24.224 (54)", - "tab": "Robustness", - "score": 0.4485846578472439 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.005, mean=0.507, max=0.995, sum=27.37 (54)", - "tab": "Fairness", - "score": 0.5068507198702314 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=271.927, mean=532.602, max=942.498, sum=28760.487 (54)", - "tab": "General information", - "score": 532.6016121330534 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.225, mean=0.746, max=0.975, sum=24.625 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.126, mean=0.218, max=0.683, sum=7.184 (33)", - "tab": "Calibration", - "score": 0.2177038585857703 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.225, mean=0.69, max=0.95, sum=22.775 (33)", - "tab": "Robustness", - "score": 0.6901515151515151 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.225, mean=0.711, max=0.975, sum=23.45 (33)", - "tab": "Fairness", - "score": 0.7106060606060605 - }, - "RAFT - Denoised inference time 
(s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=5, mean=5, max=5, sum=165 (33)", - "tab": "General information", - "score": 5.0 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=212.25, mean=944.157, max=4506.05, sum=31157.175 (33)", - "tab": "General information", - "score": 944.1568181818182 - }, - "RAFT - # output tokens": { - "description": "min=2, mean=3.597, max=7.275, sum=118.7 (33)", - "tab": "General information", - "score": 3.5969696969696967 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json b/data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json deleted file mode 100644 index 4278cef81..000000000 --- a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Large-7.5B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jurassic-2 Large 7.5B", - "id": "ai21/Jurassic-2-Large-7.5B", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6435013876040703 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5267325431952796 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.48311004284307957 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General 
information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4461156665667944 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4555798368298368 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5723684210526315 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.339, - "details": { - "description": "min=0.211, mean=0.339, max=0.5, sum=5.078 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.06, mean=0.141, max=0.219, sum=2.11 (15)", - "tab": "Calibration", - "score": 0.1406708954092635 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.17, mean=0.263, max=0.42, sum=3.938 (15)", - "tab": "Robustness", - "score": 0.2625146198830409 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.167, mean=0.297, max=0.45, sum=4.453 (15)", - "tab": "Fairness", - "score": 0.2968421052631579 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=5951.098 (15)", - "tab": "General information", - "score": 396.73985964912276 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.737, mean=0.742, max=0.747, sum=2.227 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.126, mean=0.147, max=0.165, sum=0.442 (3)", - "tab": "Calibration", - "score": 0.14720347227904834 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.602, mean=0.607, max=0.615, sum=1.822 (3)", - "tab": "Robustness", - "score": 0.6073333333333334 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.675, mean=0.685, max=0.697, sum=2.055 (3)", - "tab": "Fairness", - "score": 0.685 - }, - "BoolQ - Denoised inference time (s)": { - 
"description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=506.985, mean=694.652, max=952.985, sum=2083.955 (3)", - "tab": "General information", - "score": 694.6516666666666 - }, - "BoolQ - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "NarrativeQA - F1 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NarrativeQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NarrativeQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NarrativeQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NarrativeQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NarrativeQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NarrativeQA - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (race)": { - 
"description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589, - "details": { - "description": "min=0.576, mean=0.589, max=0.605, sum=1.766 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.008, mean=0.014, max=0.021, sum=0.042 (3)", - "tab": "Calibration", - "score": 0.01399000614897039 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.081, mean=0.084, max=0.089, sum=0.253 (3)", - "tab": "Calibration", - "score": 0.08428284450081218 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.177, mean=0.187, max=0.195, sum=0.562 (3)", - "tab": "Robustness", - "score": 0.18733342573827472 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.485, mean=0.503, max=0.529, sum=1.51 (3)", - "tab": "Robustness", - "score": 0.5031846716563587 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.215, mean=0.217, max=0.221, sum=0.652 (3)", - "tab": "Fairness", - "score": 0.21726190588701 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.53, mean=0.539, max=0.557, sum=1.616 (3)", - "tab": "Fairness", - "score": 0.5388295929563434 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=94.377, mean=99.377, max=102.377, sum=298.131 (3)", - "tab": "General information", - "score": 99.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.924, mean=6.729, max=7.956, sum=20.187 (3)", - "tab": "General information", - "score": 6.729 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 
1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.568, mean=4.666, max=4.734, sum=13.999 (3)", - "tab": "General information", - "score": 4.666333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.114 (3)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1136.933, mean=1418.457, max=1595.508, sum=4255.37 (3)", - "tab": "General information", - "score": 1418.4566666666667 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.825, mean=6.311, max=6.845, sum=18.932 (3)", - "tab": "General information", - "score": 6.310666666666666 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.502, mean=0.531, max=0.563, sum=1.594 (3)", - "tab": "Bias", - "score": 0.5313654482080615 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0, mean=0.079, max=0.192, sum=0.238 (3)", - "tab": "Bias", - "score": 0.07925407925407925 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.3, mean=0.433, max=0.5, sum=1.3 (3)", - "tab": "Bias", - "score": 0.43333333333333335 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.441, mean=0.504, max=0.574, sum=1.513 (3)", - "tab": "Bias", - "score": 0.5041929581337629 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.186, mean=0.203, max=0.225, sum=0.608 (3)", - "tab": "Bias", - "score": 0.20273109243697482 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "QuAC - F1 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "QuAC - Denoised inference time (s)": { - "description": 
"No matching runs", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "QuAC - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "QuAC - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "QuAC - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "QuAC - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "QuAC - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "QuAC - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "QuAC - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "QuAC - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "QuAC - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "QuAC - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.729, - "details": { - "description": "min=0.729, mean=0.729, max=0.729, sum=0.729 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.687, mean=0.687, max=0.687, sum=0.687 (1)", - "tab": "Robustness", - "score": 0.687 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=0.567 (1)", - "tab": "Fairness", - "score": 0.567 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=62.466, mean=62.466, max=62.466, sum=62.466 (1)", - "tab": "General information", - "score": 62.466 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=0.53 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.448 (1)", - "tab": "Robustness", - "score": 0.448 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.45 (1)", - "tab": "Fairness", - "score": 0.45 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=4.348, mean=4.348, max=4.348, sum=4.348 (1)", - "tab": "General information", - "score": 4.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245, - "details": { - "description": "min=0.22, mean=0.245, max=0.283, sum=0.734 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.043, mean=0.102, max=0.134, sum=0.306 (3)", - "tab": "Calibration", - "score": 0.1021312296645796 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.197, mean=0.21, max=0.228, sum=0.63 (3)", - "tab": "Robustness", - "score": 0.20998980632008157 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.185, mean=0.196, max=0.205, sum=0.589 (3)", - "tab": "Fairness", - "score": 0.1962283384301733 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - 
"description": "min=317.682, mean=355.015, max=375.682, sum=1065.046 (3)", - "tab": "General information", - "score": 355.0152905198777 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.464, - "details": { - "description": "min=0.454, mean=0.464, max=0.479, sum=1.393 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.168, mean=0.177, max=0.186, sum=0.532 (3)", - "tab": "Robustness", - "score": 0.1774849206349205 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.391, mean=0.397, max=0.403, sum=1.192 (3)", - "tab": "Robustness", - "score": 0.39737317282374035 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.21, mean=0.215, max=0.221, sum=0.646 (3)", - "tab": "Fairness", - "score": 0.21544642857142837 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.432, mean=0.44, max=0.457, sum=1.32 (3)", - "tab": "Fairness", - "score": 0.44015360771598083 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=349.303, mean=385.636, max=423.303, sum=1156.909 (3)", - "tab": "General information", - "score": 385.63633333333337 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=2.006, mean=2.012, max=2.022, sum=6.037 (3)", - "tab": "General information", - "score": 2.012333333333333 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=337.047, mean=373.38, max=411.047, sum=1120.14 (3)", - "tab": 
"General information", - "score": 373.3798449612403 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=2.023, mean=2.023, max=2.023, sum=6.07 (3)", - "tab": "General information", - "score": 2.0232558139534884 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136, - "details": { - "description": "min=0.122, mean=0.136, max=0.15, sum=0.813 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1203.032, mean=1213.032, max=1224.032, sum=7278.193 (6)", - "tab": "General information", - "score": 1213.0321888412018 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=52.573, mean=58.246, max=61.575, sum=349.476 (6)", - "tab": "General information", - "score": 58.24606580829757 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - 
"CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.612, mean=0.647, max=0.667, sum=3.885 (6)", - "tab": "Bias", - "score": 0.6474734228728262 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.365, mean=0.405, max=0.442, sum=2.432 (6)", - "tab": "Bias", - "score": 0.405313769914252 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.175, mean=0.245, max=0.377, sum=1.468 (6)", - "tab": "Bias", - "score": 0.24474724360307878 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.103, mean=0.133, max=0.149, sum=0.796 (6)", - "tab": "Bias", - "score": 0.13266873135824753 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.465, mean=0.496, max=0.548, sum=1.488 (3)", - "tab": "Summarization metrics", - "score": 0.49606841741715785 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.242, mean=0.271, max=0.304, sum=0.812 (3)", - "tab": "Summarization metrics", - "score": 0.27057214623114106 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.952, mean=0.963, max=0.98, sum=5.779 (6)", - "tab": "Summarization metrics", - "score": 0.9630886941006946 - }, - "CNN/DailyMail - Density": { - "description": "min=15.279, mean=25.251, max=36.976, sum=151.506 (6)", - "tab": "Summarization metrics", - "score": 25.250963083991945 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.923, mean=11.503, max=13.28, sum=69.019 (6)", - "tab": "Summarization metrics", - "score": 11.503115138085485 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.142, - "details": { - "description": "min=0.14, mean=0.142, max=0.145, sum=0.853 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1099.388, mean=1133.388, max=1172.388, sum=6800.328 (6)", - "tab": 
"General information", - "score": 1133.388030888031 - }, - "XSUM - # output tokens": { - "description": "min=21.112, mean=21.228, max=21.315, sum=127.371 (6)", - "tab": "General information", - "score": 21.22844272844273 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.433, mean=0.464, max=0.492, sum=2.785 (6)", - "tab": "Bias", - "score": 0.46417690732206857 - }, - "XSUM - Representation (race)": { - "description": "min=0.407, mean=0.58, max=0.667, sum=3.481 (6)", - "tab": "Bias", - "score": 0.5802469135802469 - }, - "XSUM - Representation (gender)": { - "description": "min=0.209, mean=0.22, max=0.234, sum=1.321 (6)", - "tab": "Bias", - "score": 0.2200902099970423 - }, - "XSUM - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.306, mean=-0.278, max=-0.26, sum=-0.833 (3)", - "tab": "Summarization metrics", - "score": -0.27758991887056994 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.449, mean=0.45, max=0.451, sum=1.35 (3)", - "tab": "Summarization metrics", - "score": 0.44989833153156206 - }, - "XSUM - Coverage": { - "description": "min=0.781, mean=0.782, max=0.783, sum=4.694 (6)", - "tab": "Summarization metrics", - "score": 0.7823704015893701 - }, - "XSUM - Density": { - "description": "min=2.345, mean=2.659, max=2.826, sum=15.954 (6)", - "tab": "Summarization metrics", - "score": 2.6589249165198687 - }, - "XSUM - Compression": { - "description": "min=17.896, mean=18.03, max=18.26, sum=108.178 (6)", - "tab": "Summarization metrics", - "score": 18.02961749079778 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.956, - "details": { - "description": "min=0.952, mean=0.956, max=0.96, sum=2.869 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.153, mean=0.178, max=0.201, sum=0.534 (3)", - "tab": "Calibration", - "score": 0.17816129477822015 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.935, mean=0.941, max=0.946, sum=2.822 (3)", - "tab": "Robustness", - "score": 0.9406666666666667 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.941, mean=0.945, max=0.951, sum=2.835 (3)", - 
"tab": "Fairness", - "score": 0.945 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.915, mean=4.972, max=5, sum=14.915 (3)", - "tab": "General information", - "score": 4.971666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=853.851, mean=1281.577, max=1725.03, sum=3844.732 (3)", - "tab": "General information", - "score": 1281.5773333333334 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "description": "min=0.149, mean=0.57, max=0.909, sum=30.8 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.033, mean=0.19, max=0.41, sum=10.274 (54)", - "tab": "Calibration", - "score": 0.19026595574841215 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.116, mean=0.469, max=0.844, sum=25.305 (54)", - "tab": "Robustness", - "score": 0.4686089323926605 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.143, mean=0.403, max=0.834, sum=21.752 (54)", - "tab": "Fairness", - "score": 0.4028192827891808 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=271.927, mean=532.602, max=942.498, sum=28760.487 
(54)", - "tab": "General information", - "score": 532.6016121330534 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.622, - "details": { - "description": "min=0.25, mean=0.622, max=0.975, sum=20.525 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.128, mean=0.254, max=0.441, sum=8.368 (33)", - "tab": "Calibration", - "score": 0.25356461082010057 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.498, max=0.975, sum=16.425 (33)", - "tab": "Robustness", - "score": 0.49772727272727263 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.2, mean=0.567, max=0.975, sum=18.725 (33)", - "tab": "Fairness", - "score": 0.5674242424242424 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.95, mean=4.658, max=5, sum=153.7 (33)", - "tab": "General information", - "score": 4.657575757575757 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=212.25, mean=712.248, max=1745.25, sum=23504.175 (33)", - "tab": "General information", - "score": 712.2477272727273 - }, - "RAFT - # output tokens": { - "description": "min=1.975, mean=3.562, max=6.575, sum=117.55 (33)", - "tab": "General information", - "score": 3.5621212121212116 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } 
- } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json b/data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json deleted file mode 100644 index 7e02805f7..000000000 --- a/data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Base-13B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminous Base 13B", - "id": "aleph-alpha/Luminous-Base-13B", - "developer": "aleph-alpha", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6405642923219241 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.31855477855477854 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.23762237762237765 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5516493320513314 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5035063701730368 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.42105263157894735 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.193, mean=0.27, max=0.32, sum=4.045 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.087, mean=0.111, max=0.157, sum=1.661 (15)", - "tab": "Calibration", - "score": 0.110752611571227 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.1, mean=0.183, max=0.27, sum=2.74 (15)", - "tab": "Robustness", - "score": 0.1826549707602339 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.09, mean=0.185, max=0.27, sum=2.769 (15)", - "tab": "Fairness", - "score": 0.1845730994152047 - }, - "MMLU - Denoised 
inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=360.75, mean=471.075, max=618.447, sum=7066.132 (15)", - "tab": "General information", - "score": 471.0754736842105 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719, - "details": { - "description": "min=0.7, mean=0.719, max=0.74, sum=2.156 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.056, mean=0.066, max=0.084, sum=0.197 (3)", - "tab": "Calibration", - "score": 0.06557915095556173 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.643, mean=0.655, max=0.673, sum=1.965 (3)", - "tab": "Robustness", - "score": 0.655 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.634, mean=0.653, max=0.682, sum=1.958 (3)", - "tab": "Fairness", - "score": 0.6526666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=651.658, mean=908.991, max=1252.658, sum=2726.974 (3)", - "tab": "General information", - "score": 908.9913333333333 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1.002, max=1.003, sum=3.006 (3)", - "tab": "General information", - "score": 1.002 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605, - "details": { - "description": "min=0.577, mean=0.605, max=0.633, sum=1.815 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.04, mean=0.048, max=0.063, sum=0.145 (3)", - "tab": "Calibration", - "score": 0.04822831549746422 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.444, mean=0.476, max=0.505, sum=1.429 (3)", - "tab": "Robustness", - "score": 0.4761726989393548 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.462, mean=0.498, max=0.532, sum=1.495 (3)", - "tab": "Fairness", - "score": 0.4982467496641079 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.039, mean=1.621, max=2.037, sum=4.862 (3)", - "tab": "General information", - "score": 1.6206572769953052 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1606.952, mean=1647.783, max=1694.642, sum=4943.349 (3)", - "tab": "General information", - "score": 1647.783098591549 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.521, mean=6.798, max=8.192, sum=20.394 (3)", - "tab": "General information", - "score": 6.798122065727699 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.396, mean=0.438, max=0.5, sum=1.313 (3)", - "tab": "Bias", - "score": 0.4375901875901876 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.556, max=0.667, sum=1.667 (3)", - "tab": "Bias", - "score": 0.5555555555555557 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.152, mean=0.172, max=0.197, sum=0.516 (3)", - "tab": "Bias", - "score": 0.1718450326045263 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.02, mean=0.022, max=0.025, sum=0.065 (3)", - "tab": "Toxicity", - "score": 0.0215962441314554 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.568, - "details": { - "description": 
"min=0.563, mean=0.568, max=0.577, sum=1.705 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.039, mean=0.045, max=0.054, sum=0.136 (3)", - "tab": "Calibration", - "score": 0.04534548194935659 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.068, mean=0.07, max=0.074, sum=0.21 (3)", - "tab": "Calibration", - "score": 0.07013609628734997 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.157, mean=0.163, max=0.168, sum=0.489 (3)", - "tab": "Robustness", - "score": 0.1628593597054443 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.484, mean=0.491, max=0.498, sum=1.474 (3)", - "tab": "Robustness", - "score": 0.4912891920785376 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.156, mean=0.16, max=0.164, sum=0.481 (3)", - "tab": "Fairness", - "score": 0.16022586408623682 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.505, mean=0.511, max=0.515, sum=1.534 (3)", - "tab": "Fairness", - "score": 0.5114691771549933 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.087, mean=111.754, max=116.087, sum=335.261 (3)", - "tab": "General information", - "score": 111.75366666666667 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.314, mean=5.287, max=5.908, sum=15.861 (3)", - "tab": "General information", - "score": 5.287 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.691, mean=4.711, max=4.726, sum=14.134 (3)", - "tab": "General information", - "score": 4.711333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.039, max=0.04, sum=0.116 (3)", - "tab": "General information", - "score": 0.03866666666666666 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1224.733, mean=1384.565, max=1488.14, sum=4153.695 (3)", - "tab": "General information", - "score": 1384.5649999999998 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.685, mean=10.15, max=11.898, sum=30.449 (3)", - "tab": "General information", - "score": 10.149666666666667 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 
3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.25, mean=0.417, max=0.5, sum=1.25 (3)", - "tab": "Bias", - "score": 0.4166666666666667 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.339, mean=0.433, max=0.5, sum=1.298 (3)", - "tab": "Bias", - "score": 0.43278417840114286 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.081, mean=0.162, max=0.239, sum=0.486 (3)", - "tab": "Bias", - "score": 0.16214742091319934 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.3, mean=0.432, max=0.5, sum=1.296 (3)", - "tab": "Bias", - "score": 0.432010582010582 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.429, mean=0.457, max=0.498, sum=1.37 (3)", - "tab": "Bias", - "score": 0.45656911106888937 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.272, mean=0.32, max=0.416, sum=0.961 (3)", - "tab": "Bias", - "score": 0.3202891068062547 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.003, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.334, - "details": { - "description": "min=0.317, mean=0.334, max=0.362, sum=1.003 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.068, mean=0.098, max=0.131, sum=0.295 (3)", - "tab": "Calibration", - "score": 0.09821008405024316 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.166, mean=0.185, max=0.212, sum=0.556 (3)", - "tab": "Robustness", - "score": 0.18543862521458307 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.251, mean=0.266, max=0.284, sum=0.799 (3)", - "tab": "Fairness", - "score": 0.2662906470176498 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.84, mean=0.909, max=0.991, sum=2.727 (3)", - "tab": "General information", - "score": 0.9089999999999999 - }, - "QuAC - truncated": { - "description": "min=0.029, mean=0.033, max=0.037, sum=0.098 (3)", - "tab": "General information", - "score": 0.03266666666666667 - }, - "QuAC - # prompt tokens": { - "description": "min=1596.904, mean=1641.256, max=1672.92, sum=4923.768 (3)", - "tab": "General information", - 
"score": 1641.256 - }, - "QuAC - # output tokens": { - "description": "min=18.527, mean=23.472, max=28.795, sum=70.415 (3)", - "tab": "General information", - "score": 23.471666666666668 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.641, mean=0.658, max=0.667, sum=1.974 (3)", - "tab": "Bias", - "score": 0.6581196581196581 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.401, mean=0.417, max=0.432, sum=1.251 (3)", - "tab": "Bias", - "score": 0.41695983406755 - }, - "QuAC - Representation (race)": { - "description": "min=0.258, mean=0.32, max=0.377, sum=0.96 (3)", - "tab": "Bias", - "score": 0.3200297021845843 - }, - "QuAC - Representation (gender)": { - "description": "min=0.193, mean=0.203, max=0.212, sum=0.61 (3)", - "tab": "Bias", - "score": 0.20338227449992274 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - 
"score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182, - "details": { - "description": "min=0.165, mean=0.182, max=0.194, sum=0.547 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.069, mean=0.081, max=0.095, sum=0.244 (3)", - "tab": "Calibration", - "score": 0.08144933240589737 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.107, mean=0.112, max=0.118, sum=0.335 (3)", - "tab": "Robustness", - "score": 0.11162079510703364 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.118, mean=0.125, max=0.13, sum=0.375 (3)", - "tab": "Fairness", - "score": 0.12487257900101938 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=504.073, mean=514.073, max=533.073, sum=1542.22 (3)", - "tab": "General information", - "score": 514.0733944954128 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - 
"metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": 
"Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.11, - "details": { - "description": "min=0.048, mean=0.11, max=0.147, sum=0.661 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1564.648, mean=1578.648, max=1593.648, sum=9471.888 (6)", - "tab": "General information", - "score": 1578.648068669528 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=59.824, mean=80.866, max=92.721, sum=485.197 (6)", - "tab": "General information", - "score": 80.86623748211731 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.607, mean=0.629, max=0.667, sum=3.775 (6)", - "tab": "Bias", - "score": 0.629159058053613 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.388, mean=0.408, max=0.443, sum=2.45 (6)", - "tab": "Bias", - "score": 0.40834546858679427 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.211, mean=0.287, max=0.333, sum=1.725 (6)", - "tab": "Bias", - "score": 0.2874529064836184 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.138, mean=0.164, max=0.192, sum=0.984 (6)", - "tab": "Bias", - "score": 0.16396073067980207 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.076, mean=0.32, max=0.527, sum=0.959 (3)", - "tab": "Summarization metrics", - "score": 0.3197354449182434 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.045, mean=0.188, max=0.278, sum=0.563 (3)", - "tab": "Summarization metrics", - "score": 0.18776450739321585 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.543, mean=0.834, max=0.982, sum=5.004 (6)", - "tab": "Summarization metrics", - "score": 0.8340516341645151 - }, - "CNN/DailyMail - Density": { - "description": "min=15.163, mean=35.663, max=51.192, 
sum=213.977 (6)", - "tab": "Summarization metrics", - "score": 35.66281771790173 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.191, mean=9.346, max=11.345, sum=56.078 (6)", - "tab": "Summarization metrics", - "score": 9.346357628862261 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105, - "details": { - "description": "min=0.101, mean=0.105, max=0.107, sum=0.628 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1472.903, mean=1532.912, max=1566.407, sum=9197.471 (6)", - "tab": "General information", - "score": 1532.9118404118406 - }, - "XSUM - # output tokens": { - "description": "min=25.481, mean=26.021, max=26.315, sum=156.127 (6)", - "tab": "General information", - "score": 26.02123552123552 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.429, mean=0.442, max=0.453, sum=2.655 (6)", - "tab": "Bias", - "score": 0.4424845269672855 - }, - "XSUM - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Representation (gender)": { - "description": "min=0.153, mean=0.165, max=0.183, sum=0.99 (6)", - "tab": "Bias", - "score": 0.16492426719539477 - }, - "XSUM - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.217, mean=-0.213, max=-0.206, sum=-0.639 (3)", - "tab": "Summarization metrics", - "score": -0.2129847266550281 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.391, mean=0.394, max=0.396, 
sum=1.183 (3)", - "tab": "Summarization metrics", - "score": 0.3944890669761573 - }, - "XSUM - Coverage": { - "description": "min=0.828, mean=0.834, max=0.838, sum=5.002 (6)", - "tab": "Summarization metrics", - "score": 0.8336902125268334 - }, - "XSUM - Density": { - "description": "min=4.128, mean=4.393, max=4.529, sum=26.358 (6)", - "tab": "Summarization metrics", - "score": 4.392991783737345 - }, - "XSUM - Compression": { - "description": "min=17.248, mean=17.535, max=17.956, sum=105.21 (6)", - "tab": "Summarization metrics", - "score": 17.535051923934834 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.939, - "details": { - "description": "min=0.931, mean=0.939, max=0.949, sum=2.818 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.187, mean=0.232, max=0.257, sum=0.695 (3)", - "tab": "Calibration", - "score": 0.23165086222498446 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.864, mean=0.887, max=0.918, sum=2.662 (3)", - "tab": "Robustness", - "score": 0.8873333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.902, mean=0.912, max=0.926, sum=2.737 (3)", - "tab": "Fairness", - "score": 0.9123333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.908, mean=4.236, max=4.985, sum=12.708 (3)", - "tab": "General information", - "score": 4.236000000000001 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1283.569, mean=1560.056, max=1777.712, sum=4680.167 (3)", - "tab": "General information", - "score": 1560.0556666666664 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - 
"description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.544, - "details": { - "description": "min=0.003, mean=0.544, max=1, sum=29.372 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.071, mean=0.28, max=0.632, sum=15.102 (54)", - "tab": "Calibration", - "score": 0.2796625331945748 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.416, max=0.99, sum=22.479 (54)", - "tab": "Robustness", - "score": 0.416268791059841 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.397, max=1, sum=21.425 (54)", - "tab": "Fairness", - "score": 0.3967651888403395 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.037, mean=724.782, max=1272.822, sum=39138.207 (54)", - "tab": "General information", - "score": 724.7816027688522 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473, - "details": { - "description": "min=0.025, mean=0.473, max=0.975, sum=15.625 
(33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.115, mean=0.29, max=0.826, sum=9.575 (33)", - "tab": "Calibration", - "score": 0.29014727083072167 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.402, max=0.975, sum=13.25 (33)", - "tab": "Robustness", - "score": 0.4015151515151515 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.445, max=0.975, sum=14.7 (33)", - "tab": "Fairness", - "score": 0.4454545454545455 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.56, max=5, sum=150.475 (33)", - "tab": "General information", - "score": 4.5598484848484855 - }, - "RAFT - truncated": { - "description": "min=0, mean=0.002, max=0.025, sum=0.075 (33)", - "tab": "General information", - "score": 0.002272727272727273 - }, - "RAFT - # prompt tokens": { - "description": "min=262.3, mean=810.769, max=1759.65, sum=26755.375 (33)", - "tab": "General information", - "score": 810.7689393939394 - }, - "RAFT - # output tokens": { - "description": "min=0.75, mean=2.916, max=6.5, sum=96.225 (33)", - "tab": "General information", - "score": 2.91590909090909 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json b/data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json deleted file mode 100644 index d6f8fa8ea..000000000 --- a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Extended-30B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminous Extended 30B", - "id": "aleph-alpha/Luminous-Extended-30B", - "developer": "aleph-alpha", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.485, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5765957446808511 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.42993006993006994 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.45142191142191146 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.629471974916769 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.7191265524598858 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5657894736842105 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321, - "details": { - "description": "min=0.23, mean=0.321, max=0.49, sum=4.811 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.075, mean=0.135, max=0.225, sum=2.023 (15)", - "tab": "Calibration", - "score": 0.1348564339845485 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.1, mean=0.23, max=0.37, sum=3.451 (15)", - "tab": "Robustness", - "score": 0.23008187134502922 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.14, mean=0.237, max=0.35, sum=3.549 (15)", - "tab": "Fairness", - "score": 0.23658479532163745 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=360.75, mean=471.075, max=618.447, sum=7066.132 (15)", - "tab": "General information", - "score": 471.0754736842105 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.767, - 
"details": { - "description": "min=0.752, mean=0.767, max=0.794, sum=2.3 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.11, mean=0.129, max=0.154, sum=0.387 (3)", - "tab": "Calibration", - "score": 0.1289354797828563 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.637, mean=0.659, max=0.7, sum=1.976 (3)", - "tab": "Robustness", - "score": 0.6586666666666666 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.692, mean=0.711, max=0.733, sum=2.133 (3)", - "tab": "Fairness", - "score": 0.711 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=651.658, mean=908.991, max=1252.658, sum=2726.974 (3)", - "tab": "General information", - "score": 908.9913333333333 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.665, - "details": { - "description": "min=0.637, mean=0.665, max=0.684, sum=1.994 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.043, mean=0.046, max=0.047, sum=0.138 (3)", - "tab": "Calibration", - "score": 0.046063826868188405 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.481, mean=0.513, max=0.539, sum=1.54 (3)", - "tab": "Robustness", - "score": 0.513450295883327 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.503, mean=0.532, max=0.565, sum=1.597 (3)", - "tab": "Fairness", - "score": 0.5321907426131639 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.039, mean=1.621, max=2.037, sum=4.862 
(3)", - "tab": "General information", - "score": 1.6206572769953052 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1606.952, mean=1647.783, max=1694.642, sum=4943.349 (3)", - "tab": "General information", - "score": 1647.783098591549 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.321, mean=7.042, max=8.175, sum=21.127 (3)", - "tab": "General information", - "score": 7.04225352112676 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.4, mean=0.416, max=0.44, sum=1.248 (3)", - "tab": "Bias", - "score": 0.4159611992945326 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.556, max=0.667, sum=1.667 (3)", - "tab": "Bias", - "score": 0.5555555555555557 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.186, mean=0.199, max=0.207, sum=0.598 (3)", - "tab": "Bias", - "score": 0.19931611685099856 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.017, max=0.02, sum=0.051 (3)", - "tab": "Toxicity", - "score": 0.016901408450704227 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609, - "details": { - "description": "min=0.606, mean=0.609, max=0.611, sum=1.827 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.018, mean=0.022, max=0.024, sum=0.065 (3)", - "tab": "Calibration", - "score": 0.02157162838647707 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.08, mean=0.09, max=0.095, sum=0.269 (3)", - "tab": "Calibration", - "score": 0.08979897901208977 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.205, mean=0.212, max=0.218, sum=0.635 (3)", - "tab": "Robustness", - "score": 0.211552896733343 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.515, mean=0.524, max=0.537, sum=1.572 (3)", - "tab": "Robustness", - "score": 0.5239378524073847 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.205, mean=0.214, max=0.22, sum=0.642 (3)", - "tab": "Fairness", - "score": 0.21385439000180537 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.548, mean=0.551, max=0.554, sum=1.654 (3)", - "tab": "Fairness", - "score": 0.5512241821510145 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - 
"NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.087, mean=111.754, max=116.087, sum=335.261 (3)", - "tab": "General information", - "score": 111.75366666666667 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.508, mean=6.119, max=6.869, sum=18.356 (3)", - "tab": "General information", - "score": 6.118666666666666 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.691, mean=4.711, max=4.726, sum=14.134 (3)", - "tab": "General information", - "score": 4.711333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.039, max=0.04, sum=0.116 (3)", - "tab": "General information", - "score": 0.03866666666666666 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1224.733, mean=1384.565, max=1488.14, sum=4153.695 (3)", - "tab": "General information", - "score": 1384.5649999999998 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.216, mean=10.3, max=11.913, sum=30.9 (3)", - "tab": "General information", - "score": 10.299999999999999 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.379, mean=0.46, max=0.5, sum=1.379 (3)", - "tab": "Bias", - "score": 0.4597701149425288 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.414, mean=0.435, max=0.447, sum=1.304 (3)", - "tab": "Bias", - "score": 0.43455385345385017 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.15, mean=0.223, max=0.269, sum=0.669 (3)", - "tab": "Bias", - "score": 0.2230769230769231 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.4, mean=0.411, max=0.433, sum=1.233 (3)", - "tab": "Bias", - "score": 0.41111111111111115 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.421, mean=0.441, max=0.477, sum=1.324 (3)", - "tab": "Bias", - "score": 0.44143286168772855 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.022, mean=0.045, max=0.082, sum=0.135 (3)", - "tab": "Bias", - "score": 0.04515740195666192 - }, - "NaturalQuestions 
(closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349, - "details": { - "description": "min=0.34, mean=0.349, max=0.363, sum=1.047 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.081, mean=0.096, max=0.116, sum=0.287 (3)", - "tab": "Calibration", - "score": 0.09561324552236967 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.188, mean=0.193, max=0.201, sum=0.578 (3)", - "tab": "Robustness", - "score": 0.1926796273359054 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.268, mean=0.277, max=0.295, sum=0.832 (3)", - "tab": "Fairness", - "score": 0.2774375608495023 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.84, mean=0.909, max=0.991, sum=2.727 (3)", - "tab": "General information", - "score": 0.9089999999999999 - }, - "QuAC - truncated": { - "description": "min=0.029, mean=0.033, max=0.037, sum=0.098 (3)", - "tab": "General information", - "score": 0.03266666666666667 - }, - "QuAC - # prompt tokens": { - "description": "min=1596.904, mean=1641.256, max=1672.92, sum=4923.768 (3)", - "tab": "General information", - "score": 1641.256 - }, - "QuAC - # output tokens": { - "description": "min=20.299, mean=21.144, max=22.408, sum=63.432 (3)", - "tab": "General information", - "score": 21.144000000000002 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.59, mean=0.612, max=0.636, sum=1.837 (3)", - "tab": "Bias", - "score": 0.6124061124061125 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.382, mean=0.403, max=0.421, sum=1.208 (3)", - "tab": "Bias", - "score": 0.40276421801932005 - }, - "QuAC - Representation (race)": { - "description": "min=0.202, mean=0.24, max=0.259, sum=0.719 (3)", - "tab": "Bias", - "score": 0.23980711859954595 - }, - "QuAC - Representation (gender)": { - "description": "min=0.194, mean=0.2, max=0.205, sum=0.601 (3)", - "tab": "Bias", - "score": 0.20029662396768255 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] 
- }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.221, - "details": { - "description": "min=0.208, mean=0.221, max=0.231, sum=0.662 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.057, mean=0.064, max=0.068, sum=0.192 (3)", - "tab": "Calibration", - "score": 0.0641638452052097 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.139, mean=0.151, max=0.161, sum=0.454 (3)", - "tab": "Robustness", - "score": 0.15137614678899083 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.144, mean=0.16, max=0.171, sum=0.479 (3)", - "tab": "Fairness", - "score": 0.15953109072375127 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=504.073, mean=514.073, max=533.073, sum=1542.22 (3)", - "tab": "General information", - "score": 514.0733944954128 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139, - "details": { - "description": "min=0.117, mean=0.139, max=0.15, sum=0.834 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1564.648, mean=1578.648, max=1593.648, sum=9471.888 (6)", - "tab": "General information", - "score": 1578.648068669528 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=73.322, mean=83.112, max=88.178, sum=498.674 (6)", - "tab": "General information", - "score": 83.11230329041489 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.58, mean=0.608, max=0.637, sum=3.651 (6)", - "tab": "Bias", - "score": 0.6084787955510622 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.382, mean=0.391, max=0.398, sum=2.347 (6)", - "tab": "Bias", - "score": 0.3911797965697547 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.254, mean=0.274, max=0.288, sum=1.642 (6)", - "tab": "Bias", - "score": 0.27361254875467617 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.128, mean=0.151, max=0.191, sum=0.909 (6)", - "tab": "Bias", - "score": 0.15142644383010628 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.309, mean=0.481, max=0.569, sum=1.443 (3)", - "tab": "Summarization metrics", - "score": 0.4809362133230566 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.202, mean=0.255, max=0.288, sum=0.766 (3)", - "tab": "Summarization metrics", - "score": 0.25521962437955664 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.8, mean=0.925, max=0.989, sum=5.552 (6)", - "tab": "Summarization metrics", - "score": 0.9253891304300669 - }, - "CNN/DailyMail - Density": { - "description": "min=34.945, mean=41.619, max=45.552, sum=249.715 (6)", - "tab": "Summarization metrics", - "score": 41.61911540769457 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.478, mean=9.039, max=9.909, sum=54.236 (6)", - "tab": "Summarization metrics", - "score": 9.039273431117751 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.124, - "details": { - "description": "min=0.122, mean=0.124, max=0.126, sum=0.742 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no 
matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1472.903, mean=1532.912, max=1566.407, sum=9197.471 (6)", - "tab": "General information", - "score": 1532.9118404118406 - }, - "XSUM - # output tokens": { - "description": "min=25.747, mean=25.987, max=26.212, sum=155.923 (6)", - "tab": "General information", - "score": 25.987129987129986 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.449, mean=0.45, max=0.451, sum=2.701 (6)", - "tab": "Bias", - "score": 0.450224364113253 - }, - "XSUM - Representation (race)": { - "description": "min=0.532, mean=0.547, max=0.565, sum=3.282 (6)", - "tab": "Bias", - "score": 0.5469576096753798 - }, - "XSUM - Representation (gender)": { - "description": "min=0.212, mean=0.214, max=0.217, sum=1.283 (6)", - "tab": "Bias", - "score": 0.2138886962661304 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.233, mean=-0.225, max=-0.212, sum=-0.675 (3)", - "tab": "Summarization metrics", - "score": -0.22500232932190178 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.419, mean=0.423, max=0.427, sum=1.269 (3)", - "tab": "Summarization metrics", - "score": 0.4230439766625391 - }, - "XSUM - Coverage": { - "description": "min=0.817, mean=0.818, max=0.819, sum=4.91 (6)", - "tab": "Summarization metrics", - "score": 0.8184154242425056 - }, - "XSUM - Density": { - "description": "min=3.392, mean=3.507, max=3.668, sum=21.042 (6)", - "tab": "Summarization metrics", - "score": 3.507010978728374 - }, - "XSUM - Compression": { - "description": "min=17.136, mean=17.376, max=17.524, sum=104.258 (6)", - "tab": "Summarization metrics", - "score": 17.376290660463752 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.947, - "details": { - "description": "min=0.944, mean=0.947, max=0.951, sum=2.842 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.177, mean=0.204, max=0.232, sum=0.612 (3)", - "tab": "Calibration", - "score": 0.2038815444945483 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.913, mean=0.92, max=0.933, sum=2.76 (3)", - "tab": "Robustness", - "score": 0.9199999999999999 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.93, mean=0.937, max=0.946, sum=2.811 (3)", - "tab": "Fairness", - "score": 0.9369999999999999 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.908, mean=4.236, max=4.985, sum=12.708 (3)", - "tab": "General information", - "score": 4.236000000000001 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1283.569, mean=1560.056, max=1777.712, sum=4680.167 (3)", - "tab": "General information", - "score": 1560.0556666666664 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "description": "min=0.014, mean=0.524, max=0.997, sum=28.276 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.112, mean=0.359, max=0.619, sum=19.409 (54)", - "tab": "Calibration", - "score": 0.35941964376806523 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.011, mean=0.368, max=0.874, sum=19.881 (54)", - "tab": "Robustness", - "score": 0.36816849425853654 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.462, max=0.985, sum=24.963 (54)", - "tab": "Fairness", - "score": 0.4622866273105216 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching 
runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.037, mean=724.782, max=1272.822, sum=39138.207 (54)", - "tab": "General information", - "score": 724.7816027688522 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523, - "details": { - "description": "min=0, mean=0.523, max=0.925, sum=17.25 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.143, mean=0.29, max=0.954, sum=9.577 (33)", - "tab": "Calibration", - "score": 0.2902057183123561 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.436, max=0.825, sum=14.4 (33)", - "tab": "Robustness", - "score": 0.43636363636363645 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.489, max=0.925, sum=16.15 (33)", - "tab": "Fairness", - "score": 0.4893939393939393 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.56, max=5, sum=150.475 (33)", - "tab": "General information", - "score": 4.5598484848484855 - }, - "RAFT - truncated": { - "description": "min=0, mean=0.002, max=0.025, sum=0.075 (33)", - "tab": "General information", - "score": 0.002272727272727273 - }, - "RAFT - # prompt tokens": { - "description": "min=262.3, mean=810.769, max=1759.65, sum=26755.375 (33)", - "tab": "General information", - "score": 810.7689393939394 - }, - "RAFT - # output tokens": { - "description": "min=0.125, mean=2.796, max=6.825, sum=92.275 (33)", - "tab": "General information", - "score": 2.796212121212121 - }, - "RAFT - # 
trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json b/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json deleted file mode 100644 index 5680298fb..000000000 --- a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Supreme-70B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminous Supreme 70B", - "id": "aleph-alpha/Luminous-Supreme-70B", - "developer": "aleph-alpha", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6242368177613321 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5464102564102564 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.5218648018648019 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5709490829944818 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5562049062049063 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.7171052631578947 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.22, mean=0.38, max=0.61, sum=5.702 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.122, mean=0.154, max=0.217, sum=2.31 (15)", - "tab": "Calibration", - "score": 0.15396738685964684 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.08, mean=0.255, max=0.51, sum=3.821 (15)", - "tab": "Robustness", - "score": 0.2547368421052632 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.11, mean=0.264, max=0.51, sum=3.955 (15)", - "tab": "Fairness", - "score": 0.2636608187134503 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=360.75, mean=471.075, max=618.447, sum=7066.132 (15)", - "tab": "General information", - "score": 471.0754736842105 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.748, mean=0.775, max=0.795, sum=2.325 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.06, mean=0.083, max=0.111, sum=0.248 (3)", - "tab": "Calibration", - "score": 0.08277086924611576 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.624, mean=0.665, max=0.693, sum=1.996 (3)", - "tab": "Robustness", - "score": 0.6653333333333333 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.66, mean=0.694, max=0.713, sum=2.081 (3)", - "tab": "Fairness", - "score": 0.6936666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=651.658, mean=908.991, max=1252.658, sum=2726.974 (3)", - "tab": "General information", - "score": 908.9913333333333 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # 
trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.687, mean=0.711, max=0.742, sum=2.133 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.036, mean=0.049, max=0.061, sum=0.147 (3)", - "tab": "Calibration", - "score": 0.04915634481869984 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.557, mean=0.59, max=0.617, sum=1.771 (3)", - "tab": "Robustness", - "score": 0.5902392957151222 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.562, mean=0.603, max=0.637, sum=1.808 (3)", - "tab": "Fairness", - "score": 0.6025352758861713 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.039, mean=1.621, max=2.037, sum=4.862 (3)", - "tab": "General information", - "score": 1.6206572769953052 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1606.952, mean=1647.783, max=1694.642, sum=4943.349 (3)", - "tab": "General information", - "score": 1647.783098591549 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.749, mean=6.84, max=8.158, sum=20.521 (3)", - "tab": "General information", - "score": 6.84037558685446 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.396, mean=0.465, max=0.5, sum=1.396 (3)", - "tab": "Bias", - "score": 0.46527777777777773 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.216, mean=0.238, max=0.256, sum=0.714 (3)", - "tab": "Bias", - "score": 0.23804020866547204 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.016, max=0.02, sum=0.048 (3)", - "tab": "Toxicity", - "score": 
0.01596244131455399 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.644, mean=0.649, max=0.656, sum=1.946 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.035, mean=0.041, max=0.045, sum=0.123 (3)", - "tab": "Calibration", - "score": 0.04112615448004484 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.07, mean=0.074, max=0.077, sum=0.222 (3)", - "tab": "Calibration", - "score": 0.07410001302901324 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.243, mean=0.252, max=0.261, sum=0.757 (3)", - "tab": "Robustness", - "score": 0.25230806968086933 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.576, mean=0.586, max=0.593, sum=1.758 (3)", - "tab": "Robustness", - "score": 0.5861072363623724 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.23, mean=0.241, max=0.25, sum=0.723 (3)", - "tab": "Fairness", - "score": 0.24089192251975544 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.583, mean=0.597, max=0.61, sum=1.79 (3)", - "tab": "Fairness", - "score": 0.5966421355805813 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.087, mean=111.754, max=116.087, sum=335.261 (3)", - "tab": "General information", - "score": 111.75366666666667 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.262, mean=4.508, max=4.666, sum=13.525 (3)", - "tab": "General information", - "score": 4.508333333333334 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.691, mean=4.711, max=4.726, sum=14.134 (3)", - "tab": "General information", - "score": 4.711333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": 
"min=0.038, mean=0.039, max=0.04, sum=0.116 (3)", - "tab": "General information", - "score": 0.03866666666666666 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1224.733, mean=1384.565, max=1488.14, sum=4153.695 (3)", - "tab": "General information", - "score": 1384.5649999999998 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.064, mean=6.362, max=6.864, sum=19.086 (3)", - "tab": "General information", - "score": 6.361999999999999 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.338, mean=0.446, max=0.5, sum=1.338 (3)", - "tab": "Bias", - "score": 0.445882557030098 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.467, mean=0.48, max=0.498, sum=1.441 (3)", - "tab": "Bias", - "score": 0.48022397745392514 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.02, mean=0.125, max=0.265, sum=0.374 (3)", - "tab": "Bias", - "score": 0.12466386554621849 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.4, mean=0.444, max=0.5, sum=1.333 (3)", - "tab": "Bias", - "score": 0.4444444444444445 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.401, mean=0.44, max=0.506, sum=1.319 (3)", - "tab": "Bias", - "score": 0.43982889050590296 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.205, mean=0.22, max=0.25, sum=0.66 (3)", - "tab": "Bias", - "score": 0.2201426024955437 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37, - "details": { - "description": "min=0.364, mean=0.37, max=0.378, sum=1.111 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.054, mean=0.058, max=0.061, sum=0.175 (3)", - "tab": "Calibration", - "score": 0.05820640656843105 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.221, mean=0.233, max=0.24, sum=0.699 (3)", - "tab": "Robustness", - "score": 0.23311906486145426 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.28, mean=0.288, max=0.3, sum=0.865 (3)", - "tab": "Fairness", - "score": 0.28824116919086756 - }, - "QuAC - Denoised inference time (s)": { - 
"description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.84, mean=0.909, max=0.991, sum=2.727 (3)", - "tab": "General information", - "score": 0.9089999999999999 - }, - "QuAC - truncated": { - "description": "min=0.029, mean=0.033, max=0.037, sum=0.098 (3)", - "tab": "General information", - "score": 0.03266666666666667 - }, - "QuAC - # prompt tokens": { - "description": "min=1596.904, mean=1641.256, max=1672.92, sum=4923.768 (3)", - "tab": "General information", - "score": 1641.256 - }, - "QuAC - # output tokens": { - "description": "min=22.638, mean=26.241, max=28.094, sum=78.723 (3)", - "tab": "General information", - "score": 26.241000000000003 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.571, mean=0.598, max=0.615, sum=1.794 (3)", - "tab": "Bias", - "score": 0.5980796023899473 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.408, mean=0.412, max=0.415, sum=1.236 (3)", - "tab": "Bias", - "score": 0.41214192227908586 - }, - "QuAC - Representation (race)": { - "description": "min=0.269, mean=0.305, max=0.351, sum=0.914 (3)", - "tab": "Bias", - "score": 0.3046567170277752 - }, - "QuAC - Representation (gender)": { - "description": "min=0.227, mean=0.232, max=0.235, sum=0.696 (3)", - "tab": "Bias", - "score": 0.23187441800624423 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.003, sum=0.007 (3)", - "tab": "Toxicity", - "score": 0.0023333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.222, - "details": { - "description": "min=0.2, mean=0.222, max=0.258, sum=0.667 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.072, mean=0.092, max=0.102, sum=0.276 (3)", - "tab": "Calibration", - "score": 0.09195091586715554 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.092, mean=0.106, max=0.121, sum=0.318 (3)", - "tab": "Robustness", - "score": 0.10601427115188583 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.128, mean=0.132, max=0.138, sum=0.396 (3)", - "tab": "Fairness", - "score": 0.13200815494393475 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - 
"description": "min=504.073, mean=514.073, max=533.073, sum=1542.22 (3)", - "tab": "General information", - "score": 514.0733944954128 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": 
"Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.15, - "details": { - "description": "min=0.133, mean=0.15, max=0.16, sum=0.899 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1564.648, mean=1578.648, max=1593.648, sum=9471.888 (6)", - "tab": "General information", - "score": 1578.648068669528 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=71.758, mean=75.51, max=79.294, sum=453.06 (6)", - "tab": "General information", - "score": 75.51001430615165 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.621, mean=0.63, max=0.646, sum=3.782 (6)", - "tab": "Bias", - "score": 0.6303974395279242 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.39, mean=0.401, max=0.412, sum=2.406 (6)", - "tab": "Bias", - "score": 0.4010246477666291 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.281, mean=0.291, max=0.297, sum=1.746 (6)", - "tab": "Bias", - "score": 0.2910346586068148 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.114, mean=0.13, max=0.148, sum=0.782 (6)", - "tab": "Bias", - "score": 0.1303630037220396 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": 
"min=0.423, mean=0.552, max=0.624, sum=1.656 (3)", - "tab": "Summarization metrics", - "score": 0.5518853318256234 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.236, mean=0.28, max=0.304, sum=0.841 (3)", - "tab": "Summarization metrics", - "score": 0.28049037475726807 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.846, mean=0.939, max=0.988, sum=5.636 (6)", - "tab": "Summarization metrics", - "score": 0.9393220183960566 - }, - "CNN/DailyMail - Density": { - "description": "min=31.874, mean=33.625, max=34.739, sum=201.751 (6)", - "tab": "Summarization metrics", - "score": 33.625141882714196 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.884, mean=9.298, max=9.552, sum=55.787 (6)", - "tab": "Summarization metrics", - "score": 9.29781469578472 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136, - "details": { - "description": "min=0.133, mean=0.136, max=0.14, sum=0.813 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1472.903, mean=1532.912, max=1566.407, sum=9197.471 (6)", - "tab": "General information", - "score": 1532.9118404118406 - }, - "XSUM - # output tokens": { - "description": "min=25.844, mean=26.423, max=26.988, sum=158.537 (6)", - "tab": "General information", - "score": 26.422779922779924 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.42, mean=0.439, max=0.456, sum=2.635 (6)", - "tab": "Bias", - "score": 0.4390946502057613 - }, - "XSUM - Representation (race)": { - "description": "min=0.532, mean=0.544, max=0.556, sum=3.264 (6)", - "tab": "Bias", - "score": 0.5439341780805197 - }, - 
"XSUM - Representation (gender)": { - "description": "min=0.201, mean=0.206, max=0.21, sum=1.238 (6)", - "tab": "Bias", - "score": 0.2063342186388344 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.008 (6)", - "tab": "Toxicity", - "score": 0.001287001287001287 - }, - "XSUM - SummaC": { - "description": "min=-0.251, mean=-0.241, max=-0.231, sum=-0.723 (3)", - "tab": "Summarization metrics", - "score": -0.2409771191414105 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.442, mean=0.444, max=0.446, sum=1.331 (3)", - "tab": "Summarization metrics", - "score": 0.44350630738930513 - }, - "XSUM - Coverage": { - "description": "min=0.799, mean=0.807, max=0.816, sum=4.841 (6)", - "tab": "Summarization metrics", - "score": 0.8068883614050096 - }, - "XSUM - Density": { - "description": "min=2.852, mean=3.08, max=3.225, sum=18.481 (6)", - "tab": "Summarization metrics", - "score": 3.080091964253596 - }, - "XSUM - Compression": { - "description": "min=16.326, mean=16.97, max=17.573, sum=101.823 (6)", - "tab": "Summarization metrics", - "score": 16.97049624677277 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.959, - "details": { - "description": "min=0.957, mean=0.959, max=0.961, sum=2.878 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.137, mean=0.173, max=0.222, sum=0.519 (3)", - "tab": "Calibration", - "score": 0.1730084935772459 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.931, mean=0.932, max=0.934, sum=2.797 (3)", - "tab": "Robustness", - "score": 0.9323333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.948, mean=0.949, max=0.951, sum=2.848 (3)", - "tab": "Fairness", - "score": 0.9493333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.908, mean=4.236, max=4.985, sum=12.708 (3)", - "tab": "General information", - "score": 4.236000000000001 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1283.569, mean=1560.056, max=1777.712, sum=4680.167 (3)", - "tab": "General information", - "score": 1560.0556666666664 - }, - "IMDB - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.049, mean=0.562, max=0.984, sum=30.331 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.051, mean=0.272, max=0.563, sum=14.71 (54)", - "tab": "Calibration", - "score": 0.27240452987490027 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.035, mean=0.263, max=0.67, sum=14.178 (54)", - "tab": "Robustness", - "score": 0.26255411827214337 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.014, mean=0.432, max=0.912, sum=23.313 (54)", - "tab": "Fairness", - "score": 0.4317285215923749 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.037, mean=724.782, max=1272.822, sum=39138.207 (54)", - "tab": "General information", - "score": 724.7816027688522 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments 
- Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653, - "details": { - "description": "min=0, mean=0.653, max=0.975, sum=21.55 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.072, mean=0.238, max=1, sum=7.863 (33)", - "tab": "Calibration", - "score": 0.238277000839632 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.564, max=0.975, sum=18.6 (33)", - "tab": "Robustness", - "score": 0.5636363636363637 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.601, max=0.975, sum=19.825 (33)", - "tab": "Fairness", - "score": 0.6007575757575758 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.56, max=5, sum=150.475 (33)", - "tab": "General information", - "score": 4.5598484848484855 - }, - "RAFT - truncated": { - "description": "min=0, mean=0.002, max=0.025, sum=0.075 (33)", - "tab": "General information", - "score": 0.002272727272727273 - }, - "RAFT - # prompt tokens": { - "description": "min=262.3, mean=810.769, max=1759.65, sum=26755.375 (33)", - "tab": "General information", - "score": 810.7689393939394 - }, - "RAFT - # output tokens": { - "description": "min=0, mean=3.097, max=6.725, sum=102.2 (33)", - "tab": "General information", - "score": 3.0969696969696976 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json b/data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json deleted file mode 100644 index caffd542e..000000000 --- a/data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/bigscience_BLOOM-176B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"BLOOM 176B", - "id": "bigscience/BLOOM-176B", - "developer": "bigscience", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.3480016788296159 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5409357605686861 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.5507003378527294 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.26823464912280703 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5459762982621468 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5959534292867626 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.29074770258980787 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299, - "details": { - "description": "min=0.19, mean=0.299, max=0.42, sum=4.481 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.115, mean=0.137, max=0.173, sum=2.054 (15)", - "tab": "Calibration", - "score": 0.13690038983912287 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.167, mean=0.25, max=0.38, sum=3.754 (15)", - "tab": "Robustness", - "score": 0.25025730994152046 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.175, mean=0.274, max=0.38, sum=4.104 (15)", - "tab": "Fairness", - "score": 0.27360233918128657 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.135, mean=0.233, max=0.418, sum=3.493 (15)", - "tab": "Efficiency", - "score": 0.23288457024982262 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=333.02, mean=436.99, max=574.658, sum=6554.844 (15)", - "tab": "General information", - "score": 436.9895789473684 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - 
"score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.659, mean=0.704, max=0.728, sum=2.112 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.153, mean=0.209, max=0.247, sum=0.626 (3)", - "tab": "Calibration", - "score": 0.2086643852555177 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.595, mean=0.642, max=0.674, sum=1.926 (3)", - "tab": "Robustness", - "score": 0.642 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.601, mean=0.656, max=0.693, sum=1.968 (3)", - "tab": "Fairness", - "score": 0.656 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.665, mean=0.853, max=1.05, sum=2.558 (3)", - "tab": "Efficiency", - "score": 0.852823399183769 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=636.774, mean=897.107, max=1242.774, sum=2691.322 (3)", - "tab": "General information", - "score": 897.1073333333333 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - "description": "min=0.631, mean=0.662, max=0.695, sum=1.986 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.231, mean=0.237, max=0.242, sum=0.712 (3)", - "tab": "Calibration", - "score": 0.2374266630696186 - }, - "NarrativeQA - F1 (Robustness)": { - 
"description": "min=0.468, mean=0.53, max=0.574, sum=1.591 (3)", - "tab": "Robustness", - "score": 0.5303029858435905 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.535, mean=0.577, max=0.613, sum=1.73 (3)", - "tab": "Fairness", - "score": 0.5767895596204061 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=2.081, mean=2.598, max=3.427, sum=7.794 (3)", - "tab": "Efficiency", - "score": 2.5979962524114084 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.042, mean=1.621, max=2.048, sum=4.862 (3)", - "tab": "General information", - "score": 1.6206572769953052 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1604.899, mean=1649.598, max=1699.146, sum=4948.794 (3)", - "tab": "General information", - "score": 1649.5981220657277 - }, - "NarrativeQA - # output tokens": { - "description": "min=18.468, mean=33.276, max=50.499, sum=99.828 (3)", - "tab": "General information", - "score": 33.27605633802816 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.333, mean=0.355, max=0.389, sum=1.065 (3)", - "tab": "Bias", - "score": 0.354945620223398 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.118, mean=0.165, max=0.241, sum=0.494 (3)", - "tab": "Bias", - "score": 0.16472050143449737 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.012, max=0.014, sum=0.037 (3)", - "tab": "Toxicity", - "score": 0.012206572769953052 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621, - "details": { - "description": "min=0.61, mean=0.621, max=0.628, sum=1.864 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.11, mean=0.116, max=0.118, sum=0.347 (3)", - "tab": "Calibration", - "score": 0.11564225453050514 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.338, mean=0.347, max=0.36, sum=1.041 (3)", - "tab": "Calibration", - "score": 0.3469801265406112 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.18, mean=0.185, max=0.19, sum=0.556 (3)", - "tab": "Robustness", - "score": 0.18537100322417385 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.547, mean=0.558, max=0.569, sum=1.675 (3)", - "tab": "Robustness", - "score": 0.5582069622847597 - }, - 
"NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.183, mean=0.187, max=0.189, sum=0.56 (3)", - "tab": "Fairness", - "score": 0.18669047090402127 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.56, mean=0.575, max=0.585, sum=1.724 (3)", - "tab": "Fairness", - "score": 0.5745618824682682 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.931, mean=1.115, max=1.261, sum=3.346 (3)", - "tab": "Efficiency", - "score": 1.115412127906084 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=2.213, mean=2.547, max=2.912, sum=7.64 (3)", - "tab": "Efficiency", - "score": 2.546660231937965 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=92.12, mean=96.12, max=102.12, sum=288.36 (3)", - "tab": "General information", - "score": 96.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=34.82, mean=48.109, max=57.074, sum=144.327 (3)", - "tab": "General information", - "score": 48.109 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.73, mean=4.743, max=4.751, sum=14.229 (3)", - "tab": "General information", - "score": 4.743000000000001 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.035, mean=0.035, max=0.035, sum=0.105 (3)", - "tab": "General information", - "score": 0.035 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1193.69, mean=1313.422, max=1423.457, sum=3940.267 (3)", - "tab": "General information", - "score": 1313.4223333333334 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=31.304, mean=38.803, max=46.481, sum=116.409 (3)", - "tab": "General information", - "score": 38.803000000000004 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.303, mean=0.418, max=0.519, sum=1.254 (3)", - "tab": "Bias", - "score": 0.4180133480204756 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.056, mean=0.09, max=0.143, sum=0.27 (3)", - "tab": "Bias", - "score": 0.08994708994708996 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - 
"tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.352, mean=0.426, max=0.5, sum=0.852 (2)", - "tab": "Bias", - "score": 0.42619047619047623 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.454, mean=0.499, max=0.546, sum=1.498 (3)", - "tab": "Bias", - "score": 0.499333679443982 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.11, mean=0.135, max=0.177, sum=0.404 (3)", - "tab": "Bias", - "score": 0.13470779383719764 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361, - "details": { - "description": "min=0.342, mean=0.361, max=0.375, sum=1.082 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.103, mean=0.122, max=0.142, sum=0.367 (3)", - "tab": "Calibration", - "score": 0.1222163558834574 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.229, mean=0.234, max=0.24, sum=0.701 (3)", - "tab": "Robustness", - "score": 0.23376457225319638 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.265, mean=0.273, max=0.289, sum=0.82 (3)", - "tab": "Fairness", - "score": 0.27335853114408787 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=5.124, mean=5.306, max=5.436, sum=15.919 (3)", - "tab": "Efficiency", - "score": 5.3062709801205585 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.855, mean=0.944, max=1.07, sum=2.832 (3)", - "tab": "General information", - "score": 0.944 - }, - "QuAC - truncated": { - "description": "min=0.017, mean=0.017, max=0.017, sum=0.051 (3)", - "tab": "General information", - "score": 0.017 - }, - "QuAC - # prompt tokens": { - "description": "min=1614.308, mean=1639.494, max=1673.303, sum=4918.482 (3)", - "tab": "General information", - "score": 1639.494 - }, - "QuAC - # output tokens": { - "description": "min=86.351, mean=90.164, max=93.357, sum=270.491 (3)", - "tab": "General information", - "score": 90.16366666666666 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.604, mean=0.631, max=0.647, sum=1.894 (3)", - "tab": "Bias", - "score": 0.6313294548588666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.388, mean=0.396, max=0.408, sum=1.189 (3)", - "tab": "Bias", - "score": 0.3963840842187811 - }, - "QuAC - Representation (race)": { - "description": "min=0.35, mean=0.365, max=0.381, sum=1.094 (3)", - "tab": "Bias", - "score": 0.3645250034421991 - }, - "QuAC - 
Representation (gender)": { - "description": "min=0.235, mean=0.244, max=0.26, sum=0.732 (3)", - "tab": "Bias", - "score": 0.2440549375970967 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.744, - "details": { - "description": "min=0.744, mean=0.744, max=0.744, sum=0.744 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.293 (1)", - "tab": "Calibration", - "score": 0.2926428762465171 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.699, mean=0.699, max=0.699, sum=0.699 (1)", - "tab": "Robustness", - "score": 0.699 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=0.585 (1)", - "tab": "Fairness", - "score": 0.585 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.075, mean=0.075, max=0.075, sum=0.075 (1)", - "tab": "Efficiency", - "score": 0.07493321968615055 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.875, mean=88.875, max=88.875, sum=88.875 (1)", - "tab": "General information", - "score": 88.875 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534, - "details": { - "description": "min=0.534, mean=0.534, max=0.534, sum=0.534 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.248 (1)", - "tab": "Calibration", - "score": 0.24842661648577113 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)", - "tab": "Robustness", - "score": 0.438 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.482 (1)", - "tab": "Fairness", - "score": 0.482 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.032, mean=0.032, 
max=0.032, sum=0.032 (1)", - "tab": "Efficiency", - "score": 0.03224579076468945 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.444, mean=5.444, max=5.444, sum=5.444 (1)", - "tab": "General information", - "score": 5.444 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.205, - "details": { - "description": "min=0.197, mean=0.205, max=0.211, sum=0.82 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.053, mean=0.096, max=0.128, sum=0.385 (4)", - "tab": "Calibration", - "score": 0.09624512475777981 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.168, mean=0.183, max=0.206, sum=0.734 (4)", - "tab": "Robustness", - "score": 0.1834862385321101 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.164, mean=0.186, max=0.206, sum=0.745 (4)", - "tab": "Fairness", - "score": 0.18616207951070335 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.084, mean=0.143, max=0.226, sum=0.573 (4)", - "tab": "Efficiency", - "score": 0.14325443854568073 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=79.361, mean=370.611, max=481.361, sum=1482.443 (4)", - "tab": "General information", - "score": 370.6108562691131 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386, - "details": { - "description": "min=0.364, mean=0.386, max=0.429, sum=1.158 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.158, mean=0.19, max=0.218, sum=0.57 (3)", - "tab": "Robustness", - "score": 0.18996269841269822 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.304, mean=0.333, max=0.385, sum=0.998 (3)", - "tab": "Robustness", - "score": 0.33254039819149694 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.189, mean=0.211, max=0.231, sum=0.633 (3)", - "tab": "Fairness", - "score": 0.2110978835978834 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.345, mean=0.371, max=0.418, sum=1.114 (3)", - "tab": "Fairness", - "score": 0.37148573288404924 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.246, mean=0.257, max=0.27, sum=0.77 (3)", - "tab": "Efficiency", - "score": 0.25680491607178446 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.227, mean=0.246, max=0.271, sum=0.739 (3)", - "tab": "Efficiency", - "score": 0.24635170979166832 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=484.472, mean=524.472, max=570.472, sum=1573.416 (3)", - "tab": "General information", - "score": 524.472 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=466.814, mean=506.814, max=552.814, sum=1520.442 (3)", - "tab": "General information", - "score": 506.81395348837214 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching 
metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.08, - "details": { - "description": "min=0.052, mean=0.08, max=0.118, sum=0.478 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=5.515, mean=5.584, max=5.648, sum=33.506 (6)", - "tab": "Efficiency", - "score": 5.5842744588340345 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1520.33, mean=1541.33, max=1578.33, sum=9247.983 (6)", - "tab": "General information", - "score": 1541.3304721030042 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=104.867, mean=117.435, max=124.011, sum=704.609 (6)", - "tab": "General information", - "score": 117.4349070100143 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.641, mean=0.658, max=0.667, sum=3.949 (6)", - "tab": "Bias", - "score": 0.6581699346405229 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.372, mean=0.385, max=0.405, sum=2.311 (6)", - "tab": "Bias", - "score": 0.3851952735514946 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.291, mean=0.314, max=0.352, sum=1.882 (6)", - "tab": "Bias", - "score": 0.31373280163525924 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.119, mean=0.145, max=0.16, sum=0.872 (6)", - "tab": "Bias", - "score": 0.14536660393941517 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.129, mean=-0.02, 
max=0.115, sum=-0.059 (3)", - "tab": "Summarization metrics", - "score": -0.01977462275373982 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.63, mean=4.665, max=4.719, sum=27.988 (6)", - "tab": "Summarization metrics", - "score": 4.66471171081461 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.005, mean=0.08, max=0.184, sum=0.24 (3)", - "tab": "Summarization metrics", - "score": 0.08008308750782954 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.618, mean=0.71, max=0.826, sum=4.26 (6)", - "tab": "Summarization metrics", - "score": 0.7099913231813372 - }, - "CNN/DailyMail - Density": { - "description": "min=20.964, mean=32.013, max=45.756, sum=192.081 (6)", - "tab": "Summarization metrics", - "score": 32.0134921906249 - }, - "CNN/DailyMail - Compression": { - "description": "min=4.623, mean=5.252, max=6.434, sum=31.514 (6)", - "tab": "Summarization metrics", - "score": 5.2523388558949184 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.03, - "details": { - "description": "min=0.022, mean=0.03, max=0.038, sum=0.179 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=3.874, mean=3.9, max=3.923, sum=23.4 (6)", - "tab": "Efficiency", - "score": 3.899962288877679 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.338, mean=1501.338, max=1528.338, sum=9008.027 (6)", - "tab": "General information", - "score": 1501.3378378378377 - }, - "XSUM - # output tokens": { - "description": "min=50.606, mean=54.066, max=57.05, sum=324.394 (6)", - "tab": "General information", - "score": 54.06563706563707 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.45, mean=0.467, max=0.5, sum=2.802 (6)", - "tab": "Bias", - "score": 0.46699346405228753 - }, - "XSUM - Representation (race)": { - "description": "min=0.238, mean=0.309, max=0.356, sum=1.856 (6)", - "tab": "Bias", - "score": 0.3092501368363437 - }, - "XSUM - Representation (gender)": { - "description": "min=0.109, 
mean=0.172, max=0.212, sum=1.032 (6)", - "tab": "Bias", - "score": 0.17201180425265794 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.365, mean=-0.35, max=-0.335, sum=-1.049 (3)", - "tab": "Summarization metrics", - "score": -0.3496571157539257 - }, - "XSUM - QAFactEval": { - "description": "min=4.196, mean=4.778, max=5.107, sum=28.667 (6)", - "tab": "Summarization metrics", - "score": 4.77785601273731 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.025, mean=0.059, max=0.095, sum=0.177 (3)", - "tab": "Summarization metrics", - "score": 0.05904374779925766 - }, - "XSUM - Coverage": { - "description": "min=0.48, mean=0.515, max=0.553, sum=3.091 (6)", - "tab": "Summarization metrics", - "score": 0.5151319646119767 - }, - "XSUM - Density": { - "description": "min=1.41, mean=1.764, max=2.014, sum=10.585 (6)", - "tab": "Summarization metrics", - "score": 1.764128575895107 - }, - "XSUM - Compression": { - "description": "min=7.741, mean=8.934, max=10.222, sum=53.603 (6)", - "tab": "Summarization metrics", - "score": 8.933804533381347 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "details": { - "description": "min=0.936, mean=0.945, max=0.95, sum=2.836 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.305, mean=0.343, max=0.41, sum=1.029 (3)", - "tab": "Calibration", - "score": 0.3430318396761201 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.907, mean=0.92, max=0.927, sum=2.761 (3)", - "tab": "Robustness", - "score": 0.9203333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.927, mean=0.938, max=0.946, sum=2.814 (3)", - "tab": "Fairness", - "score": 0.9380000000000001 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=3.425, mean=3.536, max=3.659, sum=10.608 (3)", - "tab": "Efficiency", - "score": 3.5360445948161456 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.876, mean=4.943, max=4.987, sum=14.83 (3)", - "tab": "General information", - "score": 4.943333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1129.265, mean=1375.21, max=1727.698, sum=4125.631 (3)", - "tab": "General information", - "score": 1375.2103333333334 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 
(3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62, - "details": { - "description": "min=0.293, mean=0.62, max=0.92, sum=33.467 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.069, mean=0.262, max=0.456, sum=14.142 (54)", - "tab": "Calibration", - "score": 0.26189371110201226 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.088, mean=0.467, max=0.827, sum=25.192 (54)", - "tab": "Robustness", - "score": 0.46652660062188434 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.252, mean=0.546, max=0.91, sum=29.488 (54)", - "tab": "Fairness", - "score": 0.5460670492526992 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.316, mean=0.533, max=1.372, sum=28.76 (54)", - "tab": "Efficiency", - "score": 0.5325854907984409 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=327.671, mean=683.498, max=1208.636, sum=36908.883 (54)", - "tab": "General information", - "score": 683.497824649871 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": 
"min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.592, - "details": { - "description": "min=0.25, mean=0.592, max=0.975, sum=19.525 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.246, mean=0.44, max=0.775, sum=14.508 (33)", - "tab": "Calibration", - "score": 0.4396262000869267 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.175, mean=0.527, max=0.95, sum=17.375 (33)", - "tab": "Robustness", - "score": 0.5265151515151515 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.2, mean=0.563, max=0.975, sum=18.575 (33)", - "tab": "Fairness", - "score": 0.5628787878787879 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.258, mean=1.866, max=3.777, sum=61.574 (33)", - "tab": "Efficiency", - "score": 1.86588385979184 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.05, mean=4.567, max=5, sum=150.725 (33)", - "tab": "General information", - "score": 4.567424242424242 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=234.025, mean=779.203, max=1729.4, sum=25713.7 (33)", - "tab": "General information", - "score": 779.2030303030305 - }, - "RAFT - # output tokens": { - "description": "min=5, mean=7.127, max=13.7, sum=235.2 (33)", - "tab": "General information", - "score": 7.127272727272727 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json b/data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json deleted file mode 100644 index 400f064d5..000000000 --- a/data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/bigscience_T0pp-11B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T0pp 11B", - "id": 
"bigscience/T0pp-11B", - "developer": "bigscience", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.197, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.7577474560592045 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.2275932400932401 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.20273892773892774 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.42000000000000004 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.6045183982683983 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.3965229215229215 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407, - "details": { - "description": "min=0.25, mean=0.407, max=0.67, sum=6.098 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.074, mean=0.168, max=0.3, sum=2.515 (15)", - "tab": "Calibration", - "score": 0.16765379656947835 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.25, mean=0.378, max=0.62, sum=5.675 (15)", - "tab": "Robustness", - "score": 0.37832748538011696 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.25, mean=0.382, max=0.63, sum=5.731 (15)", - "tab": "Fairness", - "score": 0.3820701754385965 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.141, mean=0.145, max=0.149, sum=2.18 (15)", - "tab": "Efficiency", - "score": 0.1453571324242486 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=386.05, mean=492.01, max=639.561, sum=7380.154 (15)", - "tab": "General information", - "score": 492.0102807017544 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - 
"description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0, - "details": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.208, mean=0.322, max=0.435, sum=0.967 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Calibration", - "score": 0.32218942300251074 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Robustness", - "score": 0.0 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Fairness", - "score": 0.0 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.366, mean=0.374, max=0.385, sum=1.121 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Efficiency", - "score": 0.3736038734018803 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=2.027, mean=3.972, max=4.988, sum=11.915 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 3.971666666666667 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=479.758, mean=702.438, max=905.932, sum=2107.314 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 702.4380000000001 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. 
data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "min=0, mean=0.25, max=0.5, sum=0.5 (2)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.25 - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.151, - "details": { - "description": "min=0.139, mean=0.151, max=0.158, sum=0.454 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.0, mean=0.0, max=0.0, sum=0.0 (3)", - "tab": "Calibration", - "score": 0.000042543589701120735 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.087, mean=0.099, max=0.105, sum=0.296 (3)", - "tab": "Robustness", - "score": 0.09874765137769782 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.074, mean=0.086, max=0.093, sum=0.258 (3)", - "tab": "Fairness", - "score": 0.0858526263629113 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.848, mean=0.945, max=1.053, sum=2.834 (3)", - "tab": "Efficiency", - "score": 0.9445703822729286 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0, mean=0.187, max=0.33, sum=0.561 (3)", - "tab": "General information", - "score": 0.18685446009389672 - }, - "NarrativeQA - truncated": { - "description": "min=0.369, mean=0.372, max=0.377, sum=1.115 (3)", - "tab": 
"General information", - "score": 0.37183098591549296 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=807.577, mean=877.742, max=916.668, sum=2633.225 (3)", - "tab": "General information", - "score": 877.7417840375587 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=300 (3)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.332, mean=0.339, max=0.343, sum=1.017 (3)", - "tab": "Bias", - "score": 0.3389834657156105 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.093, mean=0.105, max=0.113, sum=0.314 (3)", - "tab": "Bias", - "score": 0.1046501526237907 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.023, mean=0.023, max=0.025, sum=0.07 (3)", - "tab": "Toxicity", - "score": 0.02347417840375587 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.19, - "details": { - "description": "min=0.171, mean=0.19, max=0.203, sum=0.569 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.0, mean=0.0, max=0.0, sum=0.0 (3)", - "tab": "Calibration", - "score": 3.521055021161368e-9 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.0, mean=0.0, max=0.0, sum=0.0 (3)", - "tab": "Calibration", - "score": 0.00009644610962286308 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.03, mean=0.031, max=0.032, sum=0.092 (3)", - "tab": "Robustness", - "score": 0.030683511825215847 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.101, mean=0.122, max=0.135, sum=0.367 (3)", - "tab": "Robustness", - "score": 0.12220564653363493 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.027, mean=0.028, max=0.03, sum=0.084 (3)", - "tab": "Fairness", - "score": 0.028132918197666456 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.119, mean=0.136, max=0.151, sum=0.407 (3)", - "tab": "Fairness", - "score": 0.13562055302845238 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=1.309, mean=1.457, max=1.621, sum=4.371 (3)", - "tab": "Efficiency", - "score": 1.4571279249547553 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=2.864, mean=2.895, max=2.953, sum=8.685 (3)", - "tab": "Efficiency", - "score": 2.8950855693236632 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, 
max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.556, mean=113.556, max=118.556, sum=340.668 (3)", - "tab": "General information", - "score": 113.556 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=900 (3)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.164, mean=3.396, max=3.709, sum=10.189 (3)", - "tab": "General information", - "score": 3.396333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.052, mean=0.057, max=0.066, sum=0.172 (3)", - "tab": "General information", - "score": 0.057333333333333326 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=850.863, mean=903.877, max=958.904, sum=2711.631 (3)", - "tab": "General information", - "score": 903.8770000000001 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=900 (3)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.385, mean=0.462, max=0.5, sum=1.385 (3)", - "tab": "Bias", - "score": 0.46155024509803927 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.552, mean=0.613, max=0.657, sum=1.84 (3)", - "tab": "Bias", - "score": 0.6131917464492584 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.028, mean=0.177, max=0.252, sum=0.53 (3)", - "tab": "Bias", - "score": 0.17673498741459906 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.209, mean=0.329, max=0.473, sum=0.987 (3)", - "tab": "Bias", - "score": 0.32890264223378113 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.289, mean=0.388, max=0.456, sum=1.164 (3)", - "tab": "Bias", - "score": 0.38814814814814813 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.394, mean=0.462, max=0.563, sum=1.386 (3)", - "tab": "Bias", - "score": 0.4620750643944221 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.044, mean=0.091, max=0.176, sum=0.273 (3)", - "tab": "Bias", - "score": 0.09087407629591253 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 
0.0013333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.121, - "details": { - "description": "min=0.121, mean=0.121, max=0.121, sum=0.362 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Calibration", - "score": 0.0005015010499976317 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.071, mean=0.071, max=0.071, sum=0.212 (3)", - "tab": "Robustness", - "score": 0.07065126152546952 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.067, mean=0.067, max=0.067, sum=0.201 (3)", - "tab": "Fairness", - "score": 0.06691720655918869 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.239, mean=1.239, max=1.239, sum=3.716 (3)", - "tab": "Efficiency", - "score": 1.2385025575706792 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - truncated": { - "description": "min=0.985, mean=0.985, max=0.985, sum=2.955 (3)", - "tab": "General information", - "score": 0.985 - }, - "QuAC - # prompt tokens": { - "description": "min=823.365, mean=823.365, max=823.365, sum=2470.095 (3)", - "tab": "General information", - "score": 823.3650000000001 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=300 (3)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=1.284 (3)", - "tab": "Bias", - "score": 0.42797040922040913 - }, - "QuAC - Representation (race)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=1.308 (3)", - "tab": "Bias", - "score": 0.4358974358974359 - }, - "QuAC - Representation (gender)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.872 (3)", - "tab": "Bias", - "score": 0.2905073649754501 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.377, - "details": { - "description": "min=0.347, mean=0.377, max=0.411, sum=1.508 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.1, mean=0.154, max=0.234, sum=0.617 (4)", - "tab": "Calibration", - "score": 0.15413479575183991 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.33, mean=0.365, max=0.411, sum=1.46 (4)", - "tab": "Robustness", - "score": 0.3650611620795107 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.307, mean=0.35, max=0.411, sum=1.399 (4)", - "tab": "Fairness", - "score": 0.34977064220183485 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.133, mean=0.142, max=0.145, sum=0.567 (4)", - "tab": "Efficiency", - "score": 0.14173421436146078 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=85.896, mean=391.646, max=515.896, sum=1566.584 (4)", - "tab": "General information", - "score": 391.6460244648318 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": 
"General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.122, - "details": { - "description": "min=0.121, mean=0.122, max=0.122, sum=0.73 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.057, mean=1.066, max=1.081, sum=6.393 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. 
See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Efficiency", - "score": 1.0655231237061773 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=1.303, mean=1.335, max=1.378, sum=8.013 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 1.3354792560801145 - }, - "CNN/DailyMail - truncated": { - "description": "min=0.004, mean=0.004, max=0.004, sum=0.026 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 0.004291845493562232 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=885.292, mean=886.838, max=888.921, sum=5321.026 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 886.8376251788268 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=128, mean=128, max=128, sum=768 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 128.0 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.562, mean=0.594, max=0.631, sum=3.562 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.5936999598322023 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.391, mean=0.403, max=0.421, sum=2.417 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.4028700462262689 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.27, mean=0.277, max=0.282, sum=1.662 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.2769263317991031 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.047, mean=0.093, max=0.138, sum=0.559 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. 
See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.09311410441258088 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.052, mean=-0.044, max=-0.031, sum=-0.132 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": -0.04384894228805586 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.151, mean=0.155, max=0.163, sum=0.465 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 0.1550916195946839 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.836, mean=0.841, max=0.845, sum=5.047 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 0.841192270385719 - }, - "CNN/DailyMail - Density": { - "description": "min=8.147, mean=8.588, max=8.816, sum=51.53 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 8.588383920302716 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.169, mean=8.274, max=8.416, sum=49.643 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 8.27387938295926 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.09, - "details": { - "description": "min=0.07, mean=0.09, max=0.103, sum=0.539 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. 
See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.523, mean=0.554, max=0.571, sum=3.326 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Efficiency", - "score": 0.5543883131537052 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=1.967, mean=2.068, max=2.214, sum=12.405 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 2.0675675675675675 - }, - "XSUM - truncated": { - "description": "min=0.002, mean=0.01, max=0.019, sum=0.058 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 0.009652509652509652 - }, - "XSUM - # prompt tokens": { - "description": "min=889.981, mean=907.769, max=929.006, sum=5446.614 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 907.7689832689833 - }, - "XSUM - # output tokens": { - "description": "min=64, mean=64, max=64, sum=384 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 64.0 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.43, mean=0.444, max=0.463, sum=2.663 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.4438297255067441 - }, - "XSUM - Representation (race)": { - "description": "min=0.286, mean=0.457, max=0.617, sum=2.74 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.45673778645470176 - }, - "XSUM - Representation (gender)": { - "description": "min=0.215, mean=0.27, max=0.328, sum=1.62 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. 
See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.2699471127776433 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Toxicity", - "score": 0.0006435006435006435 - }, - "XSUM - SummaC": { - "description": "min=-0.331, mean=-0.3, max=-0.268, sum=-0.901 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": -0.3004745337800477 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.083, mean=0.097, max=0.111, sum=0.292 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 0.09723521885401472 - }, - "XSUM - Coverage": { - "description": "min=0.543, mean=0.579, max=0.605, sum=3.474 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 0.5789418979978066 - }, - "XSUM - Density": { - "description": "min=1.492, mean=1.684, max=1.861, sum=10.105 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 1.6841663389066148 - }, - "XSUM - Compression": { - "description": "min=10.341, mean=11.178, max=11.672, sum=67.065 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 11.17756803869132 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.207, - "details": { - "description": "min=0.181, mean=0.207, max=0.26, sum=0.622 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. 
See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.207, mean=0.291, max=0.36, sum=0.872 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Calibration", - "score": 0.29061500207311436 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.13, mean=0.17, max=0.227, sum=0.511 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Robustness", - "score": 0.17033333333333334 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.129, mean=0.168, max=0.22, sum=0.505 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Fairness", - "score": 0.16833333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.37, mean=0.393, max=0.436, sum=1.18 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Efficiency", - "score": 0.39343433208828427 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=1.981, mean=2.44, max=3.074, sum=7.321 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 2.4403333333333332 - }, - "IMDB - truncated": { - "description": "min=0.03, mean=0.03, max=0.03, sum=0.09 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 0.03 - }, - "IMDB - # prompt tokens": { - "description": "min=905.879, mean=910.174, max=913.752, sum=2730.521 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 910.1736666666666 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. 
See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.234, - "details": { - "description": "min=0, mean=0.234, max=0.985, sum=12.634 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.067, mean=0.308, max=0.574, sum=16.631 (54)", - "tab": "Calibration", - "score": 0.30797595023001567 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.087, max=0.824, sum=4.704 (54)", - "tab": "Robustness", - "score": 0.0871064519307774 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.165, max=0.947, sum=8.894 (54)", - "tab": "Fairness", - "score": 0.16470832145418626 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.328, mean=0.391, max=0.487, sum=21.126 (54)", - "tab": "Efficiency", - "score": 0.3912135341654548 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=2.991, mean=4.861, max=5, sum=262.497 (54)", - "tab": "General information", - "score": 4.861055391438897 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=385.732, mean=744.109, max=936.562, sum=40181.894 (54)", - "tab": "General information", - "score": 744.1091399163704 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "min=0.3, mean=0.459, max=0.5, sum=5.503 (12)", - "tab": "Bias", - "score": 0.4585978835978836 - }, - "CivilComments - Toxic fraction": { - 
"description": "min=0, mean=0.0, max=0.008, sum=0.025 (54)", - "tab": "Toxicity", - "score": 0.0004596436870303355 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.118, - "details": { - "description": "min=0, mean=0.118, max=0.775, sum=3.9 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.0, mean=0.086, max=0.573, sum=2.84 (33)", - "tab": "Calibration", - "score": 0.08607203532710274 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.085, max=0.775, sum=2.8 (33)", - "tab": "Robustness", - "score": 0.08484848484848484 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.106, max=0.75, sum=3.5 (33)", - "tab": "Fairness", - "score": 0.10606060606060606 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.329, mean=0.586, max=0.74, sum=19.352 (33)", - "tab": "Efficiency", - "score": 0.586429068475456 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=3.913, max=5, sum=129.125 (33)", - "tab": "General information", - "score": 3.912878787878788 - }, - "RAFT - truncated": { - "description": "min=0, mean=0.09, max=0.925, sum=2.975 (33)", - "tab": "General information", - "score": 0.09015151515151516 - }, - "RAFT - # prompt tokens": { - "description": "min=263.4, mean=650.012, max=949.7, sum=21450.4 (33)", - "tab": "General information", - "score": 650.0121212121212 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=990 (33)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0.125, mean=0.125, max=0.125, sum=0.375 (3)", - "tab": "Bias", - "score": 0.12500000000000003 - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json b/data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json deleted file mode 100644 index 25f29c7e2..000000000 --- a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-52.4B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - 
"source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere Command beta 52.4B", - "id": "cohere/Cohere-Command-beta-52.4B", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.874, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5963856625666678 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.8502739196287583 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.8657917351465738 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5758163753811841 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6738178488178488 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6776315789473684 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.23, mean=0.452, max=0.79, sum=6.786 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.099, mean=0.183, max=0.338, sum=2.742 (15)", - "tab": "Calibration", - "score": 0.18282231471159943 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.15, mean=0.387, max=0.73, sum=5.807 (15)", - "tab": "Robustness", - "score": 0.38711111111111113 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.19, mean=0.407, max=0.73, sum=6.107 (15)", - "tab": "Fairness", - "score": 0.4071111111111111 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=7218.903 (15)", - "tab": "General information", - "score": 481.2602105263158 - }, 
- "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.856, - "details": { - "description": "min=0.849, mean=0.856, max=0.86, sum=2.569 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.018, mean=0.023, max=0.026, sum=0.069 (3)", - "tab": "Calibration", - "score": 0.02302613493537822 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.806, mean=0.811, max=0.816, sum=2.432 (3)", - "tab": "Robustness", - "score": 0.8106666666666666 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.812, mean=0.822, max=0.827, sum=2.465 (3)", - "tab": "Fairness", - "score": 0.8216666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.744, mean=0.752, max=0.763, sum=2.255 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.051, mean=0.058, 
max=0.067, sum=0.173 (3)", - "tab": "Calibration", - "score": 0.05761424791814445 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.566, mean=0.57, max=0.578, sum=1.711 (3)", - "tab": "Robustness", - "score": 0.5702997988620334 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.647, mean=0.657, max=0.666, sum=1.97 (3)", - "tab": "Fairness", - "score": 0.6566736137653061 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.904, mean=1.508, max=1.941, sum=4.524 (3)", - "tab": "General information", - "score": 1.5079812206572771 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1570.772, mean=1600.684, max=1660.485, sum=4802.051 (3)", - "tab": "General information", - "score": 1600.6835680751174 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.679, mean=5.992, max=6.496, sum=17.977 (3)", - "tab": "General information", - "score": 5.992488262910798 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.361, mean=0.404, max=0.444, sum=1.213 (3)", - "tab": "Bias", - "score": 0.404320987654321 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.174, mean=0.178, max=0.181, sum=0.534 (3)", - "tab": "Bias", - "score": 0.1778748183802931 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.014, max=0.017, sum=0.042 (3)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.755, mean=0.76, max=0.763, sum=2.28 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.075, mean=0.084, max=0.091, sum=0.251 (3)", - "tab": "Calibration", - "score": 0.08377931898267306 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.044, mean=0.056, max=0.063, sum=0.168 (3)", - "tab": "Calibration", - "score": 0.05602757611120105 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.286, mean=0.289, max=0.294, sum=0.867 (3)", - "tab": "Robustness", - "score": 0.28891923018489013 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.669, 
mean=0.679, max=0.685, sum=2.036 (3)", - "tab": "Robustness", - "score": 0.6786112890887687 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.29, mean=0.296, max=0.301, sum=0.888 (3)", - "tab": "Fairness", - "score": 0.29608566298974776 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.7, mean=0.706, max=0.714, sum=2.117 (3)", - "tab": "Fairness", - "score": 0.7056823207366739 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.29, mean=4.325, max=4.367, sum=12.974 (3)", - "tab": "General information", - "score": 4.324666666666666 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.485, mean=4.602, max=4.705, sum=13.807 (3)", - "tab": "General information", - "score": 4.602333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1258.15, mean=1471.073, max=1597.431, sum=4413.22 (3)", - "tab": "General information", - "score": 1471.073333333333 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.153, mean=7.288, max=7.488, sum=21.864 (3)", - "tab": "General information", - "score": 7.288 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.487, mean=0.552, max=0.634, sum=1.655 (3)", - "tab": "Bias", - "score": 0.5517958743765196 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.063, mean=0.129, max=0.206, sum=0.387 (3)", - "tab": "Bias", - "score": 0.12914332399626519 - }, - 
"NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.479, mean=0.482, max=0.483, sum=1.446 (3)", - "tab": "Bias", - "score": 0.48194444444444445 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.577, mean=0.579, max=0.582, sum=1.737 (3)", - "tab": "Bias", - "score": 0.5791309646902151 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.025, mean=0.05, max=0.067, sum=0.151 (3)", - "tab": "Bias", - "score": 0.05047080979284368 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432, - "details": { - "description": "min=0.429, mean=0.432, max=0.435, sum=1.296 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.043, mean=0.06, max=0.073, sum=0.181 (3)", - "tab": "Calibration", - "score": 0.06049762085119498 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.236, mean=0.238, max=0.24, sum=0.715 (3)", - "tab": "Robustness", - "score": 0.23825281130135667 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.309, mean=0.316, max=0.322, sum=0.947 (3)", - "tab": "Fairness", - "score": 0.31563184414828255 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.748, mean=0.848, max=0.933, sum=2.545 (3)", - "tab": "General information", - "score": 0.8483333333333333 - }, - "QuAC - truncated": { - "description": "min=0.022, mean=0.022, max=0.022, sum=0.066 (3)", - "tab": "General information", - "score": 0.022000000000000002 - }, - "QuAC - # prompt tokens": { - "description": "min=1577.224, mean=1610.503, max=1643.74, sum=4831.508 (3)", - "tab": "General information", - "score": 1610.5026666666665 - }, - "QuAC - # output tokens": { - "description": "min=19.435, mean=19.627, max=19.984, sum=58.881 (3)", - "tab": "General information", - "score": 19.627 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.593, mean=0.596, max=0.603, sum=1.788 (3)", - "tab": "Bias", - "score": 0.5961199294532628 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.459, mean=0.47, max=0.484, sum=1.409 (3)", - "tab": "Bias", - "score": 0.4696816360952984 - }, - "QuAC - Representation (race)": { - "description": "min=0.299, mean=0.316, 
max=0.333, sum=0.949 (3)", - "tab": "Bias", - "score": 0.316297459154602 - }, - "QuAC - Representation (gender)": { - "description": "min=0.219, mean=0.232, max=0.245, sum=0.695 (3)", - "tab": "Bias", - "score": 0.23168423828159934 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.325 (1)", - "tab": "Calibration", - "score": 0.3246923611213033 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.774, mean=0.774, max=0.774, sum=0.774 (1)", - "tab": "Robustness", - "score": 0.774 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.699, mean=0.699, max=0.699, sum=0.699 (1)", - "tab": "Fairness", - "score": 0.699 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.582, - "details": { - "description": "min=0.582, mean=0.582, max=0.582, sum=0.582 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.231 (1)", - "tab": "Calibration", - "score": 0.23111297495969485 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.492 (1)", - "tab": "Robustness", - "score": 0.492 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.508, mean=0.508, max=0.508, sum=0.508 (1)", - "tab": "Fairness", - "score": 0.508 - }, - "OpenbookQA - 
Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.269, - "details": { - "description": "min=0.265, mean=0.269, max=0.275, sum=0.807 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.272, mean=0.311, max=0.338, sum=0.933 (3)", - "tab": "Calibration", - "score": 0.31095945192078733 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.226, mean=0.229, max=0.231, sum=0.688 (3)", - "tab": "Robustness", - "score": 0.2293577981651376 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.219, mean=0.222, max=0.225, sum=0.665 (3)", - "tab": "Fairness", - "score": 0.2217125382262997 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.761, mean=0.762, max=0.765, sum=2.287 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.429, mean=0.434, max=0.438, sum=1.303 (3)", - "tab": "Robustness", - "score": 0.43439140211640154 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.726, mean=0.734, max=0.743, sum=2.202 (3)", - "tab": "Robustness", - "score": 0.7339375978505934 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.444, mean=0.45, max=0.453, sum=1.35 (3)", - "tab": "Fairness", - "score": 0.4498752645502638 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.745, mean=0.748, max=0.752, sum=2.245 (3)", - "tab": "Fairness", - "score": 0.7483868294443408 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=497.281, mean=536.614, max=583.281, sum=1609.843 (3)", - "tab": "General information", - "score": 536.6143333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", 
- "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.161, - "details": { - "description": "min=0.156, mean=0.161, max=0.167, sum=0.966 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=72.088, mean=74.406, max=77.451, sum=446.433 (6)", - "tab": "General information", - "score": 74.40557939914163 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.587, mean=0.612, max=0.629, sum=3.673 (6)", - "tab": "Bias", - "score": 0.6121656731068496 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.391, mean=0.396, max=0.407, sum=2.379 (6)", - "tab": "Bias", - "score": 0.39642600089657387 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.238, mean=0.286, max=0.343, sum=1.713 (6)", - "tab": "Bias", - "score": 0.28558037967512334 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.088, mean=0.09, max=0.093, sum=0.537 (6)", - "tab": "Bias", - "score": 0.08955985269326716 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.366, mean=0.415, max=0.441, sum=1.245 (3)", - "tab": "Summarization metrics", - 
"score": 0.4149051333035736 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.316, mean=0.318, max=0.322, sum=0.955 (3)", - "tab": "Summarization metrics", - "score": 0.31834420143428105 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.976, mean=0.979, max=0.982, sum=5.874 (6)", - "tab": "Summarization metrics", - "score": 0.9790462109521986 - }, - "CNN/DailyMail - Density": { - "description": "min=28.96, mean=32.165, max=35.676, sum=192.989 (6)", - "tab": "Summarization metrics", - "score": 32.164866076836944 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.594, mean=9.156, max=9.657, sum=54.938 (6)", - "tab": "Summarization metrics", - "score": 9.156293880030324 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.152, - "details": { - "description": "min=0.147, mean=0.152, max=0.156, sum=0.913 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.997, max=5, sum=29.985 (6)", - "tab": "General information", - "score": 4.997425997425997 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.293, max=1572.616, sum=9223.757 (6)", - "tab": "General information", - "score": 1537.2927927927929 - }, - "XSUM - # output tokens": { - "description": "min=24.187, mean=24.351, max=24.541, sum=146.108 (6)", - "tab": "General information", - "score": 24.35135135135135 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.0 (6)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.433, mean=0.457, max=0.476, sum=2.745 (6)", - "tab": "Bias", - "score": 0.4574302134646962 - }, - "XSUM - Representation (race)": { - "description": "min=0.481, mean=0.522, max=0.556, sum=3.13 (6)", - "tab": "Bias", - "score": 0.5217473884140551 - }, - "XSUM - Representation (gender)": { - "description": "min=0.18, mean=0.181, 
max=0.182, sum=1.086 (6)", - "tab": "Bias", - "score": 0.1810207108427353 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.285, mean=-0.271, max=-0.262, sum=-0.814 (3)", - "tab": "Summarization metrics", - "score": -0.27140173856816235 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.455, mean=0.459, max=0.462, sum=1.376 (3)", - "tab": "Summarization metrics", - "score": 0.4587225678869484 - }, - "XSUM - Coverage": { - "description": "min=0.788, mean=0.793, max=0.797, sum=4.758 (6)", - "tab": "Summarization metrics", - "score": 0.7930169105851288 - }, - "XSUM - Density": { - "description": "min=2.417, mean=2.548, max=2.678, sum=15.286 (6)", - "tab": "Summarization metrics", - "score": 2.54760656490819 - }, - "XSUM - Compression": { - "description": "min=16.704, mean=16.937, max=17.065, sum=101.621 (6)", - "tab": "Summarization metrics", - "score": 16.93675136805864 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.955, mean=0.96, max=0.965, sum=2.881 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.011, mean=0.015, max=0.02, sum=0.045 (3)", - "tab": "Calibration", - "score": 0.015015056118517703 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.929, mean=0.933, max=0.936, sum=2.799 (3)", - "tab": "Robustness", - "score": 0.9330000000000002 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.951, mean=0.957, max=0.96, sum=2.871 (3)", - "tab": "Fairness", - "score": 0.957 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.89, mean=4.217, max=4.981, sum=12.652 (3)", - "tab": "General information", - "score": 4.217333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1282.318, mean=1557.741, max=1776.111, sum=4673.222 (3)", - "tab": "General information", - "score": 1557.7406666666666 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - 
"IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.601, - "details": { - "description": "min=0.254, mean=0.601, max=0.86, sum=32.478 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.054, mean=0.161, max=0.416, sum=8.676 (54)", - "tab": "Calibration", - "score": 0.16066140880534402 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.205, mean=0.535, max=0.84, sum=28.866 (54)", - "tab": "Robustness", - "score": 0.5345588668880686 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.222, mean=0.544, max=0.85, sum=29.397 (54)", - "tab": "Fairness", - "score": 0.5443897908426464 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - 
}, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.025, mean=0.667, max=0.975, sum=22.0 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.041, mean=0.262, max=0.96, sum=8.637 (33)", - "tab": "Calibration", - "score": 0.26172447899775947 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.599, max=0.975, sum=19.775 (33)", - "tab": "Robustness", - "score": 0.5992424242424242 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.025, mean=0.627, max=0.975, sum=20.7 (33)", - "tab": "Fairness", - "score": 0.6272727272727272 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.554, max=5, sum=150.275 (33)", - "tab": "General information", - "score": 4.553787878787879 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=813.265, max=1762.475, sum=26837.75 (33)", - "tab": "General information", - "score": 813.2651515151515 - }, - "RAFT - # output tokens": { - "description": "min=0.025, mean=3.15, max=6.8, sum=103.95 (33)", - "tab": "General information", - "score": 3.15 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json b/data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json deleted file mode 100644 index 8f01acff1..000000000 --- a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-6.1B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere Command beta 6.1B", - "id": "cohere/Cohere-Command-beta-6.1B", - "developer": "cohere", - 
"inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5291111339523303 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.6159776448986682 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.66227113635345 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.43551719208606965 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6688037271370605 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5789473684210527 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406, - "details": { - "description": "min=0.26, mean=0.406, max=0.63, sum=6.095 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.103, mean=0.155, max=0.243, sum=2.327 (15)", - "tab": "Calibration", - "score": 0.1551609000421963 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.2, mean=0.334, max=0.54, sum=5.009 (15)", - "tab": "Robustness", - "score": 0.33394152046783626 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.2, mean=0.366, max=0.55, sum=5.495 (15)", - "tab": "Fairness", - "score": 0.36630409356725147 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=7218.903 (15)", - "tab": "General information", - "score": 481.2602105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - 
"score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "details": { - "description": "min=0.791, mean=0.798, max=0.809, sum=2.394 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.048, mean=0.059, max=0.069, sum=0.178 (3)", - "tab": "Calibration", - "score": 0.0594622129465324 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.715, mean=0.725, max=0.743, sum=2.176 (3)", - "tab": "Robustness", - "score": 0.7253333333333334 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.74, mean=0.748, max=0.76, sum=2.244 (3)", - "tab": "Fairness", - "score": 0.7479999999999999 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.707, mean=0.709, max=0.712, sum=2.128 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.075, mean=0.076, max=0.077, sum=0.228 (3)", - "tab": "Calibration", - "score": 0.07599807506781359 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.515, mean=0.529, max=0.539, sum=1.586 (3)", - "tab": "Robustness", - "score": 0.5285770759196127 - }, - 
"NarrativeQA - F1 (Fairness)": { - "description": "min=0.592, mean=0.595, max=0.6, sum=1.785 (3)", - "tab": "Fairness", - "score": 0.5949605221040284 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.904, mean=1.508, max=1.941, sum=4.524 (3)", - "tab": "General information", - "score": 1.5079812206572771 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1570.772, mean=1600.684, max=1660.485, sum=4802.051 (3)", - "tab": "General information", - "score": 1600.6835680751174 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.301, mean=5.807, max=6.217, sum=17.42 (3)", - "tab": "General information", - "score": 5.8065727699530525 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.463, mean=0.488, max=0.5, sum=1.463 (3)", - "tab": "Bias", - "score": 0.48765432098765427 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.126, mean=0.144, max=0.169, sum=0.432 (3)", - "tab": "Bias", - "score": 0.14398558425056623 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.01, max=0.014, sum=0.031 (3)", - "tab": "Toxicity", - "score": 0.010328638497652582 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.714, mean=0.717, max=0.724, sum=2.152 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.04, mean=0.042, max=0.046, sum=0.127 (3)", - "tab": "Calibration", - "score": 0.04227945276969597 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.045, mean=0.057, max=0.074, sum=0.172 (3)", - "tab": "Calibration", - "score": 0.057325907163997956 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.156, mean=0.163, max=0.171, sum=0.489 (3)", - "tab": "Robustness", - "score": 0.163031767310864 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.596, mean=0.605, max=0.616, sum=1.815 (3)", - "tab": "Robustness", - "score": 0.6050162193677248 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.165, mean=0.167, max=0.167, sum=0.5 (3)", - "tab": "Fairness", - "score": 
0.16652011745655915 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.647, mean=0.654, max=0.66, sum=1.962 (3)", - "tab": "Fairness", - "score": 0.6540942012407344 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.428, mean=4.687, max=4.995, sum=14.06 (3)", - "tab": "General information", - "score": 4.6866666666666665 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.485, mean=4.602, max=4.705, sum=13.807 (3)", - "tab": "General information", - "score": 4.602333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1258.15, mean=1471.073, max=1597.431, sum=4413.22 (3)", - "tab": "General information", - "score": 1471.073333333333 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.147, mean=7.377, max=7.586, sum=22.131 (3)", - "tab": "General information", - "score": 7.377 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.421, mean=0.465, max=0.506, sum=1.394 (3)", - "tab": "Bias", - "score": 0.46474105132386057 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.1, mean=0.183, max=0.3, sum=0.55 (3)", - "tab": "Bias", - "score": 0.18333333333333335 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - 
"NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.473, mean=0.487, max=0.509, sum=1.46 (3)", - "tab": "Bias", - "score": 0.48677896291115386 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.348, mean=0.356, max=0.363, sum=1.068 (3)", - "tab": "Bias", - "score": 0.3560153609831029 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375, - "details": { - "description": "min=0.371, mean=0.375, max=0.379, sum=1.125 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.054, mean=0.062, max=0.067, sum=0.186 (3)", - "tab": "Calibration", - "score": 0.06185077042352865 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.159, mean=0.17, max=0.178, sum=0.511 (3)", - "tab": "Robustness", - "score": 0.17034790269142241 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.268, mean=0.273, max=0.279, sum=0.819 (3)", - "tab": "Fairness", - "score": 0.2730533859766594 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.748, mean=0.848, max=0.933, sum=2.545 (3)", - "tab": "General information", - "score": 0.8483333333333333 - }, - "QuAC - truncated": { - "description": "min=0.022, mean=0.022, max=0.022, sum=0.066 (3)", - "tab": "General information", - "score": 0.022000000000000002 - }, - "QuAC - # prompt tokens": { - "description": "min=1577.224, mean=1610.503, max=1643.74, sum=4831.508 (3)", - "tab": "General information", - "score": 1610.5026666666665 - }, - "QuAC - # output tokens": { - "description": "min=16.185, mean=17.394, max=18.299, sum=52.182 (3)", - "tab": "General information", - "score": 17.394 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.469, mean=0.471, max=0.475, sum=1.414 (3)", - "tab": "Bias", - "score": 0.47144607843137254 - }, - "QuAC - Representation (race)": { - "description": "min=0.312, mean=0.356, max=0.423, sum=1.069 (3)", - "tab": "Bias", - "score": 0.35619490458200137 - }, - "QuAC - Representation (gender)": { - "description": "min=0.236, mean=0.248, max=0.259, sum=0.743 (3)", - "tab": "Bias", - "score": 0.2476420794142787 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, 
sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.293 (1)", - "tab": "Calibration", - "score": 0.2926835489814197 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.696, mean=0.696, max=0.696, sum=0.696 (1)", - "tab": "Robustness", - "score": 0.696 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.608, mean=0.608, max=0.608, sum=0.608 (1)", - "tab": "Fairness", - "score": 0.608 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=0.55 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.25 (1)", - "tab": "Calibration", - "score": 0.2504061981122775 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.448 (1)", - "tab": "Robustness", - "score": 0.448 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.468 (1)", - "tab": "Fairness", - "score": 0.468 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, 
mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.203, - "details": { - "description": "min=0.197, mean=0.203, max=0.213, sum=0.61 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.275, mean=0.3, max=0.332, sum=0.901 (3)", - "tab": "Calibration", - "score": 0.3001833323753285 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.168, mean=0.171, max=0.174, sum=0.512 (3)", - "tab": "Robustness", - "score": 0.17074413863404692 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.154, mean=0.163, max=0.167, sum=0.488 (3)", - "tab": "Fairness", - "score": 0.16258919469928643 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.702, mean=0.709, max=0.717, sum=2.128 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.372, mean=0.387, max=0.401, sum=1.161 (3)", - "tab": "Robustness", - 
"score": 0.386937698412698 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.68, mean=0.685, max=0.689, sum=2.054 (3)", - "tab": "Robustness", - "score": 0.6845367765287401 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.402, mean=0.411, max=0.42, sum=1.232 (3)", - "tab": "Fairness", - "score": 0.4107572751322747 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.68, mean=0.69, max=0.696, sum=2.069 (3)", - "tab": "Fairness", - "score": 0.6896233668786421 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=497.281, mean=536.614, max=583.281, sum=1609.843 (3)", - "tab": "General information", - "score": 536.6143333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) 
- Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.153, - "details": { - "description": "min=0.15, mean=0.153, max=0.158, sum=0.919 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=69.622, mean=73.723, max=77.732, sum=442.339 (6)", - "tab": "General information", - "score": 73.72317596566523 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.594, mean=0.603, max=0.609, sum=3.618 (6)", - "tab": "Bias", - "score": 0.6029930306246096 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.384, mean=0.408, max=0.421, sum=2.449 (6)", - "tab": "Bias", - "score": 0.40820094830714143 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.245, mean=0.259, max=0.269, sum=1.553 (6)", - "tab": "Bias", - "score": 0.2588148950314076 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.116, mean=0.121, max=0.127, sum=0.724 (6)", - "tab": "Bias", - "score": 0.1206019792299876 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.318, mean=0.331, max=0.342, sum=0.992 (3)", - "tab": "Summarization metrics", - "score": 0.3306993242099164 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.289, mean=0.296, max=0.305, sum=0.888 (3)", - "tab": "Summarization metrics", - "score": 
0.29605955170271475 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.974, mean=0.975, max=0.975, sum=5.848 (6)", - "tab": "Summarization metrics", - "score": 0.9746996636764317 - }, - "CNN/DailyMail - Density": { - "description": "min=28.678, mean=31.707, max=36.132, sum=190.245 (6)", - "tab": "Summarization metrics", - "score": 31.707488870766706 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.108, mean=9.688, max=10.161, sum=58.13 (6)", - "tab": "Summarization metrics", - "score": 9.688415513712991 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.122, - "details": { - "description": "min=0.122, mean=0.122, max=0.122, sum=0.73 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.997, max=5, sum=29.985 (6)", - "tab": "General information", - "score": 4.997425997425997 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.293, max=1572.616, sum=9223.757 (6)", - "tab": "General information", - "score": 1537.2927927927929 - }, - "XSUM - # output tokens": { - "description": "min=22.674, mean=23.421, max=24.095, sum=140.529 (6)", - "tab": "General information", - "score": 23.421492921492924 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.445, mean=0.454, max=0.467, sum=2.725 (6)", - "tab": "Bias", - "score": 0.45422077922077925 - }, - "XSUM - Representation (race)": { - "description": "min=0.483, mean=0.505, max=0.524, sum=3.031 (6)", - "tab": "Bias", - "score": 0.5051915503043323 - }, - "XSUM - Representation (gender)": { - "description": "min=0.198, mean=0.215, max=0.235, sum=1.29 (6)", - "tab": "Bias", - "score": 0.2150586429483566 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.244, mean=-0.239, max=-0.235, sum=-0.716 (3)", - "tab": "Summarization metrics", - "score": 
-0.23871033593647883 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.417, mean=0.418, max=0.42, sum=1.254 (3)", - "tab": "Summarization metrics", - "score": 0.4181413420706151 - }, - "XSUM - Coverage": { - "description": "min=0.823, mean=0.824, max=0.826, sum=4.943 (6)", - "tab": "Summarization metrics", - "score": 0.8238944118657666 - }, - "XSUM - Density": { - "description": "min=2.687, mean=2.793, max=2.942, sum=16.758 (6)", - "tab": "Summarization metrics", - "score": 2.7930375453507623 - }, - "XSUM - Compression": { - "description": "min=17.475, mean=18.017, max=18.57, sum=108.1 (6)", - "tab": "Summarization metrics", - "score": 18.016669951894464 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.961, - "details": { - "description": "min=0.959, mean=0.961, max=0.962, sum=2.882 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.011, mean=0.014, max=0.019, sum=0.043 (3)", - "tab": "Calibration", - "score": 0.014204038428277976 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.917, mean=0.921, max=0.925, sum=2.762 (3)", - "tab": "Robustness", - "score": 0.9206666666666669 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.946, mean=0.95, max=0.954, sum=2.851 (3)", - "tab": "Fairness", - "score": 0.9503333333333334 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.89, mean=4.217, max=4.981, sum=12.652 (3)", - "tab": "General information", - "score": 4.217333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1282.318, mean=1557.741, max=1776.111, sum=4673.222 (3)", - "tab": "General information", - "score": 1557.7406666666666 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, 
but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54, - "details": { - "description": "min=0.009, mean=0.54, max=1, sum=29.17 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.113, mean=0.358, max=0.735, sum=19.322 (54)", - "tab": "Calibration", - "score": 0.3578234752080933 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.468, max=1, sum=25.26 (54)", - "tab": "Robustness", - "score": 0.46778473308233626 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.002, mean=0.496, max=1, sum=26.757 (54)", - "tab": "Fairness", - "score": 0.4955072296924251 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.05, mean=0.634, max=0.975, sum=20.925 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.05, mean=0.274, max=0.84, sum=9.055 (33)", - "tab": "Calibration", - "score": 0.2744070774220778 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.552, max=0.975, sum=18.225 (33)", - "tab": "Robustness", - "score": 0.5522727272727274 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.05, mean=0.609, max=0.975, sum=20.1 (33)", - "tab": "Fairness", - "score": 0.609090909090909 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.554, max=5, sum=150.275 (33)", - "tab": "General information", - "score": 4.553787878787879 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=813.265, max=1762.475, sum=26837.75 (33)", - "tab": "General information", - "score": 813.2651515151515 - }, - "RAFT - # output tokens": { - "description": "min=0.2, mean=3.148, max=6.3, sum=103.875 (33)", - "tab": "General information", - "score": 3.1477272727272725 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json b/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json deleted file mode 100644 index 16c06b937..000000000 --- a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-large-v20220720-13.1B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere large v20220720 13.1B", - "id": "cohere/Cohere-large-v20220720-13.1B", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6524936901131783 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.3450884302942145 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.3621096552687209 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.40696820175438597 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5413536579003514 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.48450623450623453 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5760442773600668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.324, - "details": { - "description": "min=0.19, mean=0.324, max=0.4, sum=4.854 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.075, mean=0.112, max=0.151, sum=1.678 (15)", - "tab": "Calibration", - "score": 0.11188578153206447 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.15, mean=0.253, max=0.35, sum=3.799 (15)", - "tab": "Robustness", - "score": 0.25327485380116954 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.14, mean=0.281, max=0.38, sum=4.214 (15)", - "tab": "Fairness", - "score": 0.2809590643274854 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.292, mean=0.317, max=0.349, sum=4.752 (15)", - "tab": "Efficiency", - "score": 0.3167793253495066 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=7218.903 (15)", - "tab": "General information", - "score": 481.2602105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - 
"dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "details": { - "description": "min=0.705, mean=0.725, max=0.738, sum=2.176 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.066, mean=0.088, max=0.106, sum=0.265 (3)", - "tab": "Calibration", - "score": 0.08825401206422555 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.514, mean=0.545, max=0.566, sum=1.635 (3)", - "tab": "Robustness", - "score": 0.545 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.653, mean=0.676, max=0.695, sum=2.027 (3)", - "tab": "Fairness", - "score": 0.6756666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.359, mean=0.421, max=0.505, sum=1.263 (3)", - "tab": "Efficiency", - "score": 0.4208381308593749 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.581, mean=0.625, max=0.647, sum=1.874 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.032, mean=0.037, max=0.044, sum=0.11 (3)", - "tab": "Calibration", - "score": 0.03650754887085305 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.318, mean=0.357, max=0.38, sum=1.072 (3)", - "tab": "Robustness", - "score": 0.3573511654752053 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.466, mean=0.512, max=0.538, sum=1.537 (3)", - "tab": "Fairness", - "score": 
0.5123186802559418 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.693, mean=0.729, max=0.782, sum=2.186 (3)", - "tab": "Efficiency", - "score": 0.7286962533010564 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.958, mean=1.562, max=1.997, sum=4.687 (3)", - "tab": "General information", - "score": 1.5624413145539906 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.997, mean=1634.99, max=1693.155, sum=4904.969 (3)", - "tab": "General information", - "score": 1634.9896713615024 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.535, mean=6.91, max=9.504, sum=20.73 (3)", - "tab": "General information", - "score": 6.909859154929578 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.418, mean=0.473, max=0.5, sum=1.418 (3)", - "tab": "Bias", - "score": 0.4726495726495727 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.193, mean=0.202, max=0.211, sum=0.607 (3)", - "tab": "Bias", - "score": 0.20233455199447267 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.017, max=0.02, sum=0.051 (3)", - "tab": "Toxicity", - "score": 0.016901408450704227 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.573, - "details": { - "description": "min=0.553, mean=0.573, max=0.584, sum=1.72 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.02, mean=0.025, max=0.032, sum=0.074 (3)", - "tab": "Calibration", - "score": 0.024639111727299556 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.117, mean=0.143, max=0.158, sum=0.43 (3)", - "tab": "Calibration", - "score": 0.14321248401208217 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.16, mean=0.172, max=0.18, sum=0.515 (3)", - "tab": "Robustness", - "score": 0.17161461010403287 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.287, mean=0.347, max=0.38, sum=1.041 (3)", - "tab": "Robustness", - "score": 0.3470084296370371 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.176, mean=0.178, max=0.181, sum=0.535 (3)", - "tab": "Fairness", - "score": 0.17833773739586523 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.489, mean=0.507, 
max=0.516, sum=1.52 (3)", - "tab": "Fairness", - "score": 0.5065982888177307 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.332, mean=0.337, max=0.343, sum=1.012 (3)", - "tab": "Efficiency", - "score": 0.33722079557291607 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.681, mean=0.774, max=0.827, sum=2.321 (3)", - "tab": "Efficiency", - "score": 0.7738100833333333 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.441, mean=5.625, max=5.917, sum=16.875 (3)", - "tab": "General information", - "score": 5.625 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.538, mean=4.633, max=4.715, sum=13.899 (3)", - "tab": "General information", - "score": 4.633 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1261.72, mean=1481.344, max=1608.455, sum=4444.032 (3)", - "tab": "General information", - "score": 1481.344 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.71, mean=10.443, max=11.438, sum=31.329 (3)", - "tab": "General information", - "score": 10.443 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.25, mean=0.333, max=0.5, sum=1 (3)", - "tab": "Bias", - "score": 0.3333333333333333 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.244, mean=0.34, max=0.429, sum=1.021 (3)", - "tab": "Bias", - "score": 0.34034751045060324 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.208, mean=0.233, max=0.269, sum=0.7 (3)", - "tab": "Bias", - "score": 0.23326210826210825 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.37, mean=0.39, max=0.4, sum=1.17 (3)", - "tab": "Bias", - "score": 0.38999999999999996 - }, - "NaturalQuestions (open-book) - Representation 
(race)": { - "description": "min=0.447, mean=0.457, max=0.467, sum=1.371 (3)", - "tab": "Bias", - "score": 0.45706182643221777 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.125, mean=0.174, max=0.251, sum=0.523 (3)", - "tab": "Bias", - "score": 0.17447005829358772 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338, - "details": { - "description": "min=0.335, mean=0.338, max=0.343, sum=1.015 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.03, mean=0.033, max=0.036, sum=0.099 (3)", - "tab": "Calibration", - "score": 0.03288362014267938 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.197, mean=0.204, max=0.211, sum=0.613 (3)", - "tab": "Robustness", - "score": 0.20424911828028136 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.251, mean=0.256, max=0.259, sum=0.768 (3)", - "tab": "Fairness", - "score": 0.25613799535824233 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.189, mean=1.262, max=1.309, sum=3.785 (3)", - "tab": "Efficiency", - "score": 1.261730263346353 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.797, mean=0.881, max=0.969, sum=2.644 (3)", - "tab": "General information", - "score": 0.8813333333333334 - }, - "QuAC - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)", - "tab": "General information", - "score": 0.02 - }, - "QuAC - # prompt tokens": { - "description": "min=1600.292, mean=1639.784, max=1661.675, sum=4919.353 (3)", - "tab": "General information", - "score": 1639.784333333333 - }, - "QuAC - # output tokens": { - "description": "min=26.693, mean=30.036, max=32.515, sum=90.109 (3)", - "tab": "General information", - "score": 30.036333333333335 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.43, mean=0.441, max=0.46, sum=1.322 (3)", - "tab": "Bias", - "score": 0.4407422751666938 - }, - "QuAC - Representation (race)": { - "description": "min=0.306, mean=0.338, max=0.358, sum=1.015 (3)", - "tab": "Bias", - "score": 0.3382593663469334 - }, - "QuAC - Representation (gender)": { - "description": "min=0.234, mean=0.238, max=0.243, sum=0.714 (3)", - "tab": "Bias", - "score": 0.23804653081585347 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.004, sum=0.01 (3)", - "tab": 
"Toxicity", - "score": 0.0033333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=0.736 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.288 (1)", - "tab": "Calibration", - "score": 0.28820318504565584 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.687, mean=0.687, max=0.687, sum=0.687 (1)", - "tab": "Robustness", - "score": 0.687 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.575, mean=0.575, max=0.575, sum=0.575 (1)", - "tab": "Fairness", - "score": 0.575 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.225, mean=0.225, max=0.225, sum=0.225 (1)", - "tab": "Efficiency", - "score": 0.22464337890624972 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542, - "details": { - "description": "min=0.542, mean=0.542, max=0.542, sum=0.542 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.225, mean=0.225, max=0.225, sum=0.225 (1)", - "tab": "Calibration", - "score": 0.2254334966206393 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.43 (1)", - "tab": "Robustness", - "score": 0.43 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.446 (1)", - "tab": "Fairness", - "score": 0.446 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.201, mean=0.201, max=0.201, sum=0.201 (1)", - "tab": "Efficiency", - "score": 0.2014860078125007 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # 
train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.181, - "details": { - "description": "min=0.161, mean=0.181, max=0.2, sum=0.544 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.097, mean=0.105, max=0.117, sum=0.316 (3)", - "tab": "Calibration", - "score": 0.10528939288118344 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.141, mean=0.154, max=0.173, sum=0.462 (3)", - "tab": "Robustness", - "score": 0.15392456676860344 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.142, mean=0.157, max=0.174, sum=0.471 (3)", - "tab": "Fairness", - "score": 0.15698267074413863 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.323, mean=0.325, max=0.328, sum=0.975 (3)", - "tab": "Efficiency", - "score": 0.3248777191442089 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.292, mean=0.33, max=0.382, sum=0.991 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.109, 
mean=0.13, max=0.147, sum=0.39 (3)", - "tab": "Robustness", - "score": 0.1300338624338624 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.195, mean=0.257, max=0.323, sum=0.772 (3)", - "tab": "Robustness", - "score": 0.2574506868270638 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.136, mean=0.164, max=0.189, sum=0.493 (3)", - "tab": "Fairness", - "score": 0.16423492063492048 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.273, mean=0.312, max=0.361, sum=0.936 (3)", - "tab": "Fairness", - "score": 0.3120660241438415 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.322, mean=0.33, max=0.339, sum=0.989 (3)", - "tab": "Efficiency", - "score": 0.3298234970703125 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.319, mean=0.327, max=0.335, sum=0.98 (3)", - "tab": "Efficiency", - "score": 0.32664419815891477 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=497.281, mean=536.614, max=583.281, sum=1609.843 (3)", - "tab": "General information", - "score": 536.6143333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.008, mean=1.025, max=1.046, sum=3.074 (3)", - "tab": "General information", - "score": 1.0246666666666666 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1.023, mean=1.031, max=1.047, sum=3.093 (3)", - "tab": "General information", - "score": 1.0310077519379846 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": 
null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.126, - "details": { - "description": "min=0.115, mean=0.126, max=0.134, sum=0.758 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=2.097, mean=2.269, max=2.366, sum=13.614 (6)", - "tab": "Efficiency", - "score": 2.2689930690607114 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=67.079, mean=74.505, max=78.916, sum=447.03 (6)", - "tab": "General information", - "score": 74.50500715307582 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.58, mean=0.626, max=0.659, sum=3.756 (6)", - "tab": "Bias", - "score": 0.6260369618341756 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.371, mean=0.401, max=0.431, sum=2.409 (6)", - "tab": "Bias", - "score": 0.40149048314255253 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.185, mean=0.238, max=0.295, sum=1.431 (6)", - "tab": "Bias", - "score": 0.23843844144516976 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.115, mean=0.134, max=0.153, sum=0.805 (6)", - "tab": "Bias", - "score": 0.1341289455316015 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.447, mean=0.5, max=0.543, sum=1.499 (3)", - "tab": "Summarization metrics", - "score": 0.4997740334832678 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.715, mean=4.763, max=4.822, 
sum=28.58 (6)", - "tab": "Summarization metrics", - "score": 4.763415476947068 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.227, mean=0.246, max=0.263, sum=0.737 (3)", - "tab": "Summarization metrics", - "score": 0.2457600895432969 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.903, mean=0.946, max=0.975, sum=5.678 (6)", - "tab": "Summarization metrics", - "score": 0.9463649022058865 - }, - "CNN/DailyMail - Density": { - "description": "min=30.364, mean=37.733, max=45.984, sum=226.401 (6)", - "tab": "Summarization metrics", - "score": 37.73347863579329 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.977, mean=11.27, max=13.424, sum=67.62 (6)", - "tab": "Summarization metrics", - "score": 11.269948645908789 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108, - "details": { - "description": "min=0.106, mean=0.108, max=0.11, sum=0.649 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.064, mean=1.075, max=1.089, sum=6.451 (6)", - "tab": "Efficiency", - "score": 1.0751711510617759 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.998, max=5, sum=29.988 (6)", - "tab": "General information", - "score": 4.998069498069498 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.452, max=1572.616, sum=9224.71 (6)", - "tab": "General information", - "score": 1537.4517374517375 - }, - "XSUM - # output tokens": { - "description": "min=22.133, mean=22.992, max=23.423, sum=137.954 (6)", - "tab": "General information", - "score": 22.99227799227799 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.456, mean=0.466, max=0.484, sum=2.793 (6)", - "tab": "Bias", - "score": 0.4655148596176822 - }, - "XSUM - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Representation (gender)": { - "description": "min=0.139, mean=0.157, max=0.172, sum=0.945 (6)", - "tab": "Bias", - "score": 0.15743560442588508 - }, - "XSUM - 
Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.008 (6)", - "tab": "Toxicity", - "score": 0.001287001287001287 - }, - "XSUM - SummaC": { - "description": "min=-0.196, mean=-0.189, max=-0.185, sum=-0.567 (3)", - "tab": "Summarization metrics", - "score": -0.18902428828304493 - }, - "XSUM - QAFactEval": { - "description": "min=2.852, mean=2.889, max=2.928, sum=17.336 (6)", - "tab": "Summarization metrics", - "score": 2.889265592037019 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.394, mean=0.398, max=0.403, sum=1.195 (3)", - "tab": "Summarization metrics", - "score": 0.3984961779205311 - }, - "XSUM - Coverage": { - "description": "min=0.82, mean=0.823, max=0.825, sum=4.937 (6)", - "tab": "Summarization metrics", - "score": 0.8227568594164721 - }, - "XSUM - Density": { - "description": "min=3.497, mean=3.599, max=3.746, sum=21.593 (6)", - "tab": "Summarization metrics", - "score": 3.5988000456323377 - }, - "XSUM - Compression": { - "description": "min=20.099, mean=20.712, max=21.78, sum=124.27 (6)", - "tab": "Summarization metrics", - "score": 20.711693139962097 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.933, - "details": { - "description": "min=0.929, mean=0.933, max=0.94, sum=2.8 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.098, mean=0.132, max=0.183, sum=0.396 (3)", - "tab": "Calibration", - "score": 0.13199349625828075 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.895, mean=0.902, max=0.91, sum=2.706 (3)", - "tab": "Robustness", - "score": 0.902 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.912, mean=0.92, max=0.93, sum=2.759 (3)", - "tab": "Fairness", - "score": 0.9196666666666666 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.479, mean=0.536, max=0.62, sum=1.607 (3)", - "tab": "Efficiency", - "score": 0.5358171357421871 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.846, mean=4.93, max=4.98, sum=14.79 (3)", - "tab": "General information", - "score": 4.930000000000001 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1161.854, mean=1398.654, max=1747.025, sum=4195.961 (3)", - "tab": "General information", - "score": 1398.6536666666668 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - 
"description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0, mean=0.507, max=1, sum=27.395 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.1, mean=0.384, max=0.705, sum=20.717 (54)", - "tab": "Calibration", - "score": 0.38365386942886265 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.333, max=0.95, sum=17.981 (54)", - "tab": "Robustness", - "score": 0.3329825600043121 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.443, max=1, sum=23.917 (54)", - "tab": "Fairness", - "score": 0.44290609222735455 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.29, mean=0.375, max=0.51, sum=20.235 (54)", - "tab": "Efficiency", - "score": 0.3747284900914756 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0, mean=0.596, max=0.975, sum=19.675 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.115, mean=0.267, max=1, sum=8.804 (33)", - "tab": "Calibration", - "score": 0.26679166027291745 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.49, max=0.975, sum=16.175 (33)", - "tab": "Robustness", - "score": 0.49015151515151517 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.564, max=0.975, sum=18.625 (33)", - "tab": "Fairness", - "score": 0.5643939393939394 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.284, mean=0.444, max=0.697, sum=14.664 (33)", - "tab": "Efficiency", - "score": 0.4443553984670929 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.557, max=5, sum=150.375 (33)", - "tab": "General information", - "score": 4.556818181818182 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=814.446, max=1777.025, sum=26876.725 (33)", - "tab": "General information", - "score": 814.446212121212 - }, - "RAFT - # output tokens": { - "description": "min=0, mean=3.02, max=6.5, sum=99.65 (33)", - "tab": "General information", - "score": 3.01969696969697 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json b/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json deleted file mode 100644 index f0d42b850..000000000 --- a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-medium-v20220720-6.1B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere medium v20220720 6.1B", - "id": 
"cohere/Cohere-medium-v20220720-6.1B", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.23, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5098117312502142 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.18793903538063716 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.26943181031056446 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5410910087719298 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4663309072932103 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5508257174923842 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.4311194653299916 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.279, - "details": { - "description": "min=0.18, mean=0.279, max=0.36, sum=4.182 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.067, mean=0.114, max=0.164, sum=1.703 (15)", - "tab": "Calibration", - "score": 0.11350786269483934 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.09, mean=0.184, max=0.24, sum=2.755 (15)", - "tab": "Robustness", - "score": 0.18368421052631578 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.15, mean=0.237, max=0.29, sum=3.548 (15)", - "tab": "Fairness", - "score": 0.23653801169590644 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.265, mean=0.281, max=0.301, sum=4.21 (15)", - "tab": "Efficiency", - "score": 0.2806724427425987 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=7218.903 (15)", - "tab": "General information", - "score": 481.2602105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - 
"MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.65, mean=0.659, max=0.667, sum=1.977 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.069, mean=0.082, max=0.093, sum=0.247 (3)", - "tab": "Calibration", - "score": 0.08218351589951171 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.556, mean=0.562, max=0.573, sum=1.686 (3)", - "tab": "Robustness", - "score": 0.5619999999999999 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.589, mean=0.597, max=0.61, sum=1.792 (3)", - "tab": "Fairness", - "score": 0.5973333333333333 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.308, mean=0.35, max=0.402, sum=1.049 (3)", - "tab": "Efficiency", - "score": 0.34952371158854173 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.54, mean=0.559, max=0.572, sum=1.677 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.043, mean=0.047, max=0.055, sum=0.141 (3)", - "tab": "Calibration", - "score": 0.046946382998353055 - }, - "NarrativeQA - F1 
(Robustness)": { - "description": "min=0.283, mean=0.3, max=0.315, sum=0.899 (3)", - "tab": "Robustness", - "score": 0.29964626689663526 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.416, mean=0.438, max=0.455, sum=1.313 (3)", - "tab": "Fairness", - "score": 0.4376922212938658 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.525, mean=0.533, max=0.548, sum=1.599 (3)", - "tab": "Efficiency", - "score": 0.5331198741930753 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.958, mean=1.562, max=1.997, sum=4.687 (3)", - "tab": "General information", - "score": 1.5624413145539906 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.997, mean=1634.99, max=1693.155, sum=4904.969 (3)", - "tab": "General information", - "score": 1634.9896713615024 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.392, mean=6.771, max=8.33, sum=20.313 (3)", - "tab": "General information", - "score": 6.770892018779342 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.394, mean=0.427, max=0.45, sum=1.282 (3)", - "tab": "Bias", - "score": 0.42718253968253966 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.373, mean=0.569, max=0.667, sum=1.706 (3)", - "tab": "Bias", - "score": 0.5686274509803922 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.152, mean=0.174, max=0.195, sum=0.521 (3)", - "tab": "Bias", - "score": 0.17371956530315583 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.02, max=0.025, sum=0.059 (3)", - "tab": "Toxicity", - "score": 0.01971830985915493 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.504, - "details": { - "description": "min=0.482, mean=0.504, max=0.516, sum=1.512 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.018, mean=0.026, max=0.036, sum=0.077 (3)", - "tab": "Calibration", - "score": 0.025653079993217736 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.129, mean=0.142, max=0.154, sum=0.425 (3)", - "tab": "Calibration", - "score": 0.14175015381424005 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.097, mean=0.102, max=0.104, sum=0.305 (3)", - "tab": "Robustness", - "score": 0.10170384904294616 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.226, mean=0.266, max=0.292, 
sum=0.799 (3)", - "tab": "Robustness", - "score": 0.26631844818771483 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.124, mean=0.126, max=0.127, sum=0.377 (3)", - "tab": "Fairness", - "score": 0.12565301660951664 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.41, mean=0.432, max=0.444, sum=1.297 (3)", - "tab": "Fairness", - "score": 0.4322127161835283 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.254, mean=0.259, max=0.265, sum=0.778 (3)", - "tab": "Efficiency", - "score": 0.25938733203125103 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.476, mean=0.535, max=0.583, sum=1.606 (3)", - "tab": "Efficiency", - "score": 0.5353007499999998 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.823, mean=5.267, max=5.728, sum=15.801 (3)", - "tab": "General information", - "score": 5.267 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.538, mean=4.633, max=4.715, sum=13.899 (3)", - "tab": "General information", - "score": 4.633 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1261.72, mean=1481.344, max=1608.455, sum=4444.032 (3)", - "tab": "General information", - "score": 1481.344 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.288, mean=9.101, max=11.307, sum=27.304 (3)", - "tab": "General information", - "score": 9.101333333333333 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.419, mean=0.441, max=0.476, sum=1.323 (3)", - "tab": "Bias", - "score": 0.4410100926954859 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.214, mean=0.251, max=0.3, sum=0.753 (3)", - "tab": "Bias", - "score": 0.2511387163561077 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { 
- "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.292, mean=0.354, max=0.417, sum=0.708 (2)", - "tab": "Bias", - "score": 0.3541666666666667 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.289, mean=0.325, max=0.385, sum=0.974 (3)", - "tab": "Bias", - "score": 0.3247724272114516 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.202, mean=0.234, max=0.285, sum=0.703 (3)", - "tab": "Bias", - "score": 0.23429326676087917 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.003, sum=0.007 (3)", - "tab": "Toxicity", - "score": 0.0023333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.279, - "details": { - "description": "min=0.273, mean=0.279, max=0.287, sum=0.838 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.042, mean=0.048, max=0.061, sum=0.145 (3)", - "tab": "Calibration", - "score": 0.04829561557428013 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.12, mean=0.144, max=0.157, sum=0.432 (3)", - "tab": "Robustness", - "score": 0.14398518012537756 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.186, mean=0.198, max=0.207, sum=0.593 (3)", - "tab": "Fairness", - "score": 0.19765650296002213 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.664, mean=0.735, max=0.771, sum=2.206 (3)", - "tab": "Efficiency", - "score": 0.7354030888671875 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.797, mean=0.881, max=0.969, sum=2.644 (3)", - "tab": "General information", - "score": 0.8813333333333334 - }, - "QuAC - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)", - "tab": "General information", - "score": 0.02 - }, - "QuAC - # prompt tokens": { - "description": "min=1600.292, mean=1639.784, max=1661.675, sum=4919.353 (3)", - "tab": "General information", - "score": 1639.784333333333 - }, - "QuAC - # output tokens": { - "description": "min=17.39, mean=23.531, max=27.056, sum=70.593 (3)", - "tab": "General information", - "score": 23.531000000000002 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2.0 (3)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.383, mean=0.412, max=0.431, sum=1.237 (3)", - "tab": "Bias", - "score": 0.41249828370040936 - }, - "QuAC - Representation (race)": { - "description": "min=0.303, mean=0.357, max=0.392, sum=1.072 (3)", - "tab": "Bias", - "score": 
0.35746080227329485 - }, - "QuAC - Representation (gender)": { - "description": "min=0.233, mean=0.262, max=0.276, sum=0.786 (3)", - "tab": "Bias", - "score": 0.2618392019722732 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.002, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=0.706 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.271 (1)", - "tab": "Calibration", - "score": 0.2707363482287178 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.651, mean=0.651, max=0.651, sum=0.651 (1)", - "tab": "Robustness", - "score": 0.651 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.525, mean=0.525, max=0.525, sum=0.525 (1)", - "tab": "Fairness", - "score": 0.525 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.204 (1)", - "tab": "Efficiency", - "score": 0.20370158203125027 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.496, - "details": { - "description": "min=0.496, mean=0.496, max=0.496, sum=0.496 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.275 (1)", - "tab": "Calibration", - "score": 0.27530956848832144 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.382 (1)", - "tab": "Robustness", - "score": 0.382 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.42 (1)", - "tab": "Fairness", - "score": 0.42 - }, - "OpenbookQA - Denoised inference time 
(s)": { - "description": "min=0.187, mean=0.187, max=0.187, sum=0.187 (1)", - "tab": "Efficiency", - "score": 0.1870674140625 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.19, - "details": { - "description": "min=0.176, mean=0.19, max=0.203, sum=0.57 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.082, mean=0.094, max=0.109, sum=0.282 (3)", - "tab": "Calibration", - "score": 0.09386032214108035 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.127, mean=0.149, max=0.168, sum=0.448 (3)", - "tab": "Robustness", - "score": 0.1493374108053007 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.154, mean=0.174, max=0.19, sum=0.521 (3)", - "tab": "Fairness", - "score": 0.17380224260958207 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.288, sum=0.862 (3)", - "tab": "Efficiency", - "score": 0.28723167974722846 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374, - "details": { - "description": "min=0.337, mean=0.374, max=0.416, sum=1.122 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.101, mean=0.109, max=0.12, sum=0.326 (3)", - "tab": "Robustness", - "score": 0.10871957671957677 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.294, mean=0.315, max=0.354, sum=0.945 (3)", - "tab": "Robustness", - "score": 0.31504083631376195 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.126, mean=0.132, max=0.136, sum=0.396 (3)", - "tab": "Fairness", - "score": 0.13183915343915345 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.321, mean=0.357, max=0.398, sum=1.072 (3)", - "tab": "Fairness", - "score": 0.35726921379791293 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.286, mean=0.289, max=0.293, sum=0.867 (3)", - "tab": "Efficiency", - "score": 0.28909981347656255 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.285, mean=0.288, max=0.29, sum=0.864 (3)", - "tab": "Efficiency", - "score": 0.28804701126453486 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=497.281, mean=536.614, max=583.281, sum=1609.843 (3)", - "tab": "General information", - "score": 536.6143333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1.005, max=1.013, sum=3.014 (3)", - "tab": "General information", - "score": 1.0046666666666666 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1.016, max=1.023, sum=3.047 (3)", - "tab": "General information", - "score": 1.0155038759689923 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - 
}, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077, - "details": { - "description": "min=0.03, mean=0.077, max=0.111, sum=0.459 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.073, mean=1.2, max=1.325, sum=7.2 (6)", - "tab": "Efficiency", - "score": 1.199950748558208 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=52.893, mean=63.193, max=73.206, sum=379.159 (6)", - "tab": "General information", - "score": 63.1931330472103 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.644, mean=0.659, max=0.667, sum=3.956 (6)", - "tab": "Bias", - "score": 0.6592592592592593 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.402, mean=0.44, max=0.476, sum=2.641 (6)", - "tab": "Bias", - "score": 0.44008624507065996 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.285, mean=0.304, max=0.333, sum=1.825 (6)", - "tab": "Bias", - "score": 0.30422478269658376 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.127, mean=0.173, max=0.229, sum=1.037 (6)", - "tab": "Bias", - "score": 0.17278322431241475 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 
0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.174, mean=0.229, max=0.443, sum=0.686 (3)", - "tab": "Summarization metrics", - "score": 0.22880441457511005 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.552, mean=4.664, max=4.795, sum=27.982 (6)", - "tab": "Summarization metrics", - "score": 4.663724611238682 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.008, mean=0.115, max=0.197, sum=0.346 (3)", - "tab": "Summarization metrics", - "score": 0.11522739683384077 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.482, mean=0.799, max=0.965, sum=4.793 (6)", - "tab": "Summarization metrics", - "score": 0.7988868167525552 - }, - "CNN/DailyMail - Density": { - "description": "min=9.34, mean=22.176, max=32.926, sum=133.058 (6)", - "tab": "Summarization metrics", - "score": 22.17629615230217 - }, - "CNN/DailyMail - Compression": { - "description": "min=11.915, mean=13.154, max=15.457, sum=78.926 (6)", - "tab": "Summarization metrics", - "score": 13.15437099106955 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.087, - "details": { - "description": "min=0.086, mean=0.087, max=0.09, sum=0.524 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.717, mean=0.724, max=0.732, sum=4.343 (6)", - "tab": "Efficiency", - "score": 0.7239030526061776 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.998, max=5, sum=29.988 (6)", - "tab": "General information", - "score": 4.998069498069498 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.452, max=1572.616, sum=9224.71 (6)", - "tab": "General information", - "score": 1537.4517374517375 - }, - "XSUM - # output tokens": { - "description": "min=23.498, mean=24.055, max=24.463, sum=144.328 (6)", - "tab": "General information", - "score": 24.054697554697555 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.447, mean=0.461, max=0.481, sum=2.765 (6)", - "tab": "Bias", - "score": 0.46086088123125163 - }, - "XSUM - Representation (race)": { - 
"description": "min=0.449, mean=0.498, max=0.579, sum=2.99 (6)", - "tab": "Bias", - "score": 0.4982964658021866 - }, - "XSUM - Representation (gender)": { - "description": "min=0.167, mean=0.186, max=0.198, sum=1.115 (6)", - "tab": "Bias", - "score": 0.18582940251572325 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.17, mean=-0.159, max=-0.142, sum=-0.477 (3)", - "tab": "Summarization metrics", - "score": -0.1589340320425144 - }, - "XSUM - QAFactEval": { - "description": "min=3.197, mean=3.223, max=3.258, sum=19.336 (6)", - "tab": "Summarization metrics", - "score": 3.2227135293221596 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.364, mean=0.367, max=0.371, sum=1.102 (3)", - "tab": "Summarization metrics", - "score": 0.36729036225155814 - }, - "XSUM - Coverage": { - "description": "min=0.84, mean=0.847, max=0.855, sum=5.083 (6)", - "tab": "Summarization metrics", - "score": 0.8472154184001573 - }, - "XSUM - Density": { - "description": "min=4.485, mean=4.754, max=4.928, sum=28.525 (6)", - "tab": "Summarization metrics", - "score": 4.7541975208526 - }, - "XSUM - Compression": { - "description": "min=19.527, mean=19.748, max=20.169, sum=118.491 (6)", - "tab": "Summarization metrics", - "score": 19.748450478665102 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.935, - "details": { - "description": "min=0.917, mean=0.935, max=0.947, sum=2.805 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.335, mean=0.36, max=0.394, sum=1.08 (3)", - "tab": "Calibration", - "score": 0.360155737743892 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.878, mean=0.889, max=0.897, sum=2.666 (3)", - "tab": "Robustness", - "score": 0.8886666666666666 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.896, mean=0.918, max=0.936, sum=2.753 (3)", - "tab": "Fairness", - "score": 0.9176666666666667 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.404, mean=0.452, max=0.489, sum=1.355 (3)", - "tab": "Efficiency", - "score": 0.45160390852864607 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.903, mean=4.229, max=4.983, sum=12.688 (3)", - "tab": "General information", - "score": 4.229333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1283.038, 
mean=1562.808, max=1784.2, sum=4688.425 (3)", - "tab": "General information", - "score": 1562.8083333333334 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1.003, max=1.01, sum=3.01 (3)", - "tab": "General information", - "score": 1.0033333333333332 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.504, - "details": { - "description": "min=0, mean=0.504, max=1, sum=27.205 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.176, mean=0.459, max=0.641, sum=24.77 (54)", - "tab": "Calibration", - "score": 0.45870054566126006 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.136, max=0.736, sum=7.362 (54)", - "tab": "Robustness", - "score": 0.13632694985889793 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.489, max=1, sum=26.387 (54)", - "tab": "Fairness", - "score": 0.48864261081744575 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.262, mean=0.321, max=0.405, sum=17.316 (54)", - "tab": "Efficiency", - "score": 0.32067323239104795 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - 
"tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.125, mean=0.52, max=0.975, sum=17.15 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.151, mean=0.304, max=0.849, sum=10.027 (33)", - "tab": "Calibration", - "score": 0.3038351531350353 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.385, max=0.975, sum=12.7 (33)", - "tab": "Robustness", - "score": 0.3848484848484848 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.125, mean=0.5, max=0.975, sum=16.5 (33)", - "tab": "Fairness", - "score": 0.5 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.244, mean=0.358, max=0.532, sum=11.817 (33)", - "tab": "Efficiency", - "score": 0.3580963386304451 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.557, max=5, sum=150.375 (33)", - "tab": "General information", - "score": 4.556818181818182 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=814.446, max=1777.025, sum=26876.725 (33)", - "tab": "General information", - "score": 814.446212121212 - }, - "RAFT - # output tokens": { - "description": "min=0.225, mean=2.965, max=6.15, sum=97.85 (33)", - "tab": "General information", - "score": 2.965151515151515 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json b/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json deleted file mode 100644 index 43f986e70..000000000 --- a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-medium-v20221108-6.1B/1770834891.1472661", - "retrieved_timestamp": 
"1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere medium v20221108 6.1B", - "id": "cohere/Cohere-medium-v20221108-6.1B", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.312, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6010395609917657 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.26965587249235745 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.339964744191663 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5558769690348637 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6328714495381162 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.506578947368421 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.254, - "details": { - "description": "min=0.18, mean=0.254, max=0.32, sum=3.806 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.055, mean=0.113, max=0.167, sum=1.691 (15)", - "tab": "Calibration", - "score": 0.11272299343238619 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.15, mean=0.207, max=0.25, sum=3.1 (15)", - "tab": "Robustness", - "score": 0.20667836257309943 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.14, mean=0.22, max=0.3, sum=3.299 (15)", - "tab": "Fairness", - "score": 0.21994152046783624 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, 
sum=7218.903 (15)", - "tab": "General information", - "score": 481.2602105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.693, mean=0.7, max=0.704, sum=2.1 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.088, mean=0.095, max=0.105, sum=0.284 (3)", - "tab": "Calibration", - "score": 0.09459272512018041 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.508, mean=0.54, max=0.568, sum=1.62 (3)", - "tab": "Robustness", - "score": 0.54 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.626, mean=0.642, max=0.652, sum=1.925 (3)", - "tab": "Fairness", - "score": 0.6416666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "details": { - "description": "min=0.57, mean=0.61, max=0.642, sum=1.831 (3)", - "tab": "Accuracy", - "NarrativeQA 
- ECE (10-bin)": { - "description": "min=0.027, mean=0.028, max=0.03, sum=0.085 (3)", - "tab": "Calibration", - "score": 0.02834267942109429 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.265, mean=0.296, max=0.321, sum=0.888 (3)", - "tab": "Robustness", - "score": 0.2960125312478054 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.441, mean=0.497, max=0.537, sum=1.491 (3)", - "tab": "Fairness", - "score": 0.49703931741598933 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.958, mean=1.562, max=1.997, sum=4.687 (3)", - "tab": "General information", - "score": 1.5624413145539906 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.997, mean=1634.99, max=1693.155, sum=4904.969 (3)", - "tab": "General information", - "score": 1634.9896713615024 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.544, mean=7.144, max=9.065, sum=21.431 (3)", - "tab": "General information", - "score": 7.143661971830986 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.417, mean=0.441, max=0.469, sum=1.323 (3)", - "tab": "Bias", - "score": 0.44097222222222215 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.15, mean=0.181, max=0.213, sum=0.543 (3)", - "tab": "Bias", - "score": 0.18104985015382555 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.011, max=0.014, sum=0.034 (3)", - "tab": "Toxicity", - "score": 0.011267605633802818 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517, - "details": { - "description": "min=0.506, mean=0.517, max=0.536, sum=1.551 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.006, mean=0.015, max=0.02, sum=0.044 (3)", - "tab": "Calibration", - "score": 0.01475928497137971 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.181, mean=0.233, max=0.27, sum=0.698 (3)", - "tab": "Calibration", - "score": 0.2327617365925914 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.099, mean=0.105, max=0.11, sum=0.314 (3)", - "tab": "Robustness", - "score": 0.10457862657700777 - }, - "NaturalQuestions 
(open-book) - F1 (Robustness)": { - "description": "min=0.164, mean=0.222, max=0.282, sum=0.665 (3)", - "tab": "Robustness", - "score": 0.22177043436006846 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.142, mean=0.149, max=0.157, sum=0.447 (3)", - "tab": "Fairness", - "score": 0.14913779301489424 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.431, mean=0.45, max=0.473, sum=1.349 (3)", - "tab": "Fairness", - "score": 0.44971949324423194 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.631, mean=6.745, max=6.831, sum=20.236 (3)", - "tab": "General information", - "score": 6.745333333333334 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.538, mean=4.633, max=4.715, sum=13.899 (3)", - "tab": "General information", - "score": 4.633 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1261.72, mean=1481.344, max=1608.455, sum=4444.032 (3)", - "tab": "General information", - "score": 1481.344 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.485, mean=8.419, max=9.746, sum=25.256 (3)", - "tab": "General information", - "score": 8.418666666666667 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.357, mean=0.45, max=0.5, sum=1.349 (3)", - "tab": "Bias", - "score": 0.44969278033794163 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.382, mean=0.451, max=0.504, sum=1.353 (3)", - "tab": "Bias", - "score": 0.4511619362542481 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.173, mean=0.314, max=0.386, sum=0.942 (3)", - "tab": "Bias", - "score": 
0.3140619884317363 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.233, mean=0.308, max=0.35, sum=0.923 (3)", - "tab": "Bias", - "score": 0.30777777777777776 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.421, mean=0.452, max=0.476, sum=1.356 (3)", - "tab": "Bias", - "score": 0.4519283176992704 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.056, mean=0.061, max=0.069, sum=0.184 (3)", - "tab": "Bias", - "score": 0.06120328473269649 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.003, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314, - "details": { - "description": "min=0.297, mean=0.314, max=0.328, sum=0.942 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.029, mean=0.041, max=0.062, sum=0.124 (3)", - "tab": "Calibration", - "score": 0.04129669890931466 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.127, mean=0.152, max=0.171, sum=0.456 (3)", - "tab": "Robustness", - "score": 0.15189850694469184 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.208, mean=0.229, max=0.244, sum=0.688 (3)", - "tab": "Fairness", - "score": 0.22939607207059778 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.797, mean=0.881, max=0.969, sum=2.644 (3)", - "tab": "General information", - "score": 0.8813333333333334 - }, - "QuAC - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)", - "tab": "General information", - "score": 0.02 - }, - "QuAC - # prompt tokens": { - "description": "min=1600.292, mean=1639.784, max=1661.675, sum=4919.353 (3)", - "tab": "General information", - "score": 1639.784333333333 - }, - "QuAC - # output tokens": { - "description": "min=18.756, mean=22.84, max=26.573, sum=68.519 (3)", - "tab": "General information", - "score": 22.83966666666667 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.619, mean=0.651, max=0.667, sum=1.952 (3)", - "tab": "Bias", - "score": 0.6507936507936508 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.436, mean=0.441, max=0.444, sum=1.322 (3)", - "tab": "Bias", - "score": 0.4407764298624513 - }, - "QuAC - Representation (race)": { - "description": "min=0.345, mean=0.353, max=0.359, sum=1.06 (3)", - 
"tab": "Bias", - "score": 0.35330965547213355 - }, - "QuAC - Representation (gender)": { - "description": "min=0.248, mean=0.251, max=0.255, sum=0.753 (3)", - "tab": "Bias", - "score": 0.2510004319407244 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 (1)", - "tab": "Calibration", - "score": 0.2814688190554964 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.687, mean=0.687, max=0.687, sum=0.687 (1)", - "tab": "Robustness", - "score": 0.687 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=0.567 (1)", - "tab": "Fairness", - "score": 0.567 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538, - "details": { - "description": "min=0.538, mean=0.538, max=0.538, sum=0.538 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.23, mean=0.23, max=0.23, sum=0.23 (1)", - "tab": "Calibration", - "score": 0.2303402231123461 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.414 (1)", - "tab": "Robustness", - "score": 0.414 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.44 (1)", - "tab": "Fairness", - "score": 0.44 - }, - "OpenbookQA - Denoised inference time (s)": { - 
"description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.215, - "details": { - "description": "min=0.19, mean=0.215, max=0.237, sum=0.645 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.057, mean=0.08, max=0.106, sum=0.24 (3)", - "tab": "Calibration", - "score": 0.07993899696218487 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.156, mean=0.17, max=0.19, sum=0.511 (3)", - "tab": "Robustness", - "score": 0.17023445463812437 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.156, mean=0.182, max=0.205, sum=0.546 (3)", - "tab": "Fairness", - "score": 0.18195718654434248 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.373, - "details": { - "description": "min=0.329, mean=0.373, max=0.4, sum=1.118 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.11, mean=0.13, max=0.144, sum=0.389 (3)", - "tab": "Robustness", - "score": 0.12963544973544971 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.265, mean=0.314, max=0.339, sum=0.942 (3)", - "tab": "Robustness", - "score": 0.3140445596258007 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.123, mean=0.145, max=0.162, sum=0.436 (3)", - "tab": "Fairness", - "score": 0.1454550264550264 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.311, mean=0.353, max=0.384, sum=1.058 (3)", - "tab": "Fairness", - "score": 0.35251421077315565 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=497.281, mean=536.614, max=583.281, sum=1609.843 (3)", - "tab": "General information", - "score": 536.6143333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1.005, max=1.008, sum=3.015 (3)", - "tab": "General information", - "score": 1.005 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - 
"score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.121, - "details": { - "description": "min=0.116, mean=0.121, max=0.13, sum=0.728 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=60.474, mean=68.601, max=77.918, sum=411.605 (6)", - "tab": "General information", - "score": 68.60085836909872 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.604, mean=0.612, max=0.618, sum=3.671 (6)", - "tab": "Bias", - "score": 0.6118203882651768 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.401, mean=0.408, max=0.419, sum=2.449 (6)", - "tab": "Bias", - "score": 0.408087030039703 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.257, mean=0.287, max=0.318, sum=1.72 (6)", - "tab": "Bias", - "score": 0.2867291116025263 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.117, mean=0.141, max=0.159, sum=0.844 (6)", - "tab": "Bias", - "score": 0.14067727789435583 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.231, mean=0.359, max=0.443, sum=1.077 (3)", - "tab": "Summarization 
metrics", - "score": 0.35895859214347764 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.195, mean=0.218, max=0.246, sum=0.654 (3)", - "tab": "Summarization metrics", - "score": 0.21796490870344257 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.801, mean=0.899, max=0.957, sum=5.391 (6)", - "tab": "Summarization metrics", - "score": 0.8985701854042452 - }, - "CNN/DailyMail - Density": { - "description": "min=16.696, mean=24.344, max=33.085, sum=146.063 (6)", - "tab": "Summarization metrics", - "score": 24.343863209587038 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.239, mean=11.42, max=13.421, sum=68.523 (6)", - "tab": "Summarization metrics", - "score": 11.420494637224708 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.099, - "details": { - "description": "min=0.095, mean=0.099, max=0.106, sum=0.596 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.998, max=5, sum=29.988 (6)", - "tab": "General information", - "score": 4.998069498069498 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.452, max=1572.616, sum=9224.71 (6)", - "tab": "General information", - "score": 1537.4517374517375 - }, - "XSUM - # output tokens": { - "description": "min=23.5, mean=23.626, max=23.749, sum=141.757 (6)", - "tab": "General information", - "score": 23.626126126126128 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.424, mean=0.436, max=0.453, sum=2.616 (6)", - "tab": "Bias", - "score": 0.43605987410335234 - }, - "XSUM - Representation (race)": { - "description": "min=0.373, mean=0.393, max=0.404, sum=2.359 (6)", - "tab": "Bias", - "score": 0.393188854489164 - }, - "XSUM - Representation (gender)": { - "description": "min=0.181, 
mean=0.194, max=0.206, sum=1.165 (6)", - "tab": "Bias", - "score": 0.194128141174599 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.192, mean=-0.171, max=-0.149, sum=-0.513 (3)", - "tab": "Summarization metrics", - "score": -0.17113255308913036 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.382, mean=0.384, max=0.388, sum=1.152 (3)", - "tab": "Summarization metrics", - "score": 0.38412741233326225 - }, - "XSUM - Coverage": { - "description": "min=0.842, mean=0.842, max=0.842, sum=5.051 (6)", - "tab": "Summarization metrics", - "score": 0.8418943137133965 - }, - "XSUM - Density": { - "description": "min=3.715, mean=3.815, max=3.914, sum=22.889 (6)", - "tab": "Summarization metrics", - "score": 3.8148335440941747 - }, - "XSUM - Compression": { - "description": "min=19.45, mean=19.703, max=19.907, sum=118.221 (6)", - "tab": "Summarization metrics", - "score": 19.7034371773279 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.935, - "details": { - "description": "min=0.917, mean=0.935, max=0.947, sum=2.804 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.335, mean=0.36, max=0.394, sum=1.079 (3)", - "tab": "Calibration", - "score": 0.3598306140598746 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.878, mean=0.888, max=0.896, sum=2.665 (3)", - "tab": "Robustness", - "score": 0.8883333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.896, mean=0.917, max=0.936, sum=2.752 (3)", - "tab": "Fairness", - "score": 0.9173333333333334 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.903, mean=4.229, max=4.983, sum=12.688 (3)", - "tab": "General information", - "score": 4.229333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1283.038, mean=1562.808, max=1784.2, sum=4688.425 (3)", - "tab": "General information", - "score": 1562.8083333333334 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1.003, max=1.01, sum=3.01 (3)", - "tab": "General 
information", - "score": 1.0033333333333332 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0, mean=0.5, max=1, sum=27.019 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.265, mean=0.487, max=0.736, sum=26.317 (54)", - "tab": "Calibration", - "score": 0.4873543575629644 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.353, max=0.931, sum=19.089 (54)", - "tab": "Robustness", - "score": 0.35349935695509527 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.493, max=1, sum=26.609 (54)", - "tab": "Fairness", - "score": 0.49275536816045606 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": 
"Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591, - "details": { - "description": "min=0.1, mean=0.591, max=0.975, sum=19.5 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.11, mean=0.253, max=0.545, sum=8.337 (33)", - "tab": "Calibration", - "score": 0.25263340417043 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.502, max=0.975, sum=16.55 (33)", - "tab": "Robustness", - "score": 0.5015151515151515 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.1, mean=0.571, max=0.975, sum=18.85 (33)", - "tab": "Fairness", - "score": 0.5712121212121212 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.557, max=5, sum=150.375 (33)", - "tab": "General information", - "score": 4.556818181818182 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=814.446, max=1777.025, sum=26876.725 (33)", - "tab": "General information", - "score": 814.446212121212 - }, - "RAFT - # output tokens": { - "description": "min=0.575, mean=3.038, max=6.375, sum=100.25 (33)", - "tab": "General information", - "score": 3.0378787878787885 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json b/data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json deleted file mode 100644 index adaaa9403..000000000 --- a/data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-small-v20220720-410M/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere small v20220720 410M", - "id": 
"cohere/Cohere-small-v20220720-410M", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.109, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6085000742339626 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.1469566826886926 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.15386697669576083 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5343333333333333 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.45155563090416306 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.412334270667604 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.29156223893065997 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.264, - "details": { - "description": "min=0.18, mean=0.264, max=0.42, sum=3.963 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.049, mean=0.136, max=0.202, sum=2.04 (15)", - "tab": "Calibration", - "score": 0.13602108170852936 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.13, mean=0.226, max=0.42, sum=3.397 (15)", - "tab": "Robustness", - "score": 0.22644444444444442 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.1, mean=0.222, max=0.4, sum=3.334 (15)", - "tab": "Fairness", - "score": 0.22225730994152046 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.265, mean=0.284, max=0.312, sum=4.267 (15)", - "tab": "Efficiency", - "score": 0.284456830180921 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=7218.903 (15)", - "tab": "General information", - "score": 481.2602105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU 
- # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.457, - "details": { - "description": "min=0.447, mean=0.457, max=0.464, sum=1.372 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.072, mean=0.095, max=0.124, sum=0.285 (3)", - "tab": "Calibration", - "score": 0.09496766959019069 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.352, mean=0.361, max=0.378, sum=1.083 (3)", - "tab": "Robustness", - "score": 0.361 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.346, mean=0.374, max=0.396, sum=1.121 (3)", - "tab": "Fairness", - "score": 0.37366666666666665 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.319, mean=0.367, max=0.436, sum=1.101 (3)", - "tab": "Efficiency", - "score": 0.36694511328125 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1.001, max=1.004, sum=3.004 (3)", - "tab": "General information", - "score": 1.0013333333333334 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.294, - "details": { - "description": "min=0.281, mean=0.294, max=0.309, sum=0.881 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.029, mean=0.031, max=0.033, sum=0.093 (3)", - "tab": "Calibration", - "score": 0.031094283389380417 - }, - "NarrativeQA - F1 
(Robustness)": { - "description": "min=0.076, mean=0.078, max=0.081, sum=0.235 (3)", - "tab": "Robustness", - "score": 0.07821074014295328 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.167, mean=0.179, max=0.197, sum=0.538 (3)", - "tab": "Fairness", - "score": 0.17918507973514153 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.544, mean=0.56, max=0.583, sum=1.681 (3)", - "tab": "Efficiency", - "score": 0.5603894916373239 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.958, mean=1.562, max=1.997, sum=4.687 (3)", - "tab": "General information", - "score": 1.5624413145539906 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.997, mean=1634.99, max=1693.155, sum=4904.969 (3)", - "tab": "General information", - "score": 1634.9896713615024 - }, - "NarrativeQA - # output tokens": { - "description": "min=8.149, mean=11.007, max=15.597, sum=33.02 (3)", - "tab": "General information", - "score": 11.006572769953053 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.411, mean=0.418, max=0.429, sum=1.255 (3)", - "tab": "Bias", - "score": 0.4184126984126984 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.556, max=0.667, sum=1.667 (3)", - "tab": "Bias", - "score": 0.5555555555555556 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.186, mean=0.202, max=0.217, sum=0.606 (3)", - "tab": "Bias", - "score": 0.20205501924662395 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.025, mean=0.027, max=0.031, sum=0.082 (3)", - "tab": "Toxicity", - "score": 0.027230046948356807 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.309, - "details": { - "description": "min=0.291, mean=0.309, max=0.334, sum=0.928 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.02, mean=0.023, max=0.027, sum=0.07 (3)", - "tab": "Calibration", - "score": 0.023328620693919305 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.18, mean=0.198, max=0.221, sum=0.594 (3)", - "tab": "Calibration", - "score": 0.198062019189297 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.024, mean=0.025, max=0.027, sum=0.075 (3)", - "tab": "Robustness", - "score": 0.025009279663584086 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.066, mean=0.074, max=0.08, 
sum=0.222 (3)", - "tab": "Robustness", - "score": 0.07408175909872887 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.052, mean=0.055, max=0.062, sum=0.166 (3)", - "tab": "Fairness", - "score": 0.055406816944260924 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.198, mean=0.219, max=0.246, sum=0.657 (3)", - "tab": "Fairness", - "score": 0.21887630944724534 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.246, mean=0.251, max=0.259, sum=0.753 (3)", - "tab": "Efficiency", - "score": 0.2509381953124994 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.552, mean=0.605, max=0.643, sum=1.815 (3)", - "tab": "Efficiency", - "score": 0.6049964999999996 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.325, mean=5.149, max=6.46, sum=15.446 (3)", - "tab": "General information", - "score": 5.148666666666667 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.538, mean=4.633, max=4.715, sum=13.899 (3)", - "tab": "General information", - "score": 4.633 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1261.72, mean=1481.344, max=1608.455, sum=4444.032 (3)", - "tab": "General information", - "score": 1481.344 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=20.452, mean=22.835, max=25.41, sum=68.505 (3)", - "tab": "General information", - "score": 22.834999999999997 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.238, mean=0.415, max=0.539, sum=1.244 (3)", - "tab": "Bias", - "score": 0.41471861471861476 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.167, mean=0.234, max=0.286, sum=0.702 (3)", - "tab": "Bias", - "score": 0.2341269841269841 - }, - "NaturalQuestions (open-book) - 
Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.48, mean=0.485, max=0.494, sum=1.455 (3)", - "tab": "Bias", - "score": 0.48499285130718955 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.382, mean=0.435, max=0.467, sum=1.306 (3)", - "tab": "Bias", - "score": 0.43543086336382425 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.234, mean=0.265, max=0.3, sum=0.796 (3)", - "tab": "Bias", - "score": 0.2653339127915399 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.003, sum=0.008 (3)", - "tab": "Toxicity", - "score": 0.0026666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219, - "details": { - "description": "min=0.208, mean=0.219, max=0.238, sum=0.656 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.03, mean=0.036, max=0.042, sum=0.108 (3)", - "tab": "Calibration", - "score": 0.035862172954873824 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.094, mean=0.098, max=0.101, sum=0.293 (3)", - "tab": "Robustness", - "score": 0.09766108203425072 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.135, mean=0.144, max=0.162, sum=0.433 (3)", - "tab": "Fairness", - "score": 0.14446776305873513 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.611, mean=0.619, max=0.625, sum=1.856 (3)", - "tab": "Efficiency", - "score": 0.6185995332031252 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.797, mean=0.881, max=0.969, sum=2.644 (3)", - "tab": "General information", - "score": 0.8813333333333334 - }, - "QuAC - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)", - "tab": "General information", - "score": 0.02 - }, - "QuAC - # prompt tokens": { - "description": "min=1600.292, mean=1639.784, max=1661.675, sum=4919.353 (3)", - "tab": "General information", - "score": 1639.784333333333 - }, - "QuAC - # output tokens": { - "description": "min=18.807, mean=20.639, max=21.99, sum=61.916 (3)", - "tab": "General information", - "score": 20.638666666666666 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.447, mean=0.458, max=0.468, sum=1.375 (3)", - "tab": "Bias", - "score": 0.45823351891324243 - }, - "QuAC - Representation (race)": { - "description": 
"min=0.329, mean=0.341, max=0.364, sum=1.022 (3)", - "tab": "Bias", - "score": 0.34075560523096593 - }, - "QuAC - Representation (gender)": { - "description": "min=0.277, mean=0.285, max=0.299, sum=0.854 (3)", - "tab": "Bias", - "score": 0.2847879707506289 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.003, max=0.004, sum=0.008 (3)", - "tab": "Toxicity", - "score": 0.0026666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.483, - "details": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.483 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.083, mean=0.083, max=0.083, sum=0.083 (1)", - "tab": "Calibration", - "score": 0.08312318484699062 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.405 (1)", - "tab": "Robustness", - "score": 0.405 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.308 (1)", - "tab": "Fairness", - "score": 0.308 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.223, mean=0.223, max=0.223, sum=0.223 (1)", - "tab": "Efficiency", - "score": 0.22341269531249972 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.348, - "details": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.348 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.379 (1)", - "tab": "Calibration", - "score": 0.37852917669250147 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.238 (1)", - "tab": "Robustness", - "score": 0.238 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.28 (1)", - 
"tab": "Fairness", - "score": 0.28 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.214, mean=0.214, max=0.214, sum=0.214 (1)", - "tab": "Efficiency", - "score": 0.2136278906249995 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.217, - "details": { - "description": "min=0.202, mean=0.217, max=0.226, sum=0.65 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.059, mean=0.076, max=0.098, sum=0.229 (3)", - "tab": "Calibration", - "score": 0.07625390965133329 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.2, mean=0.204, max=0.211, sum=0.612 (3)", - "tab": "Robustness", - "score": 0.2038735983690112 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.194, mean=0.203, max=0.214, sum=0.609 (3)", - "tab": "Fairness", - "score": 0.20285423037716613 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.287, mean=0.289, max=0.295, sum=0.868 (3)", - "tab": "Efficiency", - "score": 0.2894203160837155 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.304, - "details": { - "description": "min=0.258, mean=0.304, max=0.338, sum=0.911 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.22, mean=0.252, max=0.287, sum=0.757 (3)", - "tab": "Robustness", - "score": 0.2521940956196658 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.228, mean=0.28, max=0.324, sum=0.84 (3)", - "tab": "Fairness", - "score": 0.2798487582673837 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.282, mean=0.291, max=0.303, sum=0.872 (3)", - "tab": "Efficiency", - "score": 0.29054985767926356 - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1.031, max=1.093, sum=3.093 (3)", - "tab": "General information", - "score": 1.0310077519379846 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - 
Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.063, - "details": { - "description": "min=0.031, mean=0.063, max=0.087, sum=0.377 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=0.781, mean=0.954, max=1.052, sum=5.724 (6)", - "tab": "Efficiency", - "score": 0.9539734693535404 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=49.71, mean=78.352, max=93.899, sum=470.112 (6)", - "tab": "General information", - "score": 78.3519313304721 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.625, mean=0.648, max=0.667, sum=3.885 (6)", - "tab": "Bias", - "score": 0.6475615887380594 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.405, mean=0.42, max=0.449, sum=2.522 (6)", - "tab": "Bias", - "score": 0.4203329386778049 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.099, mean=0.145, max=0.201, sum=0.868 (6)", - "tab": "Bias", - "score": 0.14468337947687135 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.163, mean=0.182, max=0.21, sum=1.09 (6)", - "tab": "Bias", - "score": 0.18171396544569016 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.077, mean=0.054, max=0.168, sum=0.161 (3)", - "tab": "Summarization metrics", - "score": 0.053643734154981075 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=0.051, mean=2.638, max=4.057, sum=15.831 (6)", - "tab": "Summarization metrics", - "score": 
2.6384596103973283 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=-0.069, mean=0.026, max=0.075, sum=0.077 (3)", - "tab": "Summarization metrics", - "score": 0.025643326292308758 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.532, mean=0.744, max=0.913, sum=4.465 (6)", - "tab": "Summarization metrics", - "score": 0.7441391663831297 - }, - "CNN/DailyMail - Density": { - "description": "min=11.632, mean=25.238, max=33.415, sum=151.427 (6)", - "tab": "Summarization metrics", - "score": 25.237906513316556 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.053, mean=13.243, max=20.787, sum=79.46 (6)", - "tab": "Summarization metrics", - "score": 13.243377373187593 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.033, - "details": { - "description": "min=0.031, mean=0.033, max=0.037, sum=0.199 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.637, mean=0.642, max=0.649, sum=3.85 (6)", - "tab": "Efficiency", - "score": 0.6416181225868728 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.998, max=5, sum=29.988 (6)", - "tab": "General information", - "score": 4.998069498069498 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.452, max=1572.616, sum=9224.71 (6)", - "tab": "General information", - "score": 1537.4517374517375 - }, - "XSUM - # output tokens": { - "description": "min=25.859, mean=27.394, max=28.226, sum=164.363 (6)", - "tab": "General information", - "score": 27.393822393822393 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.399, mean=0.43, max=0.493, sum=2.58 (6)", - "tab": "Bias", - "score": 0.43004930254930257 - }, - "XSUM - Representation (race)": { - "description": "min=0.542, mean=0.556, max=0.583, sum=3.333 (6)", - "tab": "Bias", - "score": 0.5555555555555556 - }, - "XSUM - Representation (gender)": { - "description": "min=0.224, mean=0.246, max=0.283, sum=1.474 (6)", - "tab": "Bias", - "score": 0.2457025240044108 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, 
max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.0006435006435006435 - }, - "XSUM - SummaC": { - "description": "min=0.0, mean=0.028, max=0.073, sum=0.085 (3)", - "tab": "Summarization metrics", - "score": 0.02834827232857105 - }, - "XSUM - QAFactEval": { - "description": "min=2.873, mean=3.094, max=3.373, sum=18.563 (6)", - "tab": "Summarization metrics", - "score": 3.0938511325795113 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.173, mean=0.195, max=0.221, sum=0.585 (3)", - "tab": "Summarization metrics", - "score": 0.1951040609680371 - }, - "XSUM - Coverage": { - "description": "min=0.853, mean=0.863, max=0.87, sum=5.178 (6)", - "tab": "Summarization metrics", - "score": 0.8630576414302875 - }, - "XSUM - Density": { - "description": "min=9.489, mean=10.557, max=12.063, sum=63.341 (6)", - "tab": "Summarization metrics", - "score": 10.556911526268395 - }, - "XSUM - Compression": { - "description": "min=16.738, mean=17.551, max=18.157, sum=105.306 (6)", - "tab": "Summarization metrics", - "score": 17.55096225657148 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.53, mean=0.578, max=0.618, sum=1.735 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.085, mean=0.134, max=0.174, sum=0.401 (3)", - "tab": "Calibration", - "score": 0.13354341899719424 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.447, mean=0.473, max=0.498, sum=1.418 (3)", - "tab": "Robustness", - "score": 0.4726666666666666 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.49, mean=0.518, max=0.54, sum=1.554 (3)", - "tab": "Fairness", - "score": 0.518 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.414, mean=0.458, max=0.52, sum=1.373 (3)", - "tab": "Efficiency", - "score": 0.45773176757812467 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.846, mean=4.93, max=4.98, sum=14.79 (3)", - "tab": "General information", - "score": 4.930000000000001 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1161.854, mean=1398.654, max=1747.025, sum=4195.961 (3)", - "tab": "General information", - "score": 1398.6536666666668 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - 
"tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.501, - "details": { - "description": "min=0, mean=0.501, max=1, sum=27.062 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.201, mean=0.486, max=0.8, sum=26.269 (54)", - "tab": "Calibration", - "score": 0.4864679961449666 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.434, max=1, sum=23.451 (54)", - "tab": "Robustness", - "score": 0.4342847473494527 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.495, max=1, sum=26.744 (54)", - "tab": "Fairness", - "score": 0.49526155082406725 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.264, mean=0.329, max=0.439, sum=17.76 (54)", - "tab": "Efficiency", - "score": 0.32889709084919744 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.492, - "details": { - "description": "min=0, mean=0.492, max=0.975, sum=16.225 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.084, mean=0.234, max=0.631, sum=7.714 (33)", - "tab": "Calibration", - "score": 0.23374335739699753 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.403, max=0.975, sum=13.3 (33)", - "tab": "Robustness", - "score": 0.40303030303030307 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.452, max=0.975, sum=14.9 (33)", - "tab": "Fairness", - "score": 0.4515151515151515 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.256, mean=0.36, max=0.547, sum=11.878 (33)", - "tab": "Efficiency", - "score": 0.3599495087594697 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.557, max=5, sum=150.375 (33)", - "tab": "General information", - "score": 4.556818181818182 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=814.446, max=1777.025, sum=26876.725 (33)", - "tab": "General information", - "score": 814.446212121212 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=3.239, max=5.575, sum=106.9 (33)", - "tab": "General information", - "score": 3.2393939393939393 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json b/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json deleted file mode 100644 index 80b637746..000000000 --- a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20220609-52.4B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere xlarge v20220609 52.4B", - "id": "cohere/Cohere-xlarge-v20220609-52.4B", - 
"developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5427202179052317 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5061059259613209 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.5496737226436893 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.1992872807017544 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5983741692925366 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5744286577619911 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.546345029239766 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.353, - "details": { - "description": "min=0.228, mean=0.353, max=0.56, sum=5.296 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.089, mean=0.149, max=0.246, sum=2.242 (15)", - "tab": "Calibration", - "score": 0.14945785718149934 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.158, mean=0.29, max=0.51, sum=4.349 (15)", - "tab": "Robustness", - "score": 0.28992982456140354 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.158, mean=0.315, max=0.53, sum=4.729 (15)", - "tab": "Fairness", - "score": 0.31526315789473686 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.47, mean=0.489, max=0.506, sum=7.328 (15)", - "tab": "Efficiency", - "score": 0.4885340888157895 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=7218.903 (15)", - "tab": "General information", - "score": 481.2602105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, 
mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.702, mean=0.718, max=0.74, sum=2.153 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.037, mean=0.04, max=0.043, sum=0.119 (3)", - "tab": "Calibration", - "score": 0.039674216829776156 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.601, mean=0.614, max=0.622, sum=1.842 (3)", - "tab": "Robustness", - "score": 0.614 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.657, mean=0.667, max=0.681, sum=2 (3)", - "tab": "Fairness", - "score": 0.6666666666666666 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.519, mean=0.598, max=0.705, sum=1.795 (3)", - "tab": "Efficiency", - "score": 0.5984045305989586 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1.001, max=1.004, sum=3.004 (3)", - "tab": "General information", - "score": 1.0013333333333334 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.593, mean=0.65, max=0.688, sum=1.95 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.048, mean=0.062, max=0.079, sum=0.185 (3)", - "tab": "Calibration", - "score": 0.061654179655226814 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.331, 
mean=0.383, max=0.42, sum=1.148 (3)", - "tab": "Robustness", - "score": 0.38251983624053415 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.481, mean=0.548, max=0.591, sum=1.644 (3)", - "tab": "Fairness", - "score": 0.5478470147843514 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=1.025, mean=1.062, max=1.132, sum=3.185 (3)", - "tab": "Efficiency", - "score": 1.061820745305164 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.958, mean=1.562, max=1.997, sum=4.687 (3)", - "tab": "General information", - "score": 1.5624413145539906 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.997, mean=1634.99, max=1693.155, sum=4904.969 (3)", - "tab": "General information", - "score": 1634.9896713615024 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.794, mean=7.077, max=9.031, sum=21.231 (3)", - "tab": "General information", - "score": 7.07699530516432 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.396, mean=0.454, max=0.5, sum=1.362 (3)", - "tab": "Bias", - "score": 0.4541666666666666 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.556, max=0.667, sum=1.667 (3)", - "tab": "Bias", - "score": 0.5555555555555557 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.204, mean=0.208, max=0.215, sum=0.624 (3)", - "tab": "Bias", - "score": 0.20801619481196945 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.021, max=0.028, sum=0.062 (3)", - "tab": "Toxicity", - "score": 0.020657276995305163 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595, - "details": { - "description": "min=0.576, mean=0.595, max=0.607, sum=1.785 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.061, mean=0.068, max=0.073, sum=0.203 (3)", - "tab": "Calibration", - "score": 0.06770990173751885 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.075, mean=0.085, max=0.099, sum=0.254 (3)", - "tab": "Calibration", - "score": 0.08482055822987211 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.233, mean=0.238, max=0.241, sum=0.713 (3)", - "tab": "Robustness", - "score": 0.23753663022529162 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.411, mean=0.471, max=0.518, sum=1.414 (3)", - "tab": "Robustness", - "score": 0.4713418135089589 - }, - "NaturalQuestions (closed-book) - 
F1 (Fairness)": { - "description": "min=0.248, mean=0.255, max=0.259, sum=0.764 (3)", - "tab": "Fairness", - "score": 0.25466316487855734 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.521, mean=0.535, max=0.546, sum=1.604 (3)", - "tab": "Fairness", - "score": 0.5348225692810691 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.546, mean=0.565, max=0.586, sum=1.694 (3)", - "tab": "Efficiency", - "score": 0.5647122317708332 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.95, mean=1.085, max=1.249, sum=3.256 (3)", - "tab": "Efficiency", - "score": 1.0851867500000003 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.31, mean=5.844, max=6.407, sum=17.531 (3)", - "tab": "General information", - "score": 5.843666666666667 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.538, mean=4.633, max=4.715, sum=13.899 (3)", - "tab": "General information", - "score": 4.633 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1261.72, mean=1481.344, max=1608.455, sum=4444.032 (3)", - "tab": "General information", - "score": 1481.344 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.154, mean=8.834, max=11.932, sum=26.502 (3)", - "tab": "General information", - "score": 8.834 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.382, mean=0.43, max=0.498, sum=1.291 (3)", - "tab": "Bias", - "score": 0.4304995528213292 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.024, mean=0.094, max=0.18, sum=0.281 (3)", - "tab": "Bias", - "score": 0.09357753357753357 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) 
- Stereotypes (gender)": { - "description": "min=0.332, mean=0.388, max=0.488, sum=1.163 (3)", - "tab": "Bias", - "score": 0.38769841269841265 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.373, mean=0.409, max=0.446, sum=1.226 (3)", - "tab": "Bias", - "score": 0.40861462430089884 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.026, mean=0.051, max=0.066, sum=0.153 (3)", - "tab": "Bias", - "score": 0.051062717190300304 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361, - "details": { - "description": "min=0.355, mean=0.361, max=0.365, sum=1.082 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.066, mean=0.067, max=0.07, sum=0.201 (3)", - "tab": "Calibration", - "score": 0.06703451532890617 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.214, mean=0.215, max=0.216, sum=0.646 (3)", - "tab": "Robustness", - "score": 0.2154779030326859 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.274, mean=0.281, max=0.287, sum=0.844 (3)", - "tab": "Fairness", - "score": 0.2814055112322921 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=2.057, mean=2.089, max=2.151, sum=6.267 (3)", - "tab": "Efficiency", - "score": 2.0889632337239585 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.797, mean=0.881, max=0.969, sum=2.644 (3)", - "tab": "General information", - "score": 0.8813333333333334 - }, - "QuAC - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)", - "tab": "General information", - "score": 0.02 - }, - "QuAC - # prompt tokens": { - "description": "min=1600.292, mean=1639.784, max=1661.675, sum=4919.353 (3)", - "tab": "General information", - "score": 1639.784333333333 - }, - "QuAC - # output tokens": { - "description": "min=31.783, mean=32.717, max=34.585, sum=98.152 (3)", - "tab": "General information", - "score": 32.717333333333336 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.556, mean=0.582, max=0.6, sum=1.745 (3)", - "tab": "Bias", - "score": 0.5815402704291595 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.43, mean=0.438, max=0.449, sum=1.315 (3)", - "tab": "Bias", - "score": 0.4381760996205441 - }, - "QuAC - Representation (race)": { - "description": "min=0.333, mean=0.344, max=0.355, sum=1.033 (3)", - "tab": "Bias", - "score": 0.3443830841027822 - }, - "QuAC - Representation (gender)": { - "description": "min=0.223, mean=0.23, max=0.237, sum=0.691 (3)", 
- "tab": "Bias", - "score": 0.23033600244512342 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.341 (1)", - "tab": "Calibration", - "score": 0.34142560211110756 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.759, mean=0.759, max=0.759, sum=0.759 (1)", - "tab": "Robustness", - "score": 0.759 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=0.66 (1)", - "tab": "Fairness", - "score": 0.66 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.359 (1)", - "tab": "Efficiency", - "score": 0.35889839843750027 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=0.55 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.235 (1)", - "tab": "Calibration", - "score": 0.23470136403728084 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.448 (1)", - "tab": "Robustness", - "score": 0.448 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.47 (1)", - "tab": "Fairness", - "score": 0.47 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.314 (1)", - "tab": "Efficiency", - "score": 0.3138882968749995 - }, - "OpenbookQA - # eval": 
{ - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.198, - "details": { - "description": "min=0.177, mean=0.198, max=0.225, sum=0.593 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.075, mean=0.099, max=0.119, sum=0.298 (3)", - "tab": "Calibration", - "score": 0.0994665665272844 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.122, mean=0.151, max=0.182, sum=0.454 (3)", - "tab": "Robustness", - "score": 0.15137614678899083 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.138, mean=0.156, max=0.182, sum=0.469 (3)", - "tab": "Fairness", - "score": 0.1564729867482161 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.49, mean=0.501, max=0.506, sum=1.502 (3)", - "tab": "Efficiency", - "score": 0.50081436353211 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459, - "details": { - "description": "min=0.429, 
mean=0.459, max=0.479, sum=1.378 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.191, mean=0.207, max=0.223, sum=0.622 (3)", - "tab": "Robustness", - "score": 0.20732857142857117 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.371, mean=0.397, max=0.414, sum=1.19 (3)", - "tab": "Robustness", - "score": 0.39663320695609633 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.211, mean=0.233, max=0.251, sum=0.698 (3)", - "tab": "Fairness", - "score": 0.23262777777777743 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.394, mean=0.431, max=0.457, sum=1.292 (3)", - "tab": "Fairness", - "score": 0.4307144032412258 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.492, mean=0.499, max=0.504, sum=1.496 (3)", - "tab": "Efficiency", - "score": 0.4985355449218751 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.487, mean=0.501, max=0.511, sum=1.504 (3)", - "tab": "Efficiency", - "score": 0.501260492369186 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=497.281, mean=536.614, max=583.281, sum=1609.843 (3)", - "tab": "General information", - "score": 536.6143333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - 
"description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144, - "details": { - "description": "min=0.14, mean=0.144, max=0.146, sum=0.861 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=4.313, mean=4.337, max=4.381, sum=26.024 (6)", - "tab": "Efficiency", - "score": 4.3373758759723735 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=88.871, mean=89.431, max=90.324, sum=536.588 (6)", - "tab": "General information", - "score": 89.43133047210301 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.616, mean=0.626, max=0.635, sum=3.753 (6)", - "tab": "Bias", - "score": 0.6255738197534654 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.377, mean=0.387, max=0.397, sum=2.32 (6)", - "tab": "Bias", - "score": 0.38662344919565644 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.244, mean=0.301, max=0.358, sum=1.808 (6)", - "tab": "Bias", - "score": 0.30129162776221596 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.104, mean=0.117, max=0.128, sum=0.7 (6)", - "tab": "Bias", - "score": 0.116591581511673 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.013 (6)", - "tab": "Toxicity", - "score": 0.002145922746781116 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.393, mean=0.469, max=0.516, sum=1.407 (3)", - "tab": "Summarization metrics", - "score": 0.46891720389173397 - }, - 
"CNN/DailyMail - QAFactEval": { - "description": "min=4.621, mean=4.683, max=4.752, sum=28.101 (6)", - "tab": "Summarization metrics", - "score": 4.683468662049275 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.257, mean=0.264, max=0.275, sum=0.792 (3)", - "tab": "Summarization metrics", - "score": 0.2639259716833397 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.897, mean=0.945, max=0.971, sum=5.671 (6)", - "tab": "Summarization metrics", - "score": 0.945166441130516 - }, - "CNN/DailyMail - Density": { - "description": "min=43.963, mean=49.713, max=55.846, sum=298.279 (6)", - "tab": "Summarization metrics", - "score": 49.713109703758754 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.816, mean=9.072, max=9.547, sum=54.43 (6)", - "tab": "Summarization metrics", - "score": 9.071669466217989 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=0.978, mean=0.993, max=1, sum=5.956 (6)", - "tab": "Summarization metrics", - "score": 0.9925925925925926 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=4.422, mean=4.539, max=4.667, sum=27.237 (6)", - "tab": "Summarization metrics", - "score": 4.5394335511982575 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=3.556, mean=3.69, max=3.81, sum=22.142 (6)", - "tab": "Summarization metrics", - "score": 3.6903205726735138 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.129, - "details": { - "description": "min=0.125, mean=0.129, max=0.134, sum=0.775 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.735, mean=1.741, max=1.747, sum=10.443 (6)", - "tab": "Efficiency", - "score": 1.7405486446267702 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.998, max=5, sum=29.988 (6)", - "tab": "General information", - "score": 4.998069498069498 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.452, max=1572.616, sum=9224.71 (6)", - "tab": "General information", - "score": 1537.4517374517375 - }, - "XSUM - # output tokens": { - "description": "min=24.515, mean=24.802, max=25.066, sum=148.815 (6)", - "tab": "General information", - "score": 24.802445302445303 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.456, mean=0.463, max=0.468, sum=2.78 (6)", - "tab": "Bias", - "score": 0.4633319142897687 - }, - "XSUM - Representation (race)": { - "description": "min=0.532, mean=0.622, max=0.667, sum=3.73 (6)", - "tab": "Bias", - "score": 0.6216216216216217 - }, - "XSUM - Representation 
(gender)": { - "description": "min=0.184, mean=0.205, max=0.224, sum=1.231 (6)", - "tab": "Bias", - "score": 0.2051781150126976 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.0006435006435006435 - }, - "XSUM - SummaC": { - "description": "min=-0.265, mean=-0.253, max=-0.236, sum=-0.758 (3)", - "tab": "Summarization metrics", - "score": -0.252571659198599 - }, - "XSUM - QAFactEval": { - "description": "min=2.761, mean=2.981, max=3.213, sum=17.888 (6)", - "tab": "Summarization metrics", - "score": 2.981288283366219 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.431, mean=0.434, max=0.438, sum=1.301 (3)", - "tab": "Summarization metrics", - "score": 0.4335328367301425 - }, - "XSUM - Coverage": { - "description": "min=0.794, mean=0.8, max=0.803, sum=4.797 (6)", - "tab": "Summarization metrics", - "score": 0.7995514803953769 - }, - "XSUM - Density": { - "description": "min=2.71, mean=2.945, max=3.142, sum=17.67 (6)", - "tab": "Summarization metrics", - "score": 2.945005615644467 - }, - "XSUM - Compression": { - "description": "min=18.323, mean=18.422, max=18.574, sum=110.533 (6)", - "tab": "Summarization metrics", - "score": 18.422086618359014 - }, - "XSUM - HumanEval-faithfulness": { - "description": "min=0.638, mean=0.661, max=0.697, sum=3.968 (6)", - "tab": "Summarization metrics", - "score": 0.6612578878025103 - }, - "XSUM - HumanEval-relevance": { - "description": "min=4.212, mean=4.239, max=4.275, sum=25.431 (6)", - "tab": "Summarization metrics", - "score": 4.238517902133463 - }, - "XSUM - HumanEval-coherence": { - "description": "min=4.773, mean=4.825, max=4.877, sum=28.952 (6)", - "tab": "Summarization metrics", - "score": 4.825335737235052 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.956, - "details": { - "description": "min=0.941, mean=0.956, max=0.965, sum=2.867 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.05, mean=0.069, max=0.081, sum=0.206 (3)", - "tab": "Calibration", - "score": 0.06875792133691605 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.907, mean=0.923, max=0.933, sum=2.768 (3)", - "tab": "Robustness", - "score": 0.9226666666666667 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.93, mean=0.949, max=0.96, sum=2.846 (3)", - "tab": "Fairness", - "score": 0.9486666666666667 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.709, mean=0.796, max=0.865, sum=2.389 (3)", - "tab": "Efficiency", - "score": 0.7963252441406254 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.903, mean=4.229, max=4.983, sum=12.688 (3)", - "tab": "General information", - "score": 4.229333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1283.038, mean=1562.808, max=1784.2, sum=4688.425 (3)", - "tab": 
"General information", - "score": 1562.8083333333334 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532, - "details": { - "description": "min=0.001, mean=0.532, max=1, sum=28.726 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.051, mean=0.327, max=0.708, sum=17.639 (54)", - "tab": "Calibration", - "score": 0.32664532725883244 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.32, max=0.817, sum=17.265 (54)", - "tab": "Robustness", - "score": 0.31971446667223646 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.001, mean=0.479, max=1, sum=25.855 (54)", - "tab": "Fairness", - "score": 0.4787922217178853 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.464, mean=0.546, max=0.711, sum=29.484 (54)", - "tab": "Efficiency", - "score": 0.5459943267746123 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation 
(gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.633, - "details": { - "description": "min=0.1, mean=0.633, max=0.95, sum=20.875 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.093, mean=0.274, max=0.825, sum=9.044 (33)", - "tab": "Calibration", - "score": 0.274053604040966 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.563, max=0.925, sum=18.575 (33)", - "tab": "Robustness", - "score": 0.5628787878787879 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.05, mean=0.598, max=0.95, sum=19.75 (33)", - "tab": "Fairness", - "score": 0.5984848484848486 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.458, mean=0.667, max=0.987, sum=22.019 (33)", - "tab": "Efficiency", - "score": 0.6672338778409089 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.557, max=5, sum=150.375 (33)", - "tab": "General information", - "score": 4.556818181818182 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=814.446, max=1777.025, sum=26876.725 (33)", - "tab": "General information", - "score": 814.446212121212 - }, - "RAFT - # output tokens": { - "description": "min=0.275, mean=3.051, max=5.95, sum=100.675 (33)", - "tab": "General information", - "score": 3.0507575757575767 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json b/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json deleted file mode 100644 index cc49de0c7..000000000 --- a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20221108-52.4B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": 
"helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere xlarge v20221108 52.4B", - "id": "cohere/Cohere-xlarge-v20221108-52.4B", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5846823928461301 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5964421748070247 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6082341462764155 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.601504827172334 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5642015392015391 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.7039473684210527 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382, - "details": { - "description": "min=0.21, mean=0.382, max=0.67, sum=5.731 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.104, mean=0.143, max=0.197, sum=2.146 (15)", - "tab": "Calibration", - "score": 0.14305203655556303 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.12, mean=0.299, max=0.6, sum=4.49 (15)", - "tab": "Robustness", - "score": 0.29933333333333334 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.13, mean=0.317, max=0.57, sum=4.748 (15)", - "tab": "Fairness", - "score": 0.31652631578947366 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=7218.903 (15)", - "tab": "General information", - 
"score": 481.2602105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.761, mean=0.762, max=0.763, sum=2.285 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.037, mean=0.051, max=0.062, sum=0.154 (3)", - "tab": "Calibration", - "score": 0.05127903463780418 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.712, mean=0.718, max=0.722, sum=2.153 (3)", - "tab": "Robustness", - "score": 0.7176666666666667 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.702, mean=0.708, max=0.72, sum=2.124 (3)", - "tab": "Fairness", - "score": 0.7079999999999999 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672, - "details": { - "description": "min=0.607, mean=0.672, max=0.708, sum=2.017 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - 
"description": "min=0.042, mean=0.059, max=0.072, sum=0.178 (3)", - "tab": "Calibration", - "score": 0.059183266964369506 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.313, mean=0.39, max=0.434, sum=1.171 (3)", - "tab": "Robustness", - "score": 0.3901906178600691 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.486, mean=0.553, max=0.589, sum=1.659 (3)", - "tab": "Fairness", - "score": 0.5530542667501213 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.958, mean=1.562, max=1.997, sum=4.687 (3)", - "tab": "General information", - "score": 1.5624413145539906 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.997, mean=1634.99, max=1693.155, sum=4904.969 (3)", - "tab": "General information", - "score": 1634.9896713615024 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.792, mean=6.729, max=8.434, sum=20.186 (3)", - "tab": "General information", - "score": 6.728638497652582 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.417, mean=0.472, max=0.5, sum=1.417 (3)", - "tab": "Bias", - "score": 0.47222222222222227 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.184, mean=0.192, max=0.197, sum=0.575 (3)", - "tab": "Bias", - "score": 0.19158509798903886 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.013, max=0.02, sum=0.039 (3)", - "tab": "Toxicity", - "score": 0.013145539906103287 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.628, - "details": { - "description": "min=0.619, mean=0.628, max=0.634, sum=1.885 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.044, mean=0.054, max=0.064, sum=0.163 (3)", - "tab": "Calibration", - "score": 0.05430103491623906 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.064, mean=0.073, max=0.08, sum=0.219 (3)", - "tab": "Calibration", - "score": 0.07296237131206641 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.276, mean=0.283, max=0.288, sum=0.85 (3)", - "tab": "Robustness", - "score": 0.28349840532468856 - }, - "NaturalQuestions (open-book) - F1 
(Robustness)": { - "description": "min=0.49, mean=0.533, max=0.555, sum=1.598 (3)", - "tab": "Robustness", - "score": 0.532530651706331 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.295, mean=0.299, max=0.303, sum=0.898 (3)", - "tab": "Fairness", - "score": 0.299210546403295 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.548, mean=0.566, max=0.58, sum=1.699 (3)", - "tab": "Fairness", - "score": 0.5664508489119625 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.528, mean=4.808, max=5.211, sum=14.424 (3)", - "tab": "General information", - "score": 4.808 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.538, mean=4.633, max=4.715, sum=13.899 (3)", - "tab": "General information", - "score": 4.633 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1261.72, mean=1481.344, max=1608.455, sum=4444.032 (3)", - "tab": "General information", - "score": 1481.344 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.836, mean=6.093, max=6.582, sum=18.278 (3)", - "tab": "General information", - "score": 6.092666666666666 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.333, mean=0.444, max=0.5, sum=1.333 (3)", - "tab": "Bias", - "score": 0.4444444444444444 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.407, mean=0.48, max=0.556, sum=1.441 (3)", - "tab": "Bias", - "score": 0.4804079441760602 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.147, mean=0.247, max=0.385, sum=0.741 (3)", - "tab": "Bias", - "score": 0.24693627450980396 - }, - 
"NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.186, mean=0.232, max=0.278, sum=0.697 (3)", - "tab": "Bias", - "score": 0.2324074074074074 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.467, mean=0.474, max=0.483, sum=1.423 (3)", - "tab": "Bias", - "score": 0.4744480248239647 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.092, mean=0.113, max=0.135, sum=0.339 (3)", - "tab": "Bias", - "score": 0.11298873219533077 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374, - "details": { - "description": "min=0.367, mean=0.374, max=0.378, sum=1.122 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.053, mean=0.063, max=0.072, sum=0.189 (3)", - "tab": "Calibration", - "score": 0.06295082132498765 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.221, mean=0.229, max=0.234, sum=0.686 (3)", - "tab": "Robustness", - "score": 0.22865454547247813 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.269, mean=0.275, max=0.278, sum=0.824 (3)", - "tab": "Fairness", - "score": 0.27469570002834404 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.797, mean=0.881, max=0.969, sum=2.644 (3)", - "tab": "General information", - "score": 0.8813333333333334 - }, - "QuAC - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)", - "tab": "General information", - "score": 0.02 - }, - "QuAC - # prompt tokens": { - "description": "min=1600.292, mean=1639.784, max=1661.675, sum=4919.353 (3)", - "tab": "General information", - "score": 1639.784333333333 - }, - "QuAC - # output tokens": { - "description": "min=24.612, mean=27.944, max=31.344, sum=83.832 (3)", - "tab": "General information", - "score": 27.944 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.543, mean=0.571, max=0.589, sum=1.713 (3)", - "tab": "Bias", - "score": 0.570980870980871 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.371, mean=0.395, max=0.426, sum=1.185 (3)", - "tab": "Bias", - "score": 0.3948930748680999 - }, - "QuAC - Representation (race)": { - "description": "min=0.253, mean=0.304, max=0.331, sum=0.912 
(3)", - "tab": "Bias", - "score": 0.3038684617631986 - }, - "QuAC - Representation (gender)": { - "description": "min=0.211, mean=0.233, max=0.263, sum=0.699 (3)", - "tab": "Bias", - "score": 0.2330910766304025 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.007 (3)", - "tab": "Toxicity", - "score": 0.0023333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=0.81 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Calibration", - "score": 0.3332417863062664 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.764, mean=0.764, max=0.764, sum=0.764 (1)", - "tab": "Robustness", - "score": 0.764 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.687, mean=0.687, max=0.687, sum=0.687 (1)", - "tab": "Fairness", - "score": 0.687 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.588, - "details": { - "description": "min=0.588, mean=0.588, max=0.588, sum=0.588 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.207, mean=0.207, max=0.207, sum=0.207 (1)", - "tab": "Calibration", - "score": 0.20665896753536225 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.482 (1)", - "tab": "Robustness", - "score": 0.482 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Fairness", - "score": 0.5 - }, - "OpenbookQA - Denoised inference time 
(s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.169, - "details": { - "description": "min=0.164, mean=0.169, max=0.179, sum=0.508 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.18, mean=0.211, max=0.233, sum=0.633 (3)", - "tab": "Calibration", - "score": 0.21105124875435366 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.106, mean=0.116, max=0.13, sum=0.349 (3)", - "tab": "Robustness", - "score": 0.1162079510703364 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.112, mean=0.12, max=0.124, sum=0.359 (3)", - "tab": "Fairness", - "score": 0.1197757390417941 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.526, mean=0.55, max=0.573, sum=1.65 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.201, mean=0.242, max=0.292, sum=0.725 (3)", - "tab": "Robustness", - "score": 0.24177817460317433 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.449, mean=0.482, max=0.527, sum=1.446 (3)", - "tab": "Robustness", - "score": 0.48206153384583117 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.239, mean=0.267, max=0.302, sum=0.802 (3)", - "tab": "Fairness", - "score": 0.2673071428571425 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.51, mean=0.522, max=0.544, sum=1.565 (3)", - "tab": "Fairness", - "score": 0.5216640091882355 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=497.281, mean=536.614, max=583.281, sum=1609.843 (3)", - "tab": "General information", - "score": 536.6143333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1.002, max=1.005, sum=3.005 (3)", - "tab": "General information", - "score": 1.0016666666666667 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": 
"Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.153, - "details": { - "description": "min=0.153, mean=0.153, max=0.154, sum=0.92 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=89.47, mean=91.338, max=92.403, sum=548.03 (6)", - "tab": "General information", - "score": 91.33834048640915 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.579, mean=0.607, max=0.649, sum=3.642 (6)", - "tab": "Bias", - "score": 0.606957921303154 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.362, mean=0.383, max=0.409, sum=2.3 (6)", - "tab": "Bias", - "score": 0.3833873353199473 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.223, mean=0.266, max=0.328, sum=1.597 (6)", - "tab": "Bias", - "score": 0.26620678930063096 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.128, mean=0.133, max=0.14, sum=0.796 (6)", - "tab": "Bias", - "score": 0.1326032519141558 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.469, mean=0.514, max=0.552, sum=1.542 (3)", - "tab": "Summarization 
metrics", - "score": 0.5141110990456594 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.281, mean=0.286, max=0.295, sum=0.858 (3)", - "tab": "Summarization metrics", - "score": 0.2858638938260981 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.946, mean=0.971, max=0.984, sum=5.823 (6)", - "tab": "Summarization metrics", - "score": 0.9705641483765838 - }, - "CNN/DailyMail - Density": { - "description": "min=41.158, mean=44.772, max=50.734, sum=268.631 (6)", - "tab": "Summarization metrics", - "score": 44.771778103334206 - }, - "CNN/DailyMail - Compression": { - "description": "min=7.733, mean=8.026, max=8.278, sum=48.156 (6)", - "tab": "Summarization metrics", - "score": 8.02592370223569 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.153, - "details": { - "description": "min=0.148, mean=0.153, max=0.158, sum=0.919 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.998, max=5, sum=29.988 (6)", - "tab": "General information", - "score": 4.998069498069498 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.452, max=1572.616, sum=9224.71 (6)", - "tab": "General information", - "score": 1537.4517374517375 - }, - "XSUM - # output tokens": { - "description": "min=25.925, mean=26.153, max=26.423, sum=156.919 (6)", - "tab": "General information", - "score": 26.153153153153156 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.447, mean=0.454, max=0.463, sum=2.724 (6)", - "tab": "Bias", - "score": 0.45401696819707577 - }, - "XSUM - Representation (race)": { - "description": "min=0.515, mean=0.537, max=0.565, sum=3.223 (6)", - "tab": "Bias", - "score": 0.5371029656743943 - }, - "XSUM - Representation (gender)": { - "description": "min=0.204, 
mean=0.218, max=0.236, sum=1.306 (6)", - "tab": "Bias", - "score": 0.2176913745770286 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.0006435006435006435 - }, - "XSUM - SummaC": { - "description": "min=-0.28, mean=-0.258, max=-0.245, sum=-0.774 (3)", - "tab": "Summarization metrics", - "score": -0.25799066096812756 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.447, mean=0.451, max=0.454, sum=1.354 (3)", - "tab": "Summarization metrics", - "score": 0.45133514557325344 - }, - "XSUM - Coverage": { - "description": "min=0.79, mean=0.798, max=0.803, sum=4.787 (6)", - "tab": "Summarization metrics", - "score": 0.7978456468638059 - }, - "XSUM - Density": { - "description": "min=2.823, mean=3.009, max=3.208, sum=18.053 (6)", - "tab": "Summarization metrics", - "score": 3.008801536227543 - }, - "XSUM - Compression": { - "description": "min=17.074, mean=17.188, max=17.359, sum=103.128 (6)", - "tab": "Summarization metrics", - "score": 17.187984260626735 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.956, - "details": { - "description": "min=0.941, mean=0.956, max=0.965, sum=2.868 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.05, mean=0.069, max=0.082, sum=0.207 (3)", - "tab": "Calibration", - "score": 0.06908904600115551 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.907, mean=0.923, max=0.933, sum=2.769 (3)", - "tab": "Robustness", - "score": 0.923 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.931, mean=0.949, max=0.96, sum=2.847 (3)", - "tab": "Fairness", - "score": 0.949 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.903, mean=4.229, max=4.983, sum=12.688 (3)", - "tab": "General information", - "score": 4.229333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1283.038, mean=1562.808, max=1784.2, sum=4688.425 (3)", - "tab": "General information", - "score": 1562.8083333333334 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General 
information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "description": "min=0.035, mean=0.524, max=0.968, sum=28.319 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.056, mean=0.313, max=0.651, sum=16.899 (54)", - "tab": "Calibration", - "score": 0.3129455444585645 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.012, mean=0.408, max=0.908, sum=22.047 (54)", - "tab": "Robustness", - "score": 0.408272754767954 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.03, mean=0.415, max=0.875, sum=22.43 (54)", - "tab": "Fairness", - "score": 0.41537457925495214 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": 
"Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624, - "details": { - "description": "min=0, mean=0.624, max=0.975, sum=20.6 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.078, mean=0.25, max=1, sum=8.255 (33)", - "tab": "Calibration", - "score": 0.2501605016965272 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.489, max=0.925, sum=16.125 (33)", - "tab": "Robustness", - "score": 0.48863636363636365 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.604, max=0.975, sum=19.925 (33)", - "tab": "Fairness", - "score": 0.6037878787878787 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.557, max=5, sum=150.375 (33)", - "tab": "General information", - "score": 4.556818181818182 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=814.446, max=1777.025, sum=26876.725 (33)", - "tab": "General information", - "score": 814.446212121212 - }, - "RAFT - # output tokens": { - "description": "min=0, mean=2.99, max=7.05, sum=98.675 (33)", - "tab": "General information", - "score": 2.9901515151515157 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json b/data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json deleted file mode 100644 index bc304945b..000000000 --- a/data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/eleutherai_Pythia-12B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pythia 12B", - "id": "eleutherai/Pythia-12B", - "developer": "eleutherai", - "inference_platform": 
"unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.257, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.37428307123034227 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.27195804195804196 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.22631701631701634 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4331466568182155 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.38444055944055944 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.274, - "details": { - "description": "min=0.2, mean=0.274, max=0.3, sum=1.368 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.092, mean=0.111, max=0.166, sum=0.557 (5)", - "tab": "Calibration", - "score": 0.11132961223278444 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.17, mean=0.22, max=0.28, sum=1.102 (5)", - "tab": "Robustness", - "score": 0.22035087719298244 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.16, mean=0.212, max=0.29, sum=1.061 (5)", - "tab": "Fairness", - "score": 0.2121052631578947 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - "description": "min=0.662, mean=0.662, max=0.662, sum=0.662 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.14, mean=0.14, max=0.14, sum=0.14 (1)", - "tab": "Calibration", - "score": 0.13986557582802048 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.51, mean=0.51, max=0.51, sum=0.51 (1)", - "tab": "Robustness", - "score": 0.51 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.547, mean=0.547, max=0.547, sum=0.547 (1)", - "tab": "Fairness", - "score": 0.547 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=0.596 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.239, mean=0.239, max=0.239, sum=0.239 (1)", - "tab": "Calibration", - "score": 0.2394289121866973 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.42 (1)", - "tab": "Robustness", - "score": 0.42022169799567144 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.449, 
mean=0.449, max=0.449, sum=0.449 (1)", - "tab": "Fairness", - "score": 0.44869513696457247 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.215, mean=0.215, max=0.215, sum=0.215 (1)", - "tab": "Bias", - "score": 0.2152777777777778 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.023, mean=0.023, max=0.023, sum=0.023 (1)", - "tab": "Toxicity", - "score": 0.022535211267605635 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581, - "details": { - "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.094, mean=0.094, max=0.094, sum=0.094 (1)", - "tab": "Calibration", - "score": 0.09399996958029097 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Calibration", - "score": 0.3899944090149843 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.108, mean=0.108, max=0.108, sum=0.108 (1)", - "tab": "Robustness", - "score": 0.10849928114746796 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.47 (1)", - "tab": "Robustness", - "score": 0.46990137932247006 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.131, mean=0.131, max=0.131, sum=0.131 (1)", - "tab": "Fairness", - "score": 0.13109020655004933 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.523, 
mean=0.523, max=0.523, sum=0.523 (1)", - "tab": "Fairness", - "score": 0.5229768252994325 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.407 (1)", - "tab": "Bias", - "score": 0.40682414698162733 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.122, mean=0.122, max=0.122, sum=0.122 (1)", - "tab": "Bias", - "score": 0.1216216216216216 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.405 (1)", - "tab": "Bias", - "score": 0.4047619047619048 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Bias", - "score": 0.4666666666666667 - 
}, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)", - "tab": "Bias", - "score": 0.27551020408163257 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.313, - "details": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.313 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.138, mean=0.138, max=0.138, sum=0.138 (1)", - "tab": "Calibration", - "score": 0.1383150544527575 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.171 (1)", - "tab": "Robustness", - "score": 0.17120890749036072 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.227 (1)", - "tab": "Fairness", - "score": 0.22738715021444486 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.641, mean=0.641, max=0.641, sum=0.641 (1)", - "tab": "Bias", - "score": 0.6406926406926409 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.415 (1)", - "tab": "Bias", - "score": 0.4150793650793651 - }, - "QuAC - Representation (race)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.314 (1)", - "tab": "Bias", - "score": 0.3137254901960784 - }, - "QuAC - Representation (gender)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.26 (1)", - "tab": "Bias", - "score": 0.25965665236051505 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.177, - "details": { - "description": "min=0.177, mean=0.177, max=0.177, sum=0.177 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.094, mean=0.094, max=0.094, sum=0.094 (1)", - "tab": "Calibration", - "score": 0.09363268995646454 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.138, mean=0.138, max=0.138, sum=0.138 (1)", - "tab": "Robustness", - "score": 0.13761467889908258 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.154, mean=0.154, max=0.154, sum=0.154 (1)", - "tab": "Fairness", - "score": 0.154434250764526 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - 
"tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - 
"CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # 
trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.931, - "details": { - "description": "min=0.931, mean=0.931, max=0.931, sum=0.931 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.342 (1)", - "tab": "Calibration", - "score": 0.34150363639115 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)", - "tab": "Robustness", - "score": 0.854 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.916, mean=0.916, max=0.916, sum=0.916 (1)", - "tab": "Fairness", - "score": 0.916 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 - }, - "IMDB - # output tokens": { - "description": 
"min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531, - "details": { - "description": "min=0.03, mean=0.531, max=0.988, sum=9.561 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.138, mean=0.297, max=0.479, sum=5.337 (18)", - "tab": "Calibration", - "score": 0.2965193799633309 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.02, mean=0.418, max=0.973, sum=7.526 (18)", - "tab": "Robustness", - "score": 0.41812542395705293 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.01, mean=0.448, max=0.985, sum=8.071 (18)", - "tab": "Fairness", - "score": 0.44837567354282437 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": 
"9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514, - "details": { - "description": "min=0.175, mean=0.514, max=0.975, sum=5.65 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.175, mean=0.514, max=0.975, sum=5.649 (11)", - "tab": "Calibration", - "score": 0.5135614568346981 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.45, max=0.975, sum=4.95 (11)", - "tab": "Robustness", - "score": 0.45 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.15, mean=0.489, max=0.975, sum=5.375 (11)", - "tab": "Fairness", - "score": 0.48863636363636365 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=330 (11)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json b/data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json deleted file mode 100644 index 511816a71..000000000 --- a/data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/eleutherai_Pythia-6.9B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pythia 6.9B", - "id": "eleutherai/Pythia-6.9B", - "developer": 
"eleutherai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.196, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.4304810360777058 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.1820979020979021 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.17121212121212123 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5099743679983342 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.512004662004662 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.236, - "details": { - "description": "min=0.16, mean=0.236, max=0.281, sum=1.181 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.064, mean=0.136, max=0.2, sum=0.682 (5)", - "tab": "Calibration", - "score": 0.1364262799156796 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.12, mean=0.201, max=0.263, sum=1.003 (5)", - "tab": "Robustness", - "score": 0.20063157894736844 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.14, mean=0.207, max=0.254, sum=1.034 (5)", - "tab": "Fairness", - "score": 0.20687719298245613 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - 
"score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.631, - "details": { - "description": "min=0.631, mean=0.631, max=0.631, sum=0.631 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.106, mean=0.106, max=0.106, sum=0.106 (1)", - "tab": "Calibration", - "score": 0.10596147166386737 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.527, mean=0.527, max=0.527, sum=0.527 (1)", - "tab": "Robustness", - "score": 0.527 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.552, mean=0.552, max=0.552, sum=0.552 (1)", - "tab": "Fairness", - "score": 0.552 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.528, - "details": { - "description": "min=0.528, mean=0.528, max=0.528, sum=0.528 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.217, mean=0.217, max=0.217, sum=0.217 (1)", - "tab": "Calibration", - "score": 0.21689349381563713 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.313 (1)", - "tab": "Robustness", - "score": 0.31250255336597976 - }, - "NarrativeQA - F1 
(Fairness)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.389 (1)", - "tab": "Fairness", - "score": 0.38935766339772926 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.444 (1)", - "tab": "Bias", - "score": 0.4444444444444444 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.204 (1)", - "tab": "Bias", - "score": 0.20434782608695648 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.014, max=0.014, sum=0.014 (1)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539, - "details": { - "description": "min=0.539, mean=0.539, max=0.539, sum=0.539 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.07, mean=0.07, max=0.07, sum=0.07 (1)", - "tab": "Calibration", - "score": 0.06999999827276561 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.369 (1)", - "tab": "Calibration", - "score": 0.3689977017786239 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.094, mean=0.094, max=0.094, sum=0.094 (1)", - "tab": "Robustness", - "score": 0.09385332819874069 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.391 (1)", - "tab": "Robustness", - "score": 0.39128308105054077 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.103, mean=0.103, max=0.103, sum=0.103 (1)", - "tab": "Fairness", - "score": 0.10301926896303132 - }, - 
"NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.464 (1)", - "tab": "Fairness", - "score": 0.4640855445555752 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=299.883, mean=299.883, max=299.883, sum=299.883 (1)", - "tab": "General information", - "score": 299.883 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.312 (1)", - "tab": "Bias", - "score": 0.31182795698924726 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.188, mean=0.188, max=0.188, sum=0.188 (1)", - "tab": "Bias", - "score": 0.1875 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.387 (1)", - "tab": "Bias", - "score": 0.38690476190476186 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.422, 
mean=0.422, max=0.422, sum=0.422 (1)", - "tab": "Bias", - "score": 0.42222222222222217 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.159, mean=0.159, max=0.159, sum=0.159 (1)", - "tab": "Bias", - "score": 0.1590909090909091 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.296, - "details": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.296 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.1, mean=0.1, max=0.1, sum=0.1 (1)", - "tab": "Calibration", - "score": 0.09977223409937552 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.171 (1)", - "tab": "Robustness", - "score": 0.17097990289529255 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.198, mean=0.198, max=0.198, sum=0.198 (1)", - "tab": "Fairness", - "score": 0.19836760191150613 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.635, mean=0.635, max=0.635, sum=0.635 (1)", - "tab": "Bias", - "score": 0.6349206349206349 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.416 (1)", - "tab": "Bias", - "score": 0.41639199007620065 - }, - "QuAC - Representation (race)": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.369 (1)", - "tab": "Bias", - "score": 0.3687074829931972 - }, - "QuAC - Representation (gender)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.25 (1)", - "tab": "Bias", - "score": 0.25 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "Toxicity", - "score": 0.003 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - 
"dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.213, - "details": { - "description": "min=0.213, mean=0.213, max=0.213, sum=0.213 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.076, mean=0.076, max=0.076, sum=0.076 (1)", - "tab": "Calibration", - "score": 0.07613907039385276 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.139, mean=0.139, max=0.139, sum=0.139 (1)", - "tab": "Robustness", - "score": 0.13914373088685014 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.18, mean=0.18, max=0.18, sum=0.18 (1)", - "tab": "Fairness", - "score": 0.18042813455657492 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO 
(regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - 
"tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": 
"General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=0.928 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.302 (1)", - "tab": "Calibration", - "score": 0.3016994708797646 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.871, mean=0.871, max=0.871, sum=0.871 (1)", - "tab": "Robustness", - "score": 0.871 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.911, mean=0.911, max=0.911, sum=0.911 (1)", - "tab": "Fairness", - "score": 0.911 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 
- }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511, - "details": { - "description": "min=0.02, mean=0.511, max=0.988, sum=9.207 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.074, mean=0.259, max=0.508, sum=4.655 (18)", - "tab": "Calibration", - "score": 0.25858613851508827 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.013, mean=0.363, max=0.915, sum=6.531 (18)", - "tab": "Robustness", - "score": 0.3628308048007681 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.001, mean=0.333, max=0.927, sum=5.995 (18)", - "tab": "Fairness", - "score": 0.33307716875468274 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - 
"CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "description": "min=0.175, mean=0.502, max=0.975, sum=5.525 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.175, mean=0.502, max=0.975, sum=5.519 (11)", - "tab": "Calibration", - "score": 0.5016937882323235 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.075, mean=0.377, max=0.975, sum=4.15 (11)", - "tab": "Robustness", - "score": 0.3772727272727272 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.175, mean=0.45, max=0.975, sum=4.95 (11)", - "tab": "Fairness", - "score": 0.45 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=330 (11)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json b/data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json deleted file mode 100644 index 8d33e45b6..000000000 --- a/data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/google_Palmyra-X-43B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Palmyra X 43B", - "id": "google/Palmyra-X-43B", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.732, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.8206682206682206 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7968401968401968 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5458006056443556 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.462995337995338 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609, - "details": { - "description": "min=0.35, mean=0.609, max=0.88, sum=9.136 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.29, mean=0.566, max=0.86, sum=8.494 (15)", - "tab": "Robustness", - "score": 0.5662339181286549 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.34, mean=0.588, max=0.86, sum=8.822 (15)", - "tab": "Fairness", - "score": 0.5881637426900584 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, 
max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.896, - "details": { - "description": "min=0.894, mean=0.896, max=0.898, sum=2.689 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.875, mean=0.878, max=0.88, sum=2.634 (3)", - "tab": "Robustness", - "score": 0.878 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.872, mean=0.875, max=0.878, sum=2.625 (3)", - "tab": "Fairness", - "score": 0.875 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1.005, mean=1.007, max=1.01, sum=3.021 (3)", - "tab": "General information", - "score": 1.007 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.732, mean=0.742, max=0.748, sum=2.226 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.667, mean=0.672, max=0.68, sum=2.016 (3)", - "tab": "Robustness", - "score": 
0.6719021727640991 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.64, mean=0.651, max=0.659, sum=1.952 (3)", - "tab": "Fairness", - "score": 0.6506183133514157 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3504.577, mean=3803.911, max=3972.577, sum=11411.732 (3)", - "tab": "General information", - "score": 3803.910798122066 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.375, mean=6.272, max=7.29, sum=18.817 (3)", - "tab": "General information", - "score": 6.272300469483568 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.396, mean=0.398, max=0.403, sum=1.194 (3)", - "tab": "Bias", - "score": 0.39814814814814814 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.149, mean=0.159, max=0.181, sum=0.478 (3)", - "tab": "Bias", - "score": 0.15935305534542177 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.011, max=0.014, sum=0.034 (3)", - "tab": "Toxicity", - "score": 0.011267605633802818 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.355, mean=0.363, max=0.368, sum=1.089 (3)", - "tab": "Robustness", - "score": 0.3629707081568259 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.356, mean=0.362, max=0.367, sum=1.087 (3)", - "tab": "Fairness", - "score": 0.3624320629787478 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": 
"No matching runs", - "tab": "Fairness", - "score": null - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.166, mean=3.19, max=3.231, sum=9.571 (3)", - "tab": "General information", - "score": 3.1903333333333332 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NaturalQuestions (open-book) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NaturalQuestions (open-book) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NaturalQuestions (open-book) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.245, mean=0.314, max=0.378, sum=0.941 (3)", - "tab": "Bias", - "score": 0.31352905160694455 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.25, mean=0.266, max=0.278, sum=0.797 (3)", - "tab": "Bias", - "score": 0.26566951566951563 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, 
max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473, - "details": { - "description": "min=0.459, mean=0.473, max=0.488, sum=1.419 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.379, mean=0.383, max=0.392, sum=1.15 (3)", - "tab": "Robustness", - "score": 0.38348793103386436 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.394, mean=0.399, max=0.408, sum=1.196 (3)", - "tab": "Fairness", - "score": 0.39873411995988545 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=4676.788, mean=5199.788, max=5842.788, sum=15599.364 (3)", - "tab": "General information", - "score": 5199.788 - }, - "QuAC - # output tokens": { - "description": "min=25.906, mean=26.581, max=27.052, sum=79.742 (3)", - "tab": "General information", - "score": 26.580666666666662 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.63, mean=0.642, max=0.667, sum=1.926 (3)", - "tab": "Bias", - "score": 0.6419753086419754 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.364, mean=0.395, max=0.447, sum=1.186 (3)", - "tab": "Bias", - "score": 0.39526937310090554 - }, - "QuAC - Representation (race)": { - "description": "min=0.286, mean=0.293, max=0.298, sum=0.878 (3)", - "tab": "Bias", - "score": 0.29267512260888473 - }, - "QuAC - Representation (gender)": { - "description": "min=0.221, mean=0.235, max=0.248, sum=0.705 (3)", - "tab": "Bias", - "score": 0.23492413534960777 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": 
{ - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.616, - "details": { - "description": "min=0.601, mean=0.616, max=0.63, sum=1.847 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.554, mean=0.568, max=0.584, sum=1.705 (3)", - "tab": "Robustness", - "score": 0.5682976554536188 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.529, mean=0.542, max=0.56, sum=1.625 (3)", - "tab": "Fairness", - "score": 0.5417940876656473 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=0.908, mean=0.949, max=0.982, sum=2.847 (3)", - "tab": "General information", - "score": 0.9490316004077473 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # 
output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.049, - "details": { - "description": "min=0, mean=0.049, max=0.147, sum=0.147 (3)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=1398 (3)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=4649.758 (3)", - 
"tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=0, mean=17.63, max=52.891, sum=52.891 (3)", - "tab": "General information", - "score": 17.630185979971387 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.622, mean=0.622, max=0.622, sum=0.622 (1)", - "tab": "Bias", - "score": 0.6219394640447272 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.421 (1)", - "tab": "Bias", - "score": 0.42094867293009713 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)", - "tab": "Bias", - "score": 0.27642276422764234 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.114, mean=0.114, max=0.114, sum=0.114 (1)", - "tab": "Bias", - "score": 0.11422708618331054 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "min=0, mean=0.291, max=0.872, sum=0.872 (3)", - "tab": "Summarization metrics", - "score": 0.29078580039209107 - }, - "CNN/DailyMail - Density": { - "description": "min=0, mean=2.35, max=7.049, sum=7.049 (3)", - "tab": "Summarization metrics", - "score": 2.34978873721003 - }, - "CNN/DailyMail - Compression": { - "description": "min=0, mean=3.117, max=9.351, sum=9.351 (3)", - "tab": "Summarization metrics", - "score": 3.116859693035 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.149, - "details": { - "description": "min=0.144, mean=0.149, max=0.157, sum=0.447 (3)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=1554 (3)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=15 
(3)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.735, max=1539.402, sum=4532.205 (3)", - "tab": "General information", - "score": 1510.734877734878 - }, - "XSUM - # output tokens": { - "description": "min=25.077, mean=25.248, max=25.463, sum=75.745 (3)", - "tab": "General information", - "score": 25.248391248391247 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2.0 (3)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.434, mean=0.438, max=0.444, sum=1.313 (3)", - "tab": "Bias", - "score": 0.43769157088122607 - }, - "XSUM - Representation (race)": { - "description": "min=0.383, mean=0.439, max=0.494, sum=1.318 (3)", - "tab": "Bias", - "score": 0.4393992219104699 - }, - "XSUM - Representation (gender)": { - "description": "min=0.202, mean=0.205, max=0.208, sum=0.616 (3)", - "tab": "Bias", - "score": 0.2054618848004968 - }, - "XSUM - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "min=0.77, mean=0.775, max=0.778, sum=2.324 (3)", - "tab": "Summarization metrics", - "score": 0.7746217499327193 - }, - "XSUM - Density": { - "description": "min=2.38, mean=2.466, max=2.546, sum=7.399 (3)", - "tab": "Summarization metrics", - "score": 2.4662768763204443 - }, - "XSUM - Compression": { - "description": "min=14.242, mean=14.252, max=14.266, sum=42.756 (3)", - "tab": "Summarization metrics", - "score": 14.25194669426599 - }, - "XSUM - HumanEval-faithfulness": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.935, - "details": { - "description": "min=0.928, mean=0.935, max=0.939, sum=2.806 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM 
(Robustness)": { - "description": "min=0.896, mean=0.904, max=0.909, sum=2.713 (3)", - "tab": "Robustness", - "score": 0.9043333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.909, mean=0.918, max=0.923, sum=2.754 (3)", - "tab": "Fairness", - "score": 0.918 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1282.797, mean=1897.464, max=2572.797, sum=5692.391 (3)", - "tab": "General information", - "score": 1897.4636666666665 - }, - "IMDB - # output tokens": { - "description": "min=1.928, mean=1.939, max=1.95, sum=5.816 (3)", - "tab": "General information", - "score": 1.9386666666666665 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.008, - "details": { - "description": "min=0, mean=0.008, max=0.344, sum=0.406 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.006, max=0.319, sum=0.347 (54)", - "tab": "Robustness", - "score": 0.006429753618269135 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.006, max=0.288, sum=0.338 (54)", - "tab": "Fairness", - "score": 0.006254555939232581 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=0, mean=0.011, max=0.504, sum=0.604 (54)", - "tab": "General information", - "score": 0.011187107057192404 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.701, - "details": { - "description": "min=0, mean=0.701, max=0.975, sum=23.125 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.677, max=0.975, sum=22.35 (33)", - "tab": "Robustness", - "score": 0.6772727272727272 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.672, max=0.975, sum=22.175 (33)", - "tab": "Fairness", - "score": 0.6719696969696969 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=5, mean=5, max=5, sum=165 (33)", - "tab": "General information", - "score": 5.0 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=1279.572, max=6599.65, sum=42225.875 (33)", - "tab": "General information", - "score": 1279.5719696969697 - }, - "RAFT - # output tokens": { - "description": "min=0, mean=3.07, max=6.825, sum=101.3 (33)", - "tab": "General information", - "score": 3.06969696969697 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - 
"description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json b/data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json deleted file mode 100644 index 2a710defd..000000000 --- a/data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/google_T5-11B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T5 11B", - "id": "google/T5-11B", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.131, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.43469010175763184 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.16445221445221445 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.14974358974358976 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.4340277777777778 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4887674914954327 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5758109174775842 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.1118421052631579 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.211, mean=0.29, max=0.4, sum=4.354 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.1, mean=0.151, max=0.242, sum=2.271 (15)", - "tab": "Calibration", - "score": 0.1514046561108303 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.19, mean=0.258, max=0.38, sum=3.866 (15)", - "tab": "Robustness", - "score": 0.25776608187134503 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.167, 
mean=0.235, max=0.33, sum=3.525 (15)", - "tab": "Fairness", - "score": 0.23500584795321638 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.173, mean=0.218, max=0.232, sum=3.277 (15)", - "tab": "Efficiency", - "score": 0.21847905223539232 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=2.482, mean=4.326, max=5, sum=64.896 (15)", - "tab": "General information", - "score": 4.326397660818714 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=382.49, mean=420.562, max=467.75, sum=6308.426 (15)", - "tab": "General information", - "score": 420.5617309941521 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761, - "details": { - "description": "min=0.732, mean=0.761, max=0.803, sum=2.283 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.348, mean=0.433, max=0.512, sum=1.298 (3)", - "tab": "Calibration", - "score": 0.43269382093398495 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.624, mean=0.65, max=0.688, sum=1.951 (3)", - "tab": "Robustness", - "score": 0.6503333333333333 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.697, mean=0.723, max=0.766, sum=2.168 (3)", - "tab": "Fairness", - "score": 0.7226666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.27, mean=0.271, max=0.272, sum=0.814 (3)", - "tab": "Efficiency", - "score": 0.27128291567197677 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=0.969, mean=1.588, max=2.006, sum=4.765 (3)", - "tab": "General information", - "score": 1.5883333333333332 - }, - "BoolQ - truncated": { - "description": "min=0.004, mean=0.004, max=0.004, sum=0.012 (3)", - "tab": "General information", - "score": 0.004 - }, - "BoolQ - # prompt tokens": { - "description": "min=386.367, mean=401.944, max=422.649, sum=1205.833 (3)", - "tab": "General information", - "score": 401.94433333333336 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - 
"tab": "Bias", - "score": 0.6666666666666666 - }, - "BoolQ - Representation (gender)": { - "description": "min=0.125, mean=0.375, max=0.5, sum=1.125 (3)", - "tab": "Bias", - "score": 0.375 - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.086, - "details": { - "description": "min=0.086, mean=0.086, max=0.086, sum=0.257 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.0, mean=0.0, max=0.0, sum=0.0 (3)", - "tab": "Calibration", - "score": 8.06672937578031e-11 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.045, mean=0.045, max=0.045, sum=0.136 (3)", - "tab": "Robustness", - "score": 0.04518225074755041 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.05, mean=0.05, max=0.05, sum=0.149 (3)", - "tab": "Fairness", - "score": 0.0497772820026842 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=1.054, mean=1.054, max=1.054, sum=3.163 (3)", - "tab": "Efficiency", - "score": 1.0544504576125933 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - truncated": { - "description": "min=0.825, mean=0.825, max=0.825, sum=2.476 (3)", - "tab": "General information", - "score": 0.8253521126760562 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=492.141, mean=492.141, max=492.141, sum=1476.423 (3)", - "tab": "General information", - "score": 492.14084507042253 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=300 (3)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=1.225 (3)", - "tab": "Bias", - "score": 0.4081829027907459 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=1.1 (3)", - "tab": "Bias", - "score": 0.36666666666666664 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.156, mean=0.156, max=0.156, sum=0.469 (3)", - "tab": "Bias", - "score": 0.15620542082738947 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.011, max=0.011, sum=0.034 (3)", - "tab": "Toxicity", - "score": 0.011267605633802818 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.477, - "details": { - "description": "min=0.278, mean=0.477, max=0.588, sum=1.432 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.07, mean=0.076, max=0.082, sum=0.228 (3)", - "tab": "Calibration", - "score": 0.07599999619350188 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.051, mean=0.239, max=0.356, sum=0.717 (3)", - "tab": "Calibration", - "score": 0.23900003883193166 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.146, mean=0.153, max=0.159, sum=0.458 (3)", - "tab": "Robustness", - "score": 0.15251804391476487 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.047, mean=0.071, max=0.107, sum=0.213 (3)", - "tab": "Robustness", - "score": 0.0710016541484974 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.152, mean=0.159, max=0.164, sum=0.476 (3)", - "tab": "Fairness", - "score": 0.15857963279707157 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.227, mean=0.424, max=0.532, sum=1.271 (3)", - "tab": "Fairness", - "score": 0.42376820534695847 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=2.617, mean=2.856, max=3.211, sum=8.569 (3)", - "tab": "Efficiency", - "score": 2.856322434252687 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=6.926, mean=12.846, max=24.675, sum=38.539 (3)", - "tab": "Efficiency", - "score": 12.84636455836454 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.556, mean=113.556, max=118.556, sum=340.668 (3)", - "tab": "General information", - "score": 113.556 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=900 (3)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=0.096, mean=0.924, max=1.792, sum=2.771 (3)", - "tab": "General information", - "score": 0.9236666666666666 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.094, mean=0.349, max=0.839, sum=1.048 (3)", - "tab": "General information", - "score": 0.34933333333333333 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=233.452, mean=301.907, max=339.767, sum=905.721 
(3)", - "tab": "General information", - "score": 301.907 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=900 (3)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.529, mean=0.533, max=0.535, sum=1.6 (3)", - "tab": "Bias", - "score": 0.5332530194915516 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.071, mean=0.103, max=0.125, sum=0.308 (3)", - "tab": "Bias", - "score": 0.10251322751322754 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.389, mean=0.417, max=0.472, sum=1.25 (3)", - "tab": "Bias", - "score": 0.4166666666666666 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.483, mean=0.516, max=0.552, sum=1.549 (3)", - "tab": "Bias", - "score": 0.5163891020108681 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.218, mean=0.243, max=0.26, sum=0.728 (3)", - "tab": "Bias", - "score": 0.24276995305164317 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.116, - "details": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.348 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.0, mean=0.0, max=0.0, sum=0.0 (3)", - "tab": "Calibration", - "score": 1.908717030577995e-9 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.064, mean=0.064, max=0.064, sum=0.191 (3)", - "tab": "Robustness", - "score": 0.06378325242260692 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.074, mean=0.074, max=0.074, sum=0.221 (3)", - "tab": "Fairness", - "score": 0.07376443691909672 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.032, mean=1.032, max=1.032, sum=3.097 (3)", - "tab": "Efficiency", - "score": 1.0323945961168868 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - 
"score": 0.0 - }, - "QuAC - truncated": { - "description": "min=0.999, mean=0.999, max=0.999, sum=2.997 (3)", - "tab": "General information", - "score": 0.999 - }, - "QuAC - # prompt tokens": { - "description": "min=510.923, mean=510.923, max=510.923, sum=1532.769 (3)", - "tab": "General information", - "score": 510.923 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=300 (3)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.949 (3)", - "tab": "Bias", - "score": 0.6495726495726497 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=1.32 (3)", - "tab": "Bias", - "score": 0.4400900674211062 - }, - "QuAC - Representation (race)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=1.192 (3)", - "tab": "Bias", - "score": 0.39717891610987377 - }, - "QuAC - Representation (gender)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.771 (3)", - "tab": "Bias", - "score": 0.25702629193109705 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.133, - "details": { - "description": "min=0.104, mean=0.133, max=0.15, sum=0.532 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.109, mean=0.143, max=0.195, sum=0.574 (4)", - "tab": "Calibration", - "score": 0.1434693835940009 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.09, mean=0.122, max=0.148, sum=0.489 (4)", - "tab": "Robustness", - "score": 0.12232415902140673 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.058, mean=0.101, max=0.136, sum=0.405 (4)", - "tab": "Fairness", - "score": 0.10129969418960244 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.174, mean=0.21, max=0.249, sum=0.838 (4)", - "tab": "Efficiency", - "score": 0.2095953345265857 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.547, max=4.869, sum=14.19 (4)", - "tab": "General information", - "score": 3.547400611620795 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=85.896, mean=371.92, max=471.52, sum=1487.679 (4)", - "tab": "General information", - "score": 371.9197247706422 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } 
- }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - 
Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.043, - "details": { - "description": "min=0.043, mean=0.043, max=0.043, sum=0.257 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.653, mean=1.654, max=1.655, sum=9.926 (6)", - "tab": "Efficiency", - "score": 1.6543884711070522 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=0.062, mean=0.064, max=0.067, sum=0.382 (6)", - "tab": "General information", - "score": 0.06366237482117311 - }, - "CNN/DailyMail - truncated": { - "description": "min=0.929, mean=0.932, max=0.933, sum=5.592 (6)", - "tab": "General information", - "score": 0.9320457796852647 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=500.412, mean=500.553, max=500.835, sum=3003.318 (6)", - "tab": "General information", - "score": 500.5529327610873 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=128, mean=128, max=128, sum=768 (6)", - "tab": "General information", - "score": 128.0 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.632, mean=0.632, max=0.632, sum=3.789 (6)", - "tab": "Bias", - "score": 0.631578947368421 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=2.709 (6)", - "tab": "Bias", - "score": 0.4515726043503821 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.264, mean=0.264, max=0.264, sum=1.581 (6)", - "tab": "Bias", - "score": 0.26356589147286824 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.119, mean=0.119, max=0.12, sum=0.713 (6)", - "tab": "Bias", - "score": 0.11890102842483792 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.125, mean=-0.122, max=-0.117, sum=-0.365 (3)", - "tab": "Summarization metrics", - "score": -0.12151602946968616 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=-0.173, mean=-0.17, 
max=-0.165, sum=-0.509 (3)", - "tab": "Summarization metrics", - "score": -0.16977369097758946 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.55, mean=0.555, max=0.56, sum=3.329 (6)", - "tab": "Summarization metrics", - "score": 0.5547542182286073 - }, - "CNN/DailyMail - Density": { - "description": "min=2.69, mean=2.698, max=2.706, sum=16.19 (6)", - "tab": "Summarization metrics", - "score": 2.698337926712314 - }, - "CNN/DailyMail - Compression": { - "description": "min=19.085, mean=19.248, max=19.44, sum=115.49 (6)", - "tab": "Summarization metrics", - "score": 19.248383205041776 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.015, - "details": { - "description": "min=0.008, mean=0.015, max=0.018, sum=0.087 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.096, mean=1.159, max=1.283, sum=6.953 (6)", - "tab": "Efficiency", - "score": 1.15883249730996 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=0.239, mean=0.3, max=0.373, sum=1.799 (6)", - "tab": "General information", - "score": 0.29987129987129985 - }, - "XSUM - truncated": { - "description": "min=0.602, mean=0.671, max=0.73, sum=4.023 (6)", - "tab": "General information", - "score": 0.6705276705276706 - }, - "XSUM - # prompt tokens": { - "description": "min=432.851, mean=436.826, max=442.064, sum=2620.958 (6)", - "tab": "General information", - "score": 436.8262548262548 - }, - "XSUM - # output tokens": { - "description": "min=64, mean=64, max=64, sum=384 (6)", - "tab": "General information", - "score": 64.0 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2.667 (4)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=3 (6)", - "tab": "Bias", - "score": 0.5 - }, - "XSUM - Representation (race)": { - "description": "min=0.333, mean=0.358, max=0.394, sum=2.15 (6)", - "tab": "Bias", - "score": 0.3582634859230604 - }, - "XSUM - Representation (gender)": { - "description": "min=0.214, mean=0.222, max=0.231, sum=1.332 (6)", - "tab": "Bias", - "score": 0.2219358310118288 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.267, mean=-0.258, max=-0.244, 
sum=-0.775 (3)", - "tab": "Summarization metrics", - "score": -0.2584302846171323 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=-0.379, mean=-0.315, max=-0.276, sum=-0.944 (3)", - "tab": "Summarization metrics", - "score": -0.3147063674770794 - }, - "XSUM - Coverage": { - "description": "min=0.324, mean=0.355, max=0.372, sum=2.133 (6)", - "tab": "Summarization metrics", - "score": 0.3554524422801694 - }, - "XSUM - Density": { - "description": "min=0.763, mean=0.831, max=0.866, sum=4.987 (6)", - "tab": "Summarization metrics", - "score": 0.831154946558878 - }, - "XSUM - Compression": { - "description": "min=16.29, mean=16.544, max=16.714, sum=99.261 (6)", - "tab": "Summarization metrics", - "score": 16.543527805806836 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379, - "details": { - "description": "min=0.248, mean=0.379, max=0.568, sum=1.137 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.108, mean=0.236, max=0.374, sum=0.707 (3)", - "tab": "Calibration", - "score": 0.23573461605966659 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.17, mean=0.304, max=0.51, sum=0.911 (3)", - "tab": "Robustness", - "score": 0.3036666666666667 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.162, mean=0.303, max=0.502, sum=0.91 (3)", - "tab": "Fairness", - "score": 0.30333333333333334 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.276, mean=0.278, max=0.28, sum=0.834 (3)", - "tab": "Efficiency", - "score": 0.27797461745258367 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=0.33, mean=0.466, max=0.701, sum=1.397 (3)", - "tab": "General information", - "score": 0.4656666666666666 - }, - "IMDB - truncated": { - "description": "min=0.172, mean=0.173, max=0.173, sum=0.518 (3)", - "tab": "General information", - "score": 0.17266666666666666 - }, - "IMDB - # prompt tokens": { - "description": "min=391.442, mean=408.425, max=434.668, sum=1225.274 (3)", - "tab": "General information", - "score": 408.4246666666666 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching 
metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0, mean=0.509, max=0.998, sum=27.462 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.108, mean=0.38, max=0.553, sum=20.519 (54)", - "tab": "Calibration", - "score": 0.3799801119037254 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.392, max=0.991, sum=21.175 (54)", - "tab": "Robustness", - "score": 0.39212772273586344 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.329, max=0.991, sum=17.759 (54)", - "tab": "Fairness", - "score": 0.32887358622117774 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.269, mean=0.27, max=0.273, sum=14.596 (54)", - "tab": "Efficiency", - "score": 0.27030228534077655 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=1.019, mean=2.636, max=4.881, sum=142.352 (54)", - "tab": "General information", - "score": 2.6361556323380086 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0.002, max=0.022, sum=0.094 (54)", - "tab": "General information", - "score": 0.0017482982997674094 - }, - "CivilComments - # prompt tokens": { - "description": "min=331.768, mean=416.791, max=477.628, sum=22506.741 (54)", - "tab": "General information", - "score": 416.79149386044713 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - 
"dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37, - "details": { - "description": "min=0, mean=0.37, max=0.925, sum=12.2 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.0, mean=0.367, max=0.925, sum=12.1 (33)", - "tab": "Calibration", - "score": 0.36667176546312147 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.331, max=0.875, sum=10.925 (33)", - "tab": "Robustness", - "score": 0.33106060606060606 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.351, max=0.85, sum=11.575 (33)", - "tab": "Fairness", - "score": 0.3507575757575757 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.411, mean=0.448, max=0.835, sum=14.799 (33)", - "tab": "Efficiency", - "score": 0.4484652494441787 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=2.433, max=5, sum=80.3 (33)", - "tab": "General information", - "score": 2.433333333333333 - }, - "RAFT - truncated": { - "description": "min=0, mean=0.394, max=1, sum=13 (33)", - "tab": "General information", - "score": 0.3939393939393939 - }, - "RAFT - # prompt tokens": { - "description": "min=263.4, mean=420.742, max=511, sum=13884.475 (33)", - "tab": "General information", - "score": 420.7416666666667 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=990 (33)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json b/data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json deleted file mode 100644 index bb571aece..000000000 --- a/data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/google_UL2-20B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "UL2 20B", - "id": "google/UL2-20B", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.167, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.464477335800185 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.2572027972027972 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.1858974358974359 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5056944444444444 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5601766236691538 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.2902378485711819 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.11842105263157894 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.291, - "details": { - "description": "min=0.2, mean=0.291, max=0.39, sum=4.368 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.084, mean=0.134, max=0.202, sum=2.004 (15)", - "tab": "Calibration", - "score": 0.13362255376880447 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.2, mean=0.272, max=0.37, sum=4.079 (15)", - "tab": "Robustness", - "score": 0.2719415204678362 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.19, mean=0.273, max=0.36, sum=4.102 (15)", - "tab": "Fairness", - "score": 0.2734502923976609 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.178, mean=0.182, max=0.184, sum=2.725 (15)", - "tab": "Efficiency", - "score": 0.18164482078684702 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=2.465, mean=4.316, max=5, sum=64.743 (15)", - "tab": "General information", - "score": 4.316222222222222 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=385.228, mean=423.395, max=467.79, sum=6350.919 (15)", - "tab": "General information", - "score": 423.39457309941525 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.717, mean=0.746, max=0.762, sum=2.237 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.416, mean=0.46, max=0.512, sum=1.379 (3)", - "tab": "Calibration", - "score": 0.45980755585445926 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.638, mean=0.646, max=0.651, sum=1.938 (3)", - "tab": "Robustness", - "score": 0.646 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.672, mean=0.698, max=0.714, sum=2.095 (3)", - "tab": "Fairness", - "score": 0.6983333333333334 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.292, mean=0.313, max=0.341, sum=0.938 (3)", - "tab": "Efficiency", - "score": 0.3127442524572212 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=0.953, mean=1.57, max=1.978, sum=4.709 (3)", - "tab": "General information", - "score": 1.5696666666666668 - }, - "BoolQ - truncated": { - "description": "min=0.004, mean=0.004, max=0.004, sum=0.012 (3)", - "tab": "General information", - "score": 0.004 - }, - "BoolQ - # prompt tokens": { - "description": "min=386.826, mean=402.285, max=424.449, sum=1206.854 (3)", - "tab": "General information", - "score": 402.2846666666667 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "min=0.167, mean=0.23, max=0.357, sum=0.69 (3)", - "tab": "Bias", - "score": 0.23015873015873015 - }, - "BoolQ - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.083, - "details": { - "description": "min=0.083, mean=0.083, max=0.083, sum=0.248 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.0, mean=0.0, max=0.0, sum=0.0 (3)", - "tab": "Calibration", - "score": 4.840114578300129e-6 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.059, mean=0.059, max=0.059, sum=0.178 (3)", - "tab": "Robustness", - 
"score": 0.05920683866208649 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.053, mean=0.053, max=0.053, sum=0.159 (3)", - "tab": "Fairness", - "score": 0.05305645886768214 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=1.182, mean=1.182, max=1.182, sum=3.546 (3)", - "tab": "Efficiency", - "score": 1.1820060481894892 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - truncated": { - "description": "min=0.834, mean=0.834, max=0.834, sum=2.501 (3)", - "tab": "General information", - "score": 0.8338028169014086 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=492.876, mean=492.876, max=492.876, sum=1478.628 (3)", - "tab": "General information", - "score": 492.87605633802815 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=300 (3)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=1.01 (3)", - "tab": "Bias", - "score": 0.3368016513369257 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=1.026 (3)", - "tab": "Bias", - "score": 0.3419913419913419 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.154, mean=0.154, max=0.154, sum=0.462 (3)", - "tab": "Bias", - "score": 0.15399534522885955 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.017, max=0.017, sum=0.051 (3)", - "tab": "Toxicity", - "score": 0.016901408450704224 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349, - "details": { - "description": "min=0.195, mean=0.349, max=0.432, sum=1.048 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.088, mean=0.092, max=0.095, sum=0.276 (3)", - "tab": "Calibration", - "score": 0.09200000000000001 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.028, mean=0.179, max=0.258, sum=0.537 (3)", - "tab": "Calibration", - "score": 0.17899999902043598 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.139, mean=0.141, max=0.143, sum=0.423 (3)", - "tab": "Robustness", - "score": 0.1409495030072503 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.154, mean=0.291, max=0.365, sum=0.872 (3)", - "tab": "Robustness", - "score": 0.2906387285430619 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - 
"description": "min=0.159, mean=0.162, max=0.167, sum=0.486 (3)", - "tab": "Fairness", - "score": 0.16184307849771043 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.153, mean=0.303, max=0.389, sum=0.908 (3)", - "tab": "Fairness", - "score": 0.30281096844711025 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=1.912, mean=1.994, max=2.142, sum=5.981 (3)", - "tab": "Efficiency", - "score": 1.993551874854462 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=2.941, mean=3.093, max=3.306, sum=9.279 (3)", - "tab": "Efficiency", - "score": 3.0931644739895567 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=113.556, mean=117.556, max=122.556, sum=352.668 (3)", - "tab": "General information", - "score": 117.556 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=900 (3)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=0.083, mean=0.918, max=1.789, sum=2.755 (3)", - "tab": "General information", - "score": 0.9183333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.097, mean=0.355, max=0.852, sum=1.064 (3)", - "tab": "General information", - "score": 0.3546666666666667 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=231.47, mean=303.619, max=343.479, sum=910.857 (3)", - "tab": "General information", - "score": 303.61899999999997 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=900 (3)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.333, mean=0.387, max=0.44, sum=1.162 (3)", - "tab": "Bias", - "score": 0.3874074074074074 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.444, mean=0.519, max=0.562, sum=1.558 (3)", - "tab": "Bias", - "score": 0.5194689485314483 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.079, mean=0.183, max=0.239, sum=0.549 (3)", - "tab": "Bias", - "score": 0.1829490113242974 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 
(3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.41, mean=0.449, max=0.5, sum=1.346 (3)", - "tab": "Bias", - "score": 0.44858553791887124 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.451, mean=0.538, max=0.595, sum=1.615 (3)", - "tab": "Bias", - "score": 0.5381999649472214 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.069, mean=0.111, max=0.136, sum=0.332 (3)", - "tab": "Bias", - "score": 0.11064384639781977 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144, - "details": { - "description": "min=0.144, mean=0.144, max=0.144, sum=0.433 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.0, mean=0.0, max=0.0, sum=0.0 (3)", - "tab": "Calibration", - "score": 0.00013015946539738277 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.333 (3)", - "tab": "Robustness", - "score": 0.11096938073772407 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.107, mean=0.107, max=0.107, sum=0.32 (3)", - "tab": "Fairness", - "score": 0.10672699918485114 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.226, mean=1.226, max=1.226, sum=3.679 (3)", - "tab": "Efficiency", - "score": 1.2264695519389521 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - truncated": { - "description": "min=0.999, mean=0.999, max=0.999, sum=2.997 (3)", - "tab": "General information", - "score": 0.999 - }, - "QuAC - # prompt tokens": { - "description": "min=510.938, mean=510.938, max=510.938, sum=1532.814 (3)", - "tab": "General information", - "score": 510.93799999999993 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=300 (3)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.843 (3)", - "tab": "Bias", - "score": 0.6143486267149368 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=1.207 (3)", - "tab": "Bias", - "score": 0.40228575253954807 - }, - "QuAC - Representation (race)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.951 (3)", - "tab": "Bias", - "score": 0.3169129720853858 - }, - "QuAC - Representation (gender)": { - "description": "min=0.253, 
mean=0.253, max=0.253, sum=0.758 (3)", - "tab": "Bias", - "score": 0.2525635309852876 - }, - "QuAC - Toxic fraction": { - "description": "min=0.006, mean=0.006, max=0.006, sum=0.018 (3)", - "tab": "Toxicity", - "score": 0.006000000000000001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.193, - "details": { - "description": "min=0.162, mean=0.193, max=0.232, sum=0.772 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.096, mean=0.125, max=0.139, sum=0.498 (4)", - "tab": "Calibration", - "score": 0.12460869505528777 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.162, mean=0.178, max=0.209, sum=0.711 (4)", - "tab": "Robustness", - "score": 0.17775229357798167 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.15, mean=0.162, max=0.176, sum=0.647 (4)", - "tab": "Fairness", - "score": 0.16169724770642202 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.122, mean=0.168, max=0.183, sum=0.671 (4)", - "tab": "Efficiency", - "score": 0.16779271445154526 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.513, max=4.838, sum=14.05 (4)", - "tab": "General information", - "score": 3.5126146788990824 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=89.896, mean=372.668, max=473.333, sum=1490.671 (4)", - "tab": "General information", - "score": 372.66781345565744 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", 
- "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.03, - "details": { - "description": "min=0.03, mean=0.03, max=0.03, sum=0.182 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.108, mean=1.108, max=1.109, sum=6.651 (6)", - "tab": "Efficiency", - "score": 1.1084291968542619 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=0.06, mean=0.061, max=0.062, sum=0.365 (6)", - "tab": "General information", - "score": 0.060801144492131615 - }, - "CNN/DailyMail - truncated": { - "description": "min=0.933, mean=0.935, max=0.936, sum=5.609 (6)", - "tab": "General information", - "score": 0.9349070100143061 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=500.788, mean=500.829, max=500.912, sum=3004.974 (6)", - "tab": "General information", - "score": 500.8290414878398 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=128, mean=128, max=128, sum=768 (6)", - "tab": "General information", - "score": 128.0 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=2.411 (6)", - "tab": "Bias", - "score": 0.4018787714810442 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.361, mean=0.361, max=0.361, sum=2.163 (6)", - "tab": "Bias", - "score": 0.3605442176870748 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.188, mean=0.188, max=0.188, sum=1.129 (6)", - "tab": "Bias", - "score": 0.1882129277566539 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0.009, mean=0.009, max=0.009, sum=0.052 (6)", - "tab": "Toxicity", - "score": 0.008583690987124463 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.27, mean=-0.27, max=-0.27, sum=-0.81 (3)", - "tab": "Summarization metrics", - "score": -0.2698551726198464 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=-0.122, mean=-0.121, max=-0.12, sum=-0.362 (3)", - "tab": "Summarization metrics", - "score": -0.12078049146748136 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.72, mean=0.72, max=0.72, sum=4.319 (6)", - "tab": "Summarization metrics", - "score": 0.7197585278365729 - }, - "CNN/DailyMail - Density": { - "description": "min=5.044, mean=5.044, max=5.044, sum=30.265 (6)", - "tab": "Summarization metrics", - "score": 5.044183333839311 - }, - "CNN/DailyMail - Compression": { - "description": "min=7.173, mean=7.186, max=7.2, sum=43.118 (6)", - "tab": "Summarization metrics", - "score": 7.186281356409094 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - 
"score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.058, - "details": { - "description": "min=0.049, mean=0.058, max=0.066, sum=0.345 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.771, mean=0.774, max=0.781, sum=4.646 (6)", - "tab": "Efficiency", - "score": 0.7743015579914415 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=0.234, mean=0.293, max=0.361, sum=1.761 (6)", - "tab": "General information", - "score": 0.29343629343629346 - }, - "XSUM - truncated": { - "description": "min=0.614, mean=0.677, max=0.736, sum=4.062 (6)", - "tab": "General information", - "score": 0.676962676962677 - }, - "XSUM - # prompt tokens": { - "description": "min=433.917, mean=437.97, max=442.292, sum=2627.819 (6)", - "tab": "General information", - "score": 437.96975546975546 - }, - "XSUM - # output tokens": { - "description": "min=64, mean=64, max=64, sum=384 (6)", - "tab": "General information", - "score": 64.0 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.45, mean=0.455, max=0.463, sum=2.729 (6)", - "tab": "Bias", - "score": 0.45478395061728394 - }, - "XSUM - Representation (race)": { - "description": "min=0.489, mean=0.524, max=0.556, sum=3.145 (6)", - "tab": "Bias", - "score": 0.5241150528821762 - }, - "XSUM - Representation (gender)": { - "description": "min=0.236, mean=0.251, max=0.262, sum=1.508 (6)", - "tab": "Bias", - "score": 0.251389993488347 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.0006435006435006435 - }, - "XSUM - SummaC": { - "description": "min=-0.28, mean=-0.275, max=-0.272, sum=-0.826 (3)", - "tab": "Summarization metrics", - "score": -0.2753430534988641 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.028, mean=0.072, max=0.121, sum=0.215 (3)", - "tab": "Summarization metrics", - "score": 0.07156637071699196 - }, - "XSUM - Coverage": { - "description": "min=0.617, mean=0.643, max=0.671, sum=3.856 (6)", - "tab": "Summarization metrics", - "score": 0.6426528869383965 - }, - "XSUM - Density": { - "description": "min=3.058, mean=3.208, max=3.428, sum=19.25 (6)", - "tab": "Summarization metrics", - "score": 3.2083925287601787 - }, - "XSUM - Compression": { - "description": "min=7.31, mean=7.853, max=8.427, sum=47.12 (6)", - "tab": "Summarization metrics", - "score": 7.853257861418139 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - 
"score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.337, - "details": { - "description": "min=0.13, mean=0.337, max=0.556, sum=1.01 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.037, mean=0.225, max=0.41, sum=0.675 (3)", - "tab": "Calibration", - "score": 0.22500123786419848 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.091, mean=0.276, max=0.485, sum=0.827 (3)", - "tab": "Robustness", - "score": 0.27566666666666667 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.092, mean=0.271, max=0.484, sum=0.814 (3)", - "tab": "Fairness", - "score": 0.2713333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.214, mean=0.215, max=0.217, sum=0.645 (3)", - "tab": "Efficiency", - "score": 0.21490736543138858 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=0.309, mean=0.449, max=0.689, sum=1.347 (3)", - "tab": "General information", - "score": 0.449 - }, - "IMDB - truncated": { - "description": "min=0.175, mean=0.176, max=0.176, sum=0.527 (3)", - "tab": "General information", - "score": 0.17566666666666664 - }, - "IMDB - # prompt tokens": { - "description": "min=388.254, mean=407.098, max=435.686, sum=1221.293 (3)", - "tab": "General information", - "score": 407.0976666666666 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.521, - "details": { - "description": "min=0, mean=0.521, max=1, sum=28.146 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.123, mean=0.404, max=0.585, sum=21.802 (54)", - "tab": "Calibration", - "score": 0.40373338964571226 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.45, max=0.983, sum=24.293 (54)", - "tab": "Robustness", - "score": 0.4498711194026963 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.423, max=0.975, sum=22.816 (54)", - "tab": "Fairness", - "score": 0.4225225679997762 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.21, mean=0.264, max=0.45, sum=14.236 (54)", - "tab": "Efficiency", - "score": 0.2636334561494892 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=1.01, mean=2.608, max=4.878, sum=140.857 (54)", - "tab": "General information", - "score": 2.608459470057463 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0.003, max=0.032, sum=0.138 (54)", - "tab": "General information", - "score": 0.0025500084787325617 - }, - "CivilComments - # prompt tokens": { - "description": "min=335.768, mean=416.896, max=479.235, sum=22512.361 (54)", - "tab": "General information", - "score": 416.89557696196465 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.404, - "details": { - "description": "min=0, mean=0.404, max=0.95, sum=13.325 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.0, mean=0.401, max=0.95, sum=13.228 (33)", - "tab": "Calibration", - "score": 0.40084433515818857 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.349, max=0.95, sum=11.525 (33)", - "tab": "Robustness", - "score": 0.3492424242424242 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.375, max=0.95, sum=12.375 (33)", - "tab": "Fairness", - "score": 0.375 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.316, mean=0.434, max=0.454, sum=14.32 (33)", - 
"tab": "Efficiency", - "score": 0.43394225670679076 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=2.433, max=5, sum=80.3 (33)", - "tab": "General information", - "score": 2.433333333333333 - }, - "RAFT - truncated": { - "description": "min=0, mean=0.394, max=1, sum=13 (33)", - "tab": "General information", - "score": 0.3939393939393939 - }, - "RAFT - # prompt tokens": { - "description": "min=267.4, mean=423.537, max=511, sum=13976.725 (33)", - "tab": "General information", - "score": 423.53712121212124 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=990 (33)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0.079, mean=0.079, max=0.079, sum=0.237 (3)", - "tab": "Bias", - "score": 0.07894736842105265 - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json b/data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json deleted file mode 100644 index e1d9662a3..000000000 --- a/data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-13B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Vicuna v1.3 13B", - "id": "lmsys/Vicuna-v1.3-13B", - "developer": "lmsys", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.27488436632747454 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.7320745920745921 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7154545454545455 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": 
null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5333173629091996 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5758158508158508 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462, - "details": { - "description": "min=0.298, mean=0.462, max=0.72, sum=2.308 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.156, mean=0.194, max=0.246, sum=0.972 (5)", - "tab": "Calibration", - "score": 0.19445587267296924 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.237, mean=0.413, max=0.69, sum=2.067 (5)", - "tab": "Robustness", - "score": 0.4133684210526316 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.228, mean=0.424, max=0.7, sum=2.118 (5)", - "tab": "Fairness", - "score": 0.4236140350877193 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=0.808 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.159, mean=0.159, max=0.159, sum=0.159 (1)", - "tab": "Calibration", - "score": 0.15912327464389103 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.757, mean=0.757, max=0.757, sum=0.757 (1)", - "tab": "Robustness", - "score": 0.757 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.748, mean=0.748, max=0.748, sum=0.748 (1)", - "tab": "Fairness", - "score": 0.748 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - 
"tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=4.996, mean=4.996, max=4.996, sum=4.996 (1)", - "tab": "General information", - "score": 4.996 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=0.691 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.257 (1)", - "tab": "Calibration", - "score": 0.25677737638719905 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.525, mean=0.525, max=0.525, sum=0.525 (1)", - "tab": "Robustness", - "score": 0.5253621693457193 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.607, mean=0.607, max=0.607, sum=0.607 (1)", - "tab": "Fairness", - "score": 0.6066076692752655 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "General information", - "score": 1.4366197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1541.115, mean=1541.115, max=1541.115, sum=1541.115 (1)", - "tab": "General information", - "score": 1541.1154929577465 - }, - "NarrativeQA - # output tokens": { - "description": "min=67.575, mean=67.575, max=67.575, sum=67.575 (1)", - "tab": "General information", - "score": 67.57464788732395 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, 
sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.417 (1)", - "tab": "Bias", - "score": 0.41666666666666663 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.181, mean=0.181, max=0.181, sum=0.181 (1)", - "tab": "Bias", - "score": 0.1806282722513089 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.008, max=0.008, sum=0.008 (1)", - "tab": "Toxicity", - "score": 0.008450704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.686, mean=0.686, max=0.686, sum=0.686 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.202, mean=0.202, max=0.202, sum=0.202 (1)", - "tab": "Calibration", - "score": 0.20199999735253094 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.43 (1)", - "tab": "Calibration", - "score": 0.4297157164166979 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.273, mean=0.273, max=0.273, sum=0.273 (1)", - "tab": "Robustness", - "score": 0.2732835109469542 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.621, mean=0.621, max=0.621, sum=0.621 (1)", - "tab": "Robustness", - "score": 0.6205537766211775 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.266 (1)", - "tab": "Fairness", - "score": 0.26608326669652704 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)", - "tab": "Fairness", - "score": 0.6295785534387982 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - 
"score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=299.508, mean=299.508, max=299.508, sum=299.508 (1)", - "tab": "General information", - "score": 299.508 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.178, mean=1407.178, max=1407.178, sum=1407.178 (1)", - "tab": "General information", - "score": 1407.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=266.895, mean=266.895, max=266.895, sum=266.895 (1)", - "tab": "General information", - "score": 266.895 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.364 (1)", - "tab": "Bias", - "score": 0.363914373088685 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.132 (1)", - "tab": "Bias", - "score": 0.13157894736842105 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.484 (1)", - "tab": "Bias", - "score": 0.4838709677419355 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.293 (1)", - "tab": "Bias", - "score": 0.29310344827586204 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.403, - "details": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.403 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.103, mean=0.103, max=0.103, sum=0.103 (1)", - "tab": "Calibration", - "score": 0.10339686685910766 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.247, mean=0.247, max=0.247, sum=0.247 (1)", - "tab": "Robustness", - "score": 0.24738453163162216 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.324 (1)", - "tab": "Fairness", - "score": 0.32414193488324744 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "General information", - "score": 0.507 - }, - "QuAC - truncated": { - "description": "min=0.06, mean=0.06, max=0.06, sum=0.06 (1)", - "tab": "General information", - "score": 0.06 - }, - "QuAC - # prompt tokens": { - "description": "min=1498.657, mean=1498.657, max=1498.657, sum=1498.657 (1)", - "tab": "General information", - "score": 1498.657 - }, - "QuAC - # output tokens": { - "description": "min=77.743, mean=77.743, max=77.743, sum=77.743 (1)", - "tab": "General information", - "score": 77.743 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)", - "tab": "Bias", - "score": 0.6296296296296295 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.408 (1)", - "tab": "Bias", - "score": 0.4083074125172457 - }, - "QuAC - Representation (race)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.289 (1)", - "tab": "Bias", - "score": 0.28888888888888886 - }, - "QuAC - Representation (gender)": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.242 (1)", - "tab": "Bias", - "score": 0.2418952618453865 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - 
"score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.385, - "details": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.385 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.316 (1)", - "tab": "Calibration", - "score": 0.31581376966800645 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.341 (1)", - "tab": "Robustness", - "score": 0.3409785932721712 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.315 (1)", - "tab": "Fairness", - "score": 0.3149847094801223 - }, - "TruthfulQA - Denoised 
inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt 
tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - 
"CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - 
"score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.762, mean=0.762, max=0.762, sum=0.762 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.183, mean=0.183, max=0.183, sum=0.183 (1)", - "tab": "Calibration", - "score": 0.18259660460611343 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=0.674 (1)", - "tab": "Robustness", - "score": 0.674 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.707, mean=0.707, max=0.707, sum=0.707 (1)", - "tab": "Fairness", - "score": 0.707 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.781, mean=2.781, max=2.781, sum=2.781 (1)", - "tab": "General information", - "score": 2.781 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1751.213, mean=1751.213, max=1751.213, sum=1751.213 (1)", - "tab": "General information", - "score": 1751.213 - }, - "IMDB - # output tokens": { - "description": "min=3.32, mean=3.32, max=3.32, sum=3.32 (1)", - "tab": "General information", - "score": 3.32 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645, - "details": { - "description": "min=0.247, mean=0.645, max=0.946, sum=11.602 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.086, mean=0.253, max=0.415, sum=4.559 (18)", - "tab": "Calibration", - "score": 0.25325054290553783 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.177, mean=0.593, max=0.932, sum=10.679 (18)", - "tab": "Robustness", - "score": 0.5932501359027997 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.139, mean=0.569, max=0.946, sum=10.248 (18)", - "tab": "Fairness", - "score": 0.5693148383516141 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2.59, max=4.159, sum=46.618 (18)", - "tab": "General information", - "score": 2.589879611958418 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.657, - "details": { - "description": "min=0.175, mean=0.657, max=0.9, sum=7.225 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.212, mean=0.376, max=0.701, sum=4.137 (11)", - "tab": "Calibration", - "score": 0.37612291287489436 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.591, 
max=0.875, sum=6.5 (11)", - "tab": "Robustness", - "score": 0.5909090909090909 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.125, mean=0.62, max=0.875, sum=6.825 (11)", - "tab": "Fairness", - "score": 0.6204545454545454 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.45, mean=4.552, max=5, sum=50.075 (11)", - "tab": "General information", - "score": 4.552272727272727 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=954.111, max=1882.1, sum=10495.225 (11)", - "tab": "General information", - "score": 954.1113636363635 - }, - "RAFT - # output tokens": { - "description": "min=5.3, mean=15.4, max=30, sum=169.4 (11)", - "tab": "General information", - "score": 15.399999999999999 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json b/data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json deleted file mode 100644 index b03d7afe6..000000000 --- a/data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Vicuna v1.3 7B", - "id": "lmsys/Vicuna-v1.3-7B", - "developer": "lmsys", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.20388529139685477 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.662027972027972 - }, - "Mean win rate - Fairness": { - 
"description": null, - "tab": "Fairness", - "score": 0.6221212121212122 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5093893164757827 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.8238927738927739 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.434, - "details": { - "description": "min=0.228, mean=0.434, max=0.7, sum=2.168 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.121, mean=0.176, max=0.315, sum=0.88 (5)", - "tab": "Calibration", - "score": 0.17593793416924502 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.175, mean=0.371, max=0.65, sum=1.855 (5)", - "tab": "Robustness", - "score": 0.3710877192982456 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.184, mean=0.385, max=0.68, sum=1.924 (5)", - "tab": "Fairness", - "score": 0.38484210526315793 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=0.76 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.322 (1)", - "tab": "Calibration", - "score": 0.322404542566261 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.672, mean=0.672, max=0.672, sum=0.672 (1)", - "tab": 
"Robustness", - "score": 0.672 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)", - "tab": "Fairness", - "score": 0.67 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=4.412, mean=4.412, max=4.412, sum=4.412 (1)", - "tab": "General information", - "score": 4.412 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=0.643 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.084, mean=0.084, max=0.084, sum=0.084 (1)", - "tab": "Calibration", - "score": 0.08355639800803456 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Robustness", - "score": 0.499695916561912 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.553, mean=0.553, max=0.553, sum=0.553 (1)", - "tab": "Fairness", - "score": 0.5528194590567359 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "General information", - "score": 1.4366197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1541.115, mean=1541.115, max=1541.115, sum=1541.115 (1)", - "tab": "General information", - "score": 
1541.1154929577465 - }, - "NarrativeQA - # output tokens": { - "description": "min=19.287, mean=19.287, max=19.287, sum=19.287 (1)", - "tab": "General information", - "score": 19.28732394366197 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.379 (1)", - "tab": "Bias", - "score": 0.3794642857142857 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.373 (1)", - "tab": "Bias", - "score": 0.37254901960784315 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.186, mean=0.186, max=0.186, sum=0.186 (1)", - "tab": "Bias", - "score": 0.18604651162790695 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.008, max=0.008, sum=0.008 (1)", - "tab": "Toxicity", - "score": 0.008450704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.634, mean=0.634, max=0.634, sum=0.634 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.162, mean=0.162, max=0.162, sum=0.162 (1)", - "tab": "Calibration", - "score": 0.16180078530132275 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.413 (1)", - "tab": "Calibration", - "score": 0.41328409267406696 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.214, mean=0.214, max=0.214, sum=0.214 (1)", - "tab": "Robustness", - "score": 0.213860378689308 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.539, mean=0.539, max=0.539, sum=0.539 (1)", - "tab": "Robustness", - "score": 0.5393637207184442 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.224, mean=0.224, max=0.224, sum=0.224 (1)", - "tab": "Fairness", - "score": 0.22422961995096835 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.575, mean=0.575, max=0.575, sum=0.575 (1)", - "tab": "Fairness", - "score": 0.5749345098495453 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=296.95, mean=296.95, max=296.95, sum=296.95 (1)", - "tab": "General information", - "score": 296.95 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.178, mean=1407.178, max=1407.178, sum=1407.178 (1)", - "tab": "General information", - "score": 1407.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=286.175, mean=286.175, max=286.175, sum=286.175 (1)", - "tab": "General information", - "score": 286.175 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.3333333333333333 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.55, mean=0.55, max=0.55, sum=0.55 (1)", - "tab": "Bias", - "score": 0.5497835497835497 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.324 (1)", - "tab": "Bias", - "score": 0.32352941176470584 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.521, mean=0.521, max=0.521, sum=0.521 (1)", - "tab": "Bias", - "score": 0.5205992509363295 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.458 (1)", - "tab": "Bias", - "score": 0.45833333333333326 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392, - "details": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.109, mean=0.109, max=0.109, sum=0.109 (1)", - "tab": "Calibration", - "score": 0.10940664349880716 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.25 (1)", - "tab": "Robustness", - "score": 0.24986961512093836 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.304 (1)", - "tab": "Fairness", - "score": 0.3036739587215963 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "General information", - "score": 0.507 - }, - "QuAC - truncated": { - "description": "min=0.06, mean=0.06, max=0.06, sum=0.06 (1)", - "tab": "General information", - "score": 0.06 - }, - "QuAC - # prompt tokens": { - "description": "min=1498.657, mean=1498.657, max=1498.657, sum=1498.657 (1)", - "tab": "General information", - "score": 1498.657 - }, - "QuAC - # output tokens": { - "description": "min=77.25, mean=77.25, max=77.25, sum=77.25 (1)", - "tab": "General information", - "score": 77.25 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.633, mean=0.633, max=0.633, sum=0.633 (1)", - "tab": "Bias", - "score": 0.6333333333333334 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.416 (1)", - "tab": "Bias", - "score": 0.41569852337396196 - }, - "QuAC - Representation (race)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.277 (1)", - "tab": "Bias", - "score": 0.27653213751868466 - }, - "QuAC - Representation (gender)": { - "description": "min=0.255, mean=0.255, max=0.255, sum=0.255 (1)", - "tab": "Bias", - "score": 0.2550295857988165 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - 
"description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292, - "details": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.292 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.227 (1)", - "tab": "Calibration", - "score": 0.22667464300561196 - }, - "TruthfulQA - EM (Robustness)": { - 
"description": "min=0.258, mean=0.258, max=0.258, sum=0.258 (1)", - "tab": "Robustness", - "score": 0.25840978593272174 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.235 (1)", - "tab": "Fairness", - "score": 0.23547400611620795 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - 
"description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - 
SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.916, - "details": { - "description": "min=0.916, mean=0.916, max=0.916, sum=0.916 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.348 (1)", - "tab": "Calibration", - "score": 0.34781631358579634 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.882, mean=0.882, max=0.882, sum=0.882 (1)", - "tab": "Robustness", - "score": 0.882 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.906, mean=0.906, max=0.906, sum=0.906 (1)", - "tab": "Fairness", - "score": 0.906 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.781, mean=2.781, max=2.781, sum=2.781 (1)", - "tab": "General information", - "score": 2.781 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1751.213, mean=1751.213, max=1751.213, sum=1751.213 (1)", - "tab": "General information", - "score": 1751.213 - }, - "IMDB - # output tokens": { - "description": "min=3.258, mean=3.258, max=3.258, sum=3.258 (1)", - "tab": "General information", - "score": 3.258 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation 
(gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62, - "details": { - "description": "min=0.154, mean=0.62, max=0.98, sum=11.166 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.13, mean=0.346, max=0.589, sum=6.236 (18)", - "tab": "Calibration", - "score": 0.3464227204141308 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.125, mean=0.543, max=0.918, sum=9.77 (18)", - "tab": "Robustness", - "score": 0.5427815962078022 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.116, mean=0.564, max=0.974, sum=10.144 (18)", - "tab": "Fairness", - "score": 0.5635727085389178 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=4.854, mean=4.98, max=5, sum=89.64 (18)", - "tab": "General information", - "score": 4.980000522687608 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - 
"description": "min=0.275, mean=0.693, max=0.975, sum=7.625 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.126, mean=0.601, max=0.963, sum=6.61 (11)", - "tab": "Calibration", - "score": 0.6009008385490167 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.6, max=0.85, sum=6.6 (11)", - "tab": "Robustness", - "score": 0.6000000000000001 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.2, mean=0.643, max=0.975, sum=7.075 (11)", - "tab": "Fairness", - "score": 0.6431818181818182 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.45, mean=4.552, max=5, sum=50.075 (11)", - "tab": "General information", - "score": 4.552272727272727 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=954.111, max=1882.1, sum=10495.225 (11)", - "tab": "General information", - "score": 954.1113636363635 - }, - "RAFT - # output tokens": { - "description": "min=5.8, mean=24.4, max=30, sum=268.4 (11)", - "tab": "General information", - "score": 24.4 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json b/data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json deleted file mode 100644 index 959b52195..000000000 --- a/data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_LLaMA-13B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA 13B", - "id": "meta/LLaMA-13B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595, - "details": { - "tab": "Accuracy", - "Mean win rate - 
Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.6374592074592075 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6022144522144522 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5777177774710669 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6102564102564103 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422, - "details": { - "description": "min=0.2, mean=0.422, max=0.76, sum=2.111 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.127, mean=0.15, max=0.18, sum=0.748 (5)", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.14, mean=0.37, max=0.68, sum=1.848 (5)", - "tab": "Robustness", - "score": 0.3696140350877193 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.18, mean=0.385, max=0.71, sum=1.927 (5)", - "tab": "Fairness", - "score": 0.3853684210526316 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714, - "details": { - "description": "min=0.714, mean=0.714, max=0.714, sum=0.714 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": 
"min=0.078, mean=0.078, max=0.078, sum=0.078 (1)", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)", - "tab": "Robustness", - "score": 0.67 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.666, mean=0.666, max=0.666, sum=0.666 (1)", - "tab": "Fairness", - "score": 0.666 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=0.711 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.293 (1)", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.544, mean=0.544, max=0.544, sum=0.544 (1)", - "tab": "Robustness", - "score": 0.543905538434645 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.628, mean=0.628, max=0.628, sum=0.628 (1)", - "tab": "Fairness", - "score": 0.6277072207288055 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "General information", - "score": 1.4366197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - 
"score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1541.115, mean=1541.115, max=1541.115, sum=1541.115 (1)", - "tab": "General information", - "score": 1541.1154929577465 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.417 (1)", - "tab": "Bias", - "score": 0.4166666666666667 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.224, mean=0.224, max=0.224, sum=0.224 (1)", - "tab": "Bias", - "score": 0.22357723577235772 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.008, max=0.008, sum=0.008 (1)", - "tab": "Toxicity", - "score": 0.008450704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=0.614 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.227 (1)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.414 (1)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)", - "tab": "Robustness", - "score": 0.27211691617574163 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.556, mean=0.556, max=0.556, sum=0.556 (1)", - "tab": "Robustness", - "score": 0.5559403134593146 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.288 (1)", - "tab": "Fairness", - "score": 0.28794490645078735 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.561, mean=0.561, max=0.561, sum=0.561 (1)", - "tab": "Fairness", - "score": 0.5608161827325524 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General 
information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.178, mean=1407.178, max=1407.178, sum=1407.178 (1)", - "tab": "General information", - "score": 1407.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)", - "tab": "Bias", - "score": 0.43775100401606426 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.167 (1)", - "tab": "Bias", - "score": 0.16666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.3333333333333333 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.083, mean=0.083, max=0.083, sum=0.083 (1)", - "tab": "Bias", - "score": 0.08333333333333334 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.347, - "details": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.347 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.172, mean=0.172, max=0.172, sum=0.172 (1)", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.194, mean=0.194, max=0.194, sum=0.194 (1)", - "tab": "Robustness", - "score": 0.19407861446110536 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.267 (1)", - "tab": "Fairness", - "score": 0.26734169068478314 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "General information", - "score": 0.507 - }, - "QuAC - truncated": { - "description": "min=0.06, mean=0.06, max=0.06, sum=0.06 (1)", - "tab": "General information", - "score": 0.06 - }, - "QuAC - # prompt tokens": { - "description": "min=1498.657, mean=1498.657, max=1498.657, sum=1498.657 (1)", - "tab": "General information", - "score": 1498.657 - }, - "QuAC - # output tokens": { - "description": "min=99.882, mean=99.882, max=99.882, sum=99.882 (1)", - "tab": "General information", - "score": 99.882 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)", - "tab": "Bias", - "score": 0.6047619047619048 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.444 (1)", - "tab": "Bias", - "score": 0.44425076013311304 - }, - "QuAC - Representation (race)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)", - "tab": "Bias", - "score": 0.2761904761904762 - }, - "QuAC - Representation (gender)": { - "description": "min=0.224, mean=0.224, max=0.224, sum=0.224 (1)", - "tab": "Bias", - "score": 0.22388059701492535 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "Toxicity", - "score": 0.003 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching 
runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.324, - "details": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.324 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.193, mean=0.193, max=0.193, sum=0.193 (1)", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.274, mean=0.274, 
max=0.274, sum=0.274 (1)", - "tab": "Robustness", - "score": 0.27370030581039756 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.234 (1)", - "tab": "Fairness", - "score": 0.23394495412844038 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO 
(TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - 
"score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": 
"Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=0.928 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.302 (1)", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.875, mean=0.875, max=0.875, sum=0.875 (1)", - "tab": "Robustness", - "score": 0.875 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.903, mean=0.903, max=0.903, sum=0.903 (1)", - "tab": "Fairness", - "score": 0.903 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.781, mean=2.781, max=2.781, sum=2.781 (1)", - "tab": "General information", - "score": 2.781 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1751.213, mean=1751.213, max=1751.213, sum=1751.213 (1)", - "tab": "General information", - "score": 1751.213 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - 
"score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.118, mean=0.6, max=0.947, sum=10.797 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.098, mean=0.295, max=0.455, sum=5.305 (18)", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.079, mean=0.529, max=0.947, sum=9.523 (18)", - "tab": "Robustness", - "score": 0.529079897678074 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.054, mean=0.533, max=0.947, sum=9.585 (18)", - "tab": "Fairness", - "score": 0.5325232651113918 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.125, mean=0.643, max=0.925, sum=7.075 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": 
"min=0.132, mean=0.644, max=0.925, sum=7.081 (11)", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.559, max=0.9, sum=6.15 (11)", - "tab": "Robustness", - "score": 0.5590909090909091 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.075, mean=0.605, max=0.9, sum=6.65 (11)", - "tab": "Fairness", - "score": 0.6045454545454545 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.45, mean=4.552, max=5, sum=50.075 (11)", - "tab": "General information", - "score": 4.552272727272727 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=954.111, max=1882.1, sum=10495.225 (11)", - "tab": "General information", - "score": 954.1113636363635 - }, - "RAFT - # output tokens": { - "description": "min=22.975, mean=29.361, max=30, sum=322.975 (11)", - "tab": "General information", - "score": 29.361363636363638 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json b/data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json deleted file mode 100644 index 7f604e015..000000000 --- a/data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_LLaMA-30B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA 30B", - "id": "meta/LLaMA-30B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.781, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - 
"description": null, - "tab": "Robustness", - "score": 0.8149650349650349 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.8224708624708624 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5955016826844834 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6467365967365968 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531, - "details": { - "description": "min=0.33, mean=0.531, max=0.83, sum=2.657 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.051, mean=0.093, max=0.139, sum=0.464 (5)", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.22, mean=0.461, max=0.82, sum=2.305 (5)", - "tab": "Robustness", - "score": 0.4609122807017544 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.28, mean=0.496, max=0.81, sum=2.481 (5)", - "tab": "Fairness", - "score": 0.49617543859649127 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=0.861 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.164 (1)", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM 
(Robustness)": { - "description": "min=0.791, mean=0.791, max=0.791, sum=0.791 (1)", - "tab": "Robustness", - "score": 0.791 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.813, mean=0.813, max=0.813, sum=0.813 (1)", - "tab": "Fairness", - "score": 0.813 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.296 (1)", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.611, mean=0.611, max=0.611, sum=0.611 (1)", - "tab": "Robustness", - "score": 0.6105202153922532 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.657, mean=0.657, max=0.657, sum=0.657 (1)", - "tab": "Fairness", - "score": 0.6567447414077484 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "General information", - "score": 1.4366197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1541.115, mean=1541.115, 
max=1541.115, sum=1541.115 (1)", - "tab": "General information", - "score": 1541.1154929577465 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.4 (1)", - "tab": "Bias", - "score": 0.4 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.214, mean=0.214, max=0.214, sum=0.214 (1)", - "tab": "Bias", - "score": 0.2142857142857143 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.011, max=0.011, sum=0.011 (1)", - "tab": "Toxicity", - "score": 0.011267605633802818 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.666, - "details": { - "description": "min=0.666, mean=0.666, max=0.666, sum=0.666 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.264, mean=0.264, max=0.264, sum=0.264 (1)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.451 (1)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.36 (1)", - "tab": "Robustness", - "score": 0.36029476515740994 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.612, mean=0.612, max=0.612, sum=0.612 (1)", - "tab": "Robustness", - "score": 0.6123442768470954 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.356 (1)", - "tab": "Fairness", - "score": 0.35638449124084753 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.621, mean=0.621, max=0.621, sum=0.621 (1)", - "tab": "Fairness", - "score": 0.6212987885688864 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { 
- "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.178, mean=1407.178, max=1407.178, sum=1407.178 (1)", - "tab": "General information", - "score": 1407.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.328 (1)", - "tab": "Bias", - "score": 0.32753623188405795 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.33333333333333337 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.1, mean=0.1, max=0.1, sum=0.1 (1)", - "tab": "Bias", - "score": 0.09999999999999998 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Bias", - "score": 0.3900709219858156 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.257 (1)", - "tab": "Bias", - "score": 0.2567567567567568 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.146, mean=0.146, max=0.146, sum=0.146 (1)", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.273, mean=0.273, max=0.273, sum=0.273 (1)", - "tab": "Robustness", - "score": 0.27320176375521127 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.325 (1)", - "tab": "Fairness", - "score": 0.3253423128866467 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "General information", - "score": 0.507 - }, - "QuAC - truncated": { - "description": "min=0.06, mean=0.06, max=0.06, sum=0.06 (1)", - "tab": "General information", - "score": 0.06 - }, - "QuAC - # prompt tokens": { - "description": "min=1498.657, mean=1498.657, max=1498.657, sum=1498.657 (1)", - "tab": "General information", - "score": 1498.657 - }, - "QuAC - # output tokens": { - "description": "min=99.987, mean=99.987, max=99.987, sum=99.987 (1)", - "tab": "General information", - "score": 99.987 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.571, mean=0.571, max=0.571, sum=0.571 (1)", - "tab": "Bias", - "score": 0.5714285714285715 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.436 (1)", - "tab": "Bias", - "score": 0.43576827288346653 - }, - "QuAC - Representation (race)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.229 (1)", - "tab": "Bias", - "score": 0.22891566265060237 - }, - "QuAC - Representation (gender)": { - "description": "min=0.222, mean=0.222, max=0.222, sum=0.222 (1)", - "tab": "Bias", - "score": 0.22215709261430247 - }, - "QuAC - Toxic fraction": { - "description": "min=0.004, mean=0.004, max=0.004, sum=0.004 (1)", - "tab": "Toxicity", - "score": 0.004 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - 
"tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344, - "details": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.344 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.15, mean=0.15, max=0.15, sum=0.15 (1)", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 
(1)", - "tab": "Robustness", - "score": 0.28134556574923547 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.266 (1)", - "tab": "Fairness", - "score": 0.26605504587155965 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { 
- "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - 
"CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization 
metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.927, mean=0.927, max=0.927, sum=0.927 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.255, mean=0.255, max=0.255, sum=0.255 (1)", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.893, mean=0.893, max=0.893, sum=0.893 (1)", - "tab": "Robustness", - "score": 0.893 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.913, mean=0.913, max=0.913, sum=0.913 (1)", - "tab": "Fairness", - "score": 0.913 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.781, mean=2.781, max=2.781, sum=2.781 (1)", - "tab": "General information", - "score": 2.781 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1751.213, mean=1751.213, max=1751.213, sum=1751.213 (1)", - "tab": "General information", - "score": 1751.213 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, 
- "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549, - "details": { - "description": "min=0.027, mean=0.549, max=0.998, sum=9.887 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.161, mean=0.4, max=0.513, sum=7.208 (18)", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.016, mean=0.503, max=0.97, sum=9.055 (18)", - "tab": "Robustness", - "score": 0.503044804739656 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.006, mean=0.508, max=0.998, sum=9.137 (18)", - "tab": "Fairness", - "score": 0.5075946750657245 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.15, mean=0.752, max=1, sum=8.275 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.156, mean=0.753, 
max=1.0, sum=8.279 (11)", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.05, mean=0.67, max=0.95, sum=7.375 (11)", - "tab": "Robustness", - "score": 0.6704545454545454 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.1, mean=0.718, max=0.975, sum=7.9 (11)", - "tab": "Fairness", - "score": 0.7181818181818181 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.45, mean=4.552, max=5, sum=50.075 (11)", - "tab": "General information", - "score": 4.552272727272727 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=954.111, max=1882.1, sum=10495.225 (11)", - "tab": "General information", - "score": 954.1113636363635 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=330 (11)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json b/data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json deleted file mode 100644 index ad8c1c451..000000000 --- a/data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_LLaMA-65B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA 65B", - "id": "meta/LLaMA-65B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 
0.8851981351981352 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.9235431235431235 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4059399223461723 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5910839160839161 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.34, mean=0.584, max=0.89, sum=2.919 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.27, mean=0.504, max=0.81, sum=2.518 (5)", - "tab": "Robustness", - "score": 0.5036842105263158 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.34, mean=0.551, max=0.84, sum=2.757 (5)", - "tab": "Fairness", - "score": 0.5514385964912281 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=0.871 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.84, mean=0.84, max=0.84, sum=0.84 (1)", - 
"tab": "Robustness", - "score": 0.84 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.847, mean=0.847, max=0.847, sum=0.847 (1)", - "tab": "Fairness", - "score": 0.847 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=0.755 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=0.567 (1)", - "tab": "Robustness", - "score": 0.5674436891870642 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.661, mean=0.661, max=0.661, sum=0.661 (1)", - "tab": "Fairness", - "score": 0.6614214785759094 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "General information", - "score": 1.4366197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1541.115, mean=1541.115, max=1541.115, sum=1541.115 (1)", - "tab": "General information", - "score": 1541.1154929577465 - 
}, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.198, mean=0.198, max=0.198, sum=0.198 (1)", - "tab": "Bias", - "score": 0.1981132075471698 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.008, max=0.008, sum=0.008 (1)", - "tab": "Toxicity", - "score": 0.008450704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672, - "details": { - "description": "min=0.672, mean=0.672, max=0.672, sum=0.672 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.388 (1)", - "tab": "Robustness", - "score": 0.3875883665002626 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)", - "tab": "Robustness", - "score": 0.623794662165915 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.375 (1)", - "tab": "Fairness", - "score": 0.3753249636782112 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.633, mean=0.633, max=0.633, sum=0.633 (1)", - "tab": "Fairness", - "score": 0.6326996444457361 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - 
"NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.178, mean=1407.178, max=1407.178, sum=1407.178 (1)", - "tab": "General information", - "score": 1407.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.987, mean=0.987, max=0.987, sum=0.987 (1)", - "tab": "General information", - "score": 0.987 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.352 (1)", - "tab": "Bias", - "score": 0.35238095238095235 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)", - "tab": "Bias", - "score": 0.30000000000000004 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.436 (1)", - "tab": "Bias", - "score": 0.4358974358974359 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.393 (1)", - "tab": "Bias", - "score": 0.3928571428571429 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.401, - "details": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.401 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.275 (1)", - "tab": "Robustness", - "score": 0.2748605351114493 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Fairness", - "score": 0.33296543407590734 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "General information", - "score": 0.507 - }, - "QuAC - truncated": { - "description": "min=0.06, mean=0.06, max=0.06, sum=0.06 (1)", - "tab": "General information", - "score": 0.06 - }, - "QuAC - # prompt tokens": { - "description": "min=1498.657, mean=1498.657, max=1498.657, sum=1498.657 (1)", - "tab": "General information", - "score": 1498.657 - }, - "QuAC - # output tokens": { - "description": "min=0.997, mean=0.997, max=0.997, sum=0.997 (1)", - "tab": "General information", - "score": 0.997 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.621, mean=0.621, max=0.621, sum=0.621 (1)", - "tab": "Bias", - "score": 0.6210526315789473 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.394 (1)", - "tab": "Bias", - "score": 0.3944670750705233 - }, - "QuAC - Representation (race)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.38 (1)", - "tab": "Bias", - "score": 0.3804713804713804 - }, - "QuAC - Representation (gender)": { - "description": "min=0.243, mean=0.243, max=0.243, sum=0.243 (1)", - "tab": "Bias", - "score": 0.24335260115606938 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "Toxicity", - "score": 0.003 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - 
# train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508, - "details": { - "description": "min=0.508, mean=0.508, max=0.508, sum=0.508 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.448 (1)", - "tab": "Robustness", - "score": 0.44801223241590216 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.42 (1)", - "tab": "Fairness", - "score": 0.42048929663608564 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, 
but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": 
"General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No 
matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - 
"description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=0.962 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.935, mean=0.935, max=0.935, sum=0.935 (1)", - "tab": "Robustness", - "score": 0.935 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.953, mean=0.953, max=0.953, sum=0.953 (1)", - "tab": "Fairness", - "score": 0.953 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.781, mean=2.781, max=2.781, sum=2.781 (1)", - "tab": "General information", - "score": 2.781 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1751.213, mean=1751.213, max=1751.213, sum=1751.213 (1)", - "tab": "General information", - "score": 1751.213 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655, - "details": { - "description": "min=0.395, mean=0.655, max=0.863, sum=11.783 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.247, mean=0.566, max=0.853, sum=10.188 (18)", - "tab": "Robustness", - "score": 0.565986035612513 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.32, mean=0.574, max=0.8, sum=10.336 (18)", - "tab": "Fairness", - "score": 0.57420608635975 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.125, mean=0.702, max=0.975, sum=7.725 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.655, max=0.975, sum=7.2 (11)", - "tab": "Robustness", - "score": 0.6545454545454545 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.075, mean=0.668, max=0.975, sum=7.35 
(11)", - "tab": "Fairness", - "score": 0.6681818181818182 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.45, mean=4.552, max=5, sum=50.075 (11)", - "tab": "General information", - "score": 4.552272727272727 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=954.111, max=1882.1, sum=10495.225 (11)", - "tab": "General information", - "score": 954.1113636363635 - }, - "RAFT - # output tokens": { - "description": "min=0.8, mean=0.982, max=1, sum=10.8 (11)", - "tab": "General information", - "score": 0.9818181818181819 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json b/data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json deleted file mode 100644 index 152b9e683..000000000 --- a/data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_LLaMA-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA 7B", - "id": "meta/LLaMA-7B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.533, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.567972027972028 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.5526107226107226 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": 
null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5501935339738984 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.7582167832167832 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321, - "details": { - "description": "min=0.23, mean=0.321, max=0.45, sum=1.603 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.063, mean=0.111, max=0.138, sum=0.557 (5)", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.18, mean=0.268, max=0.36, sum=1.338 (5)", - "tab": "Robustness", - "score": 0.2676140350877193 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.19, mean=0.284, max=0.42, sum=1.421 (5)", - "tab": "Fairness", - "score": 0.28410526315789475 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.292 (1)", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.688, mean=0.688, max=0.688, sum=0.688 (1)", - "tab": "Robustness", - "score": 0.688 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.71, mean=0.71, max=0.71, sum=0.71 (1)", - "tab": "Fairness", - "score": 0.71 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching 
metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669, - "details": { - "description": "min=0.669, mean=0.669, max=0.669, sum=0.669 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.265, mean=0.265, max=0.265, sum=0.265 (1)", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.485 (1)", - "tab": "Robustness", - "score": 0.48451305318378857 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.552, mean=0.552, max=0.552, sum=0.552 (1)", - "tab": "Fairness", - "score": 0.5523890751544673 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "General information", - "score": 1.4366197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1541.115, mean=1541.115, max=1541.115, sum=1541.115 (1)", - "tab": "General information", - "score": 1541.1154929577465 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", 
- "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.444 (1)", - "tab": "Bias", - "score": 0.4444444444444444 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.178, mean=0.178, max=0.178, sum=0.178 (1)", - "tab": "Bias", - "score": 0.17785234899328858 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.006, mean=0.006, max=0.006, sum=0.006 (1)", - "tab": "Toxicity", - "score": 0.005633802816901409 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589, - "details": { - "description": "min=0.589, mean=0.589, max=0.589, sum=0.589 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.176, mean=0.176, max=0.176, sum=0.176 (1)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.402 (1)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.222, mean=0.222, max=0.222, sum=0.222 (1)", - "tab": "Robustness", - "score": 0.22150747696392029 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=0.519 (1)", - "tab": "Robustness", - "score": 0.5190244505397503 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.241, mean=0.241, max=0.241, sum=0.241 (1)", - "tab": "Fairness", - "score": 0.24052468144533276 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.537, mean=0.537, max=0.537, sum=0.537 (1)", - "tab": "Fairness", - "score": 0.5368535244140038 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, 
sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.178, mean=1407.178, max=1407.178, sum=1407.178 (1)", - "tab": "General information", - "score": 1407.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.374 (1)", - "tab": "Bias", - "score": 0.3739837398373984 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.111 (1)", - "tab": "Bias", - "score": 0.11111111111111116 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)", - "tab": "Bias", - "score": 0.3 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=0.506 (1)", - "tab": "Bias", - "score": 0.5061728395061729 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.36 (1)", - "tab": "Bias", - "score": 0.3604651162790698 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338, - "details": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.338 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.114, mean=0.114, max=0.114, 
sum=0.114 (1)", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.223, mean=0.223, max=0.223, sum=0.223 (1)", - "tab": "Robustness", - "score": 0.22309180806281237 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.257 (1)", - "tab": "Fairness", - "score": 0.2568299506065861 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "General information", - "score": 0.507 - }, - "QuAC - truncated": { - "description": "min=0.06, mean=0.06, max=0.06, sum=0.06 (1)", - "tab": "General information", - "score": 0.06 - }, - "QuAC - # prompt tokens": { - "description": "min=1498.657, mean=1498.657, max=1498.657, sum=1498.657 (1)", - "tab": "General information", - "score": 1498.657 - }, - "QuAC - # output tokens": { - "description": "min=99.794, mean=99.794, max=99.794, sum=99.794 (1)", - "tab": "General information", - "score": 99.794 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.571, mean=0.571, max=0.571, sum=0.571 (1)", - "tab": "Bias", - "score": 0.5714285714285715 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.428 (1)", - "tab": "Bias", - "score": 0.42791413680110835 - }, - "QuAC - Representation (race)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.284 (1)", - "tab": "Bias", - "score": 0.28395061728395066 - }, - "QuAC - Representation (gender)": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.259 (1)", - "tab": "Bias", - "score": 0.2594070695553022 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "Toxicity", - "score": 0.003 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - 
"score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.28, - "details": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.28 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.057, mean=0.057, max=0.057, sum=0.057 (1)", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.229 (1)", - "tab": "Robustness", - "score": 0.22935779816513763 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.219, mean=0.219, max=0.219, sum=0.219 (1)", - "tab": "Fairness", - "score": 0.21865443425076453 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - 
"score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - 
# trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - 
QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM 
- Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.947, - "details": { - "description": "min=0.947, mean=0.947, max=0.947, sum=0.947 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.336 (1)", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.897, mean=0.897, max=0.897, sum=0.897 (1)", - "tab": "Robustness", - "score": 0.897 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.936, mean=0.936, max=0.936, sum=0.936 (1)", - "tab": "Fairness", - "score": 0.936 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.781, mean=2.781, max=2.781, sum=2.781 (1)", - "tab": "General information", - "score": 2.781 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1751.213, mean=1751.213, max=1751.213, sum=1751.213 (1)", - "tab": "General information", - "score": 1751.213 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM 
on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.563, - "details": { - "description": "min=0.015, mean=0.563, max=0.99, sum=10.13 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.13, mean=0.334, max=0.562, sum=6.012 (18)", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.012, mean=0.492, max=0.958, sum=8.864 (18)", - "tab": "Robustness", - "score": 0.4924249260198337 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.008, mean=0.505, max=0.98, sum=9.086 (18)", - "tab": "Fairness", - "score": 0.5047868294149912 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.573, - "details": { - "description": "min=0.125, mean=0.573, max=0.975, sum=6.3 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.122, mean=0.572, max=0.975, sum=6.295 (11)", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.486, max=0.95, sum=5.35 (11)", - "tab": "Robustness", - "score": 0.4863636363636364 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.1, mean=0.545, max=0.975, sum=6 (11)", - "tab": "Fairness", - "score": 0.5454545454545454 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": 
null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.45, mean=4.552, max=5, sum=50.075 (11)", - "tab": "General information", - "score": 4.552272727272727 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=954.111, max=1882.1, sum=10495.225 (11)", - "tab": "General information", - "score": 954.1113636363635 - }, - "RAFT - # output tokens": { - "description": "min=29.575, mean=29.961, max=30, sum=329.575 (11)", - "tab": "General information", - "score": 29.961363636363636 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json b/data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json deleted file mode 100644 index f2cd54e60..000000000 --- a/data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_Llama-2-13B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 13B", - "id": "meta/Llama-2-13B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.8231701631701632 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.8078088578088578 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.46948265409803874 - }, - "Mean win 
rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4142191142191142 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0.28, mean=0.507, max=0.84, sum=2.533 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.22, mean=0.444, max=0.76, sum=2.222 (5)", - "tab": "Robustness", - "score": 0.44438596491228066 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.26, mean=0.466, max=0.79, sum=2.331 (5)", - "tab": "Fairness", - "score": 0.46614035087719297 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.116 (1)", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.753, mean=0.753, max=0.753, sum=0.753 (1)", - "tab": "Robustness", - "score": 0.753 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.732, mean=0.732, max=0.732, sum=0.732 (1)", - "tab": "Fairness", - "score": 0.732 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - 
"score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.744, - "details": { - "description": "min=0.744, mean=0.744, max=0.744, sum=0.744 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.682, mean=0.682, max=0.682, sum=0.682 (1)", - "tab": "Robustness", - "score": 0.681791424099214 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.657, mean=0.657, max=0.657, sum=0.657 (1)", - "tab": "Fairness", - "score": 0.6567284210865421 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.414, mean=4.414, max=4.414, sum=4.414 (1)", - "tab": "General information", - "score": 4.414084507042253 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3673.268, mean=3673.268, max=3673.268, sum=3673.268 (1)", - "tab": "General information", - "score": 3673.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.417, 
mean=0.417, max=0.417, sum=0.417 (1)", - "tab": "Bias", - "score": 0.4166666666666667 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.218, mean=0.218, max=0.218, sum=0.218 (1)", - "tab": "Bias", - "score": 0.21830985915492954 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.014, max=0.014, sum=0.014 (1)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "description": "min=0.637, mean=0.637, max=0.637, sum=0.637 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.324 (1)", - "tab": "Robustness", - "score": 0.3243542710528751 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.563, mean=0.563, max=0.563, sum=0.563 (1)", - "tab": "Robustness", - "score": 0.5631882717621935 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.309 (1)", - "tab": "Fairness", - "score": 0.30927547433853436 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.58, mean=0.58, max=0.58, sum=0.58 (1)", - "tab": "Fairness", - "score": 0.5801102053016279 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - 
"NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.831, mean=4.831, max=4.831, sum=4.831 (1)", - "tab": "General information", - "score": 4.831 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2289.409, mean=2289.409, max=2289.409, sum=2289.409 (1)", - "tab": "General information", - "score": 2289.409 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.984, mean=0.984, max=0.984, sum=0.984 (1)", - "tab": "General information", - "score": 0.984 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.521, mean=0.521, max=0.521, sum=0.521 (1)", - "tab": "Bias", - "score": 0.5205992509363295 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.15, mean=0.15, max=0.15, sum=0.15 (1)", - "tab": "Bias", - "score": 0.15000000000000002 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Bias", - "score": 0.4666666666666667 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.357 (1)", - "tab": "Bias", - "score": 0.3571428571428571 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.424, - "details": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.424 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.294 (1)", - "tab": "Robustness", 
- "score": 0.2939019916232739 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.351 (1)", - "tab": "Fairness", - "score": 0.35074944218906556 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=3.204, mean=3.204, max=3.204, sum=3.204 (1)", - "tab": "General information", - "score": 3.204 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=3617.038, mean=3617.038, max=3617.038, sum=3617.038 (1)", - "tab": "General information", - "score": 3617.038 - }, - "QuAC - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.549, mean=0.549, max=0.549, sum=0.549 (1)", - "tab": "Bias", - "score": 0.5485347985347986 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)", - "tab": "Bias", - "score": 0.39214643381310055 - }, - "QuAC - Representation (race)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.325 (1)", - "tab": "Bias", - "score": 0.3248945147679325 - }, - "QuAC - Representation (gender)": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.242 (1)", - "tab": "Bias", - "score": 0.24197860962566847 - }, - "QuAC - Toxic fraction": { - "description": "min=0.004, mean=0.004, max=0.004, sum=0.004 (1)", - "tab": "Toxicity", - "score": 0.004 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - 
"tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.33 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.287 (1)", - "tab": "Robustness", - "score": 0.2874617737003058 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.274, mean=0.274, max=0.274, sum=0.274 (1)", - "tab": "Fairness", - "score": 0.27370030581039756 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - 
"MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": 
null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - 
"score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=0.962 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=0.954 (1)", - "tab": "Robustness", - "score": 0.954 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.957, mean=0.957, max=0.957, sum=0.957 (1)", - "tab": "Fairness", - "score": 0.957 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=2897.409, mean=2897.409, max=2897.409, sum=2897.409 (1)", - "tab": "General information", - "score": 2897.409 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.588, - "details": { - "description": "min=0.087, mean=0.588, max=0.968, sum=10.579 (18)", 
- "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.098, mean=0.323, max=0.788, sum=4.519 (14)", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.022, mean=0.47, max=0.958, sum=8.468 (18)", - "tab": "Robustness", - "score": 0.47042658911281887 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.006, mean=0.489, max=0.968, sum=8.81 (18)", - "tab": "Fairness", - "score": 0.4894481246425394 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=2.692, max=5, sum=48.448 (18)", - "tab": "General information", - "score": 2.6915388744093813 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.707, - "details": { - "description": "min=0.1, mean=0.707, max=0.975, sum=7.775 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.05, mean=0.652, max=0.95, sum=7.175 (11)", - "tab": "Robustness", - "score": 0.6522727272727272 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.075, mean=0.673, max=0.975, sum=7.4 (11)", - "tab": "Fairness", - "score": 0.6727272727272727 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=2.575, mean=4.78, max=5, 
sum=52.575 (11)", - "tab": "General information", - "score": 4.779545454545455 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=1153.852, max=3623.9, sum=12692.375 (11)", - "tab": "General information", - "score": 1153.8522727272727 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json b/data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json deleted file mode 100644 index de031e670..000000000 --- a/data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_Llama-2-70B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 70B", - "id": "meta/Llama-2-70B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.944, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.9649184149184149 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.9587645687645687 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5375895851224799 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.643006993006993 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.582, - "details": { - "description": "min=0.29, mean=0.582, max=0.92, sum=2.909 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.22, mean=0.545, max=0.9, sum=2.726 (5)", - "tab": "Robustness", - "score": 0.5451929824561403 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.26, mean=0.557, max=0.91, sum=2.786 (5)", - "tab": "Fairness", - "score": 0.5571929824561404 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=0.886 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.863, mean=0.863, max=0.863, sum=0.863 (1)", - "tab": "Robustness", - "score": 0.863 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)", - "tab": "Fairness", - "score": 0.859 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General 
information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=0.77 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.722, mean=0.722, max=0.722, sum=0.722 (1)", - "tab": "Robustness", - "score": 0.7215317388650366 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.709, mean=0.709, max=0.709, sum=0.709 (1)", - "tab": "Fairness", - "score": 0.709497495841271 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.414, mean=4.414, max=4.414, sum=4.414 (1)", - "tab": "General information", - "score": 4.414084507042253 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3673.268, mean=3673.268, max=3673.268, sum=3673.268 (1)", - "tab": "General information", - "score": 3673.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.187, 
mean=0.187, max=0.187, sum=0.187 (1)", - "tab": "Bias", - "score": 0.18695652173913044 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.008, max=0.008, sum=0.008 (1)", - "tab": "Toxicity", - "score": 0.008450704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674, - "details": { - "description": "min=0.674, mean=0.674, max=0.674, sum=0.674 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.42 (1)", - "tab": "Robustness", - "score": 0.42009390434309946 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.639, mean=0.639, max=0.639, sum=0.639 (1)", - "tab": "Robustness", - "score": 0.6385366212170214 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.4 (1)", - "tab": "Fairness", - "score": 0.3997609830959401 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.637, mean=0.637, max=0.637, sum=0.637 (1)", - "tab": "Fairness", - "score": 0.6365724774019619 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.831, mean=4.831, max=4.831, sum=4.831 (1)", - 
"tab": "General information", - "score": 4.831 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2289.409, mean=2289.409, max=2289.409, sum=2289.409 (1)", - "tab": "General information", - "score": 2289.409 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.998, mean=0.998, max=0.998, sum=0.998 (1)", - "tab": "General information", - "score": 0.998 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.167 (1)", - "tab": "Bias", - "score": 0.16666666666666666 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.524, mean=0.524, max=0.524, sum=0.524 (1)", - "tab": "Bias", - "score": 0.5238095238095237 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.312 (1)", - "tab": "Bias", - "score": 0.3125 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=0.566 (1)", - "tab": "Bias", - "score": 0.5655430711610487 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.184, mean=0.184, max=0.184, sum=0.184 (1)", - "tab": "Bias", - "score": 0.1842105263157895 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.484, - "details": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.484 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.362, mean=0.362, max=0.362, sum=0.362 (1)", - "tab": "Robustness", - "score": 0.36189050917141447 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.414 (1)", - "tab": "Fairness", - "score": 0.4139340894194124 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - 
"score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=3.204, mean=3.204, max=3.204, sum=3.204 (1)", - "tab": "General information", - "score": 3.204 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=3617.038, mean=3617.038, max=3617.038, sum=3617.038 (1)", - "tab": "General information", - "score": 3617.038 - }, - "QuAC - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.611, mean=0.611, max=0.611, sum=0.611 (1)", - "tab": "Bias", - "score": 0.6111111111111112 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.403 (1)", - "tab": "Bias", - "score": 0.4025455927051672 - }, - "QuAC - Representation (race)": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)", - "tab": "Bias", - "score": 0.27183271832718325 - }, - "QuAC - Representation (gender)": { - "description": "min=0.239, mean=0.239, max=0.239, sum=0.239 (1)", - "tab": "Bias", - "score": 0.23913043478260873 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": 
"OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=0.554 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.468 (1)", - "tab": "Robustness", - "score": 0.46788990825688076 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.434 (1)", - "tab": "Fairness", - "score": 0.43425076452599387 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - 
"description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - 
"description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": 
{ - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.961, - "details": { - "description": "min=0.961, mean=0.961, max=0.961, sum=0.961 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.949, mean=0.949, max=0.949, sum=0.949 (1)", - "tab": "Robustness", - "score": 0.949 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=0.954 (1)", - "tab": "Fairness", - "score": 0.954 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=2897.409, mean=2897.409, max=2897.409, sum=2897.409 (1)", - "tab": "General information", - "score": 2897.409 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.337, mean=0.652, max=0.919, sum=11.733 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.272, mean=0.59, max=0.884, sum=10.619 (18)", - "tab": "Robustness", - 
"score": 0.5899239945803259 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.125, mean=0.551, max=0.892, sum=9.924 (18)", - "tab": "Fairness", - "score": 0.551334119704094 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.125, mean=0.727, max=0.975, sum=8 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.075, mean=0.673, max=0.975, sum=7.4 (11)", - "tab": "Robustness", - "score": 0.6727272727272727 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.1, mean=0.7, max=0.975, sum=7.7 (11)", - "tab": "Fairness", - "score": 0.7 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=2.575, mean=4.78, max=5, sum=52.575 (11)", - "tab": "General information", - "score": 4.779545454545455 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=1153.852, max=3623.9, sum=12692.375 (11)", - "tab": "General information", 
- "score": 1153.8522727272727 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json b/data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json deleted file mode 100644 index eac315fea..000000000 --- a/data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_Llama-2-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 7B", - "id": "meta/Llama-2-7B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.607, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.6437529137529138 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6102097902097903 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4576728062932413 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.8121794871794872 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431, - "details": { - "description": "min=0.28, mean=0.431, max=0.64, sum=2.153 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.22, mean=0.373, max=0.57, sum=1.866 (5)", - "tab": "Robustness", - "score": 0.37312280701754386 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.26, mean=0.392, max=0.59, sum=1.961 (5)", - "tab": "Fairness", - "score": 0.392140350877193 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.762, mean=0.762, max=0.762, sum=0.762 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.215, mean=0.215, max=0.215, sum=0.215 (1)", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.676, mean=0.676, max=0.676, sum=0.676 (1)", - "tab": "Robustness", - "score": 0.676 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.706, mean=0.706, max=0.706, sum=0.706 (1)", - "tab": "Fairness", - "score": 0.706 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=1.296, mean=1.296, max=1.296, sum=1.296 (1)", - "tab": "General information", - "score": 1.296 - }, 
- "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=0.691 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.573, mean=0.573, max=0.573, sum=0.573 (1)", - "tab": "Robustness", - "score": 0.5726018964106345 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.596, mean=0.596, max=0.596, sum=0.596 (1)", - "tab": "Fairness", - "score": 0.5960691234215144 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.414, mean=4.414, max=4.414, sum=4.414 (1)", - "tab": "General information", - "score": 4.414084507042253 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3673.268, mean=3673.268, max=3673.268, sum=3673.268 (1)", - "tab": "General information", - "score": 3673.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.3333333333333333 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.203, mean=0.203, max=0.203, sum=0.203 (1)", - "tab": "Bias", - "score": 0.20348837209302328 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.011, max=0.011, sum=0.011 (1)", - "tab": "Toxicity", - "score": 0.011267605633802818 - 
} - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "description": "min=0.611, mean=0.611, max=0.611, sum=0.611 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.261, mean=0.261, max=0.261, sum=0.261 (1)", - "tab": "Robustness", - "score": 0.2606038875824225 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.501, mean=0.501, max=0.501, sum=0.501 (1)", - "tab": "Robustness", - "score": 0.5010811862440044 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.264, mean=0.264, max=0.264, sum=0.264 (1)", - "tab": "Fairness", - "score": 0.26403309290317406 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.55, mean=0.55, max=0.55, sum=0.55 (1)", - "tab": "Fairness", - "score": 0.5499198184166533 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.998, mean=0.998, max=0.998, sum=0.998 (1)", - "tab": "General information", - "score": 0.998 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.831, mean=4.831, max=4.831, sum=4.831 (1)", - "tab": "General information", - "score": 4.831 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - 
"NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2289.409, mean=2289.409, max=2289.409, sum=2289.409 (1)", - "tab": "General information", - "score": 2289.409 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.955, mean=0.955, max=0.955, sum=0.955 (1)", - "tab": "General information", - "score": 0.955 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.381 (1)", - "tab": "Bias", - "score": 0.38095238095238093 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.182, mean=0.182, max=0.182, sum=0.182 (1)", - "tab": "Bias", - "score": 0.18181818181818182 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.577, mean=0.577, max=0.577, sum=0.577 (1)", - "tab": "Bias", - "score": 0.5770114942528735 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.486 (1)", - "tab": "Bias", - "score": 0.48630136986301375 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406, - "details": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.406 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.271 (1)", - "tab": "Robustness", - "score": 0.27069315379336467 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.321 (1)", - "tab": "Fairness", - "score": 0.32122644280851614 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=3.204, 
mean=3.204, max=3.204, sum=3.204 (1)", - "tab": "General information", - "score": 3.204 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=3617.038, mean=3617.038, max=3617.038, sum=3617.038 (1)", - "tab": "General information", - "score": 3617.038 - }, - "QuAC - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.583, mean=0.583, max=0.583, sum=0.583 (1)", - "tab": "Bias", - "score": 0.5833333333333334 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.426 (1)", - "tab": "Bias", - "score": 0.4264652792029702 - }, - "QuAC - Representation (race)": { - "description": "min=0.283, mean=0.283, max=0.283, sum=0.283 (1)", - "tab": "Bias", - "score": 0.2831541218637993 - }, - "QuAC - Representation (gender)": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.231 (1)", - "tab": "Bias", - "score": 0.23093681917211328 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.272, - "details": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.234 (1)", - "tab": "Robustness", - "score": 0.23394495412844038 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.223, mean=0.223, max=0.223, sum=0.223 (1)", - "tab": "Fairness", - "score": 0.22324159021406728 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 
- } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO 
(TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - 
"score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=0.907 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.808, mean=0.808, max=0.808, sum=0.808 (1)", - "tab": "Robustness", - "score": 0.808 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.871, mean=0.871, max=0.871, sum=0.871 (1)", - "tab": "Fairness", - "score": 0.871 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=2897.409, mean=2897.409, max=2897.409, sum=2897.409 (1)", - "tab": "General information", - "score": 2897.409 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.025, mean=0.562, max=1, sum=10.108 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.025, mean=0.516, max=0.989, sum=9.28 (18)", - "tab": "Robustness", - "score": 0.5155612610622284 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.01, mean=0.503, max=0.998, sum=9.057 (18)", - "tab": "Fairness", - 
"score": 0.5031757189564859 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.125, mean=0.643, max=0.95, sum=7.075 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.05, mean=0.573, max=0.875, sum=6.3 (11)", - "tab": "Robustness", - "score": 0.5727272727272728 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.1, mean=0.609, max=0.95, sum=6.7 (11)", - "tab": "Fairness", - "score": 0.6090909090909092 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=2.575, mean=4.78, max=5, sum=52.575 (11)", - "tab": "General information", - "score": 4.779545454545455 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=1153.852, max=3623.9, sum=12692.375 (11)", - "tab": "General information", - "score": 1153.8522727272727 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General 
information", - "score": 1.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json b/data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json deleted file mode 100644 index 63a0c348d..000000000 --- a/data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_OPT-175B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OPT 175B", - "id": "meta/OPT-175B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.33807716905928437 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5191448151403657 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6221815633384042 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.24121162280701755 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.58013310485115 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.43513523513523517 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5927318295739348 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.318, - "details": { - "description": "min=0.21, mean=0.318, max=0.48, sum=4.775 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.115, mean=0.147, max=0.194, sum=2.207 (15)", - "tab": "Calibration", - "score": 0.14714449343481936 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.13, mean=0.27, max=0.45, sum=4.048 (15)", - "tab": "Robustness", - "score": 0.2698479532163743 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.167, mean=0.287, max=0.43, sum=4.298 (15)", - "tab": "Fairness", - "score": 0.28651461988304094 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.11, mean=0.12, max=0.138, sum=1.793 (15)", - "tab": "Efficiency", - "score": 0.1195572826114746 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.777, mean=0.793, max=0.813, sum=2.379 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.177, mean=0.194, max=0.218, sum=0.581 (3)", - "tab": "Calibration", - "score": 0.19360710050007168 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.584, mean=0.623, max=0.662, sum=1.869 (3)", - "tab": "Robustness", - "score": 0.623 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.712, mean=0.731, max=0.746, sum=2.193 (3)", - "tab": "Fairness", - "score": 0.731 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.71, mean=0.869, max=0.954, sum=2.608 (3)", - "tab": "Efficiency", - "score": 0.869335141547284 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": 
"min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671, - "details": { - "description": "min=0.657, mean=0.671, max=0.692, sum=2.013 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.25, mean=0.254, max=0.261, sum=0.763 (3)", - "tab": "Calibration", - "score": 0.25442494535286947 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.365, mean=0.409, max=0.447, sum=1.227 (3)", - "tab": "Robustness", - "score": 0.4090933797146052 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.545, mean=0.573, max=0.6, sum=1.718 (3)", - "tab": "Fairness", - "score": 0.5725951072978767 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=2.375, mean=2.783, max=3.573, sum=8.348 (3)", - "tab": "Efficiency", - "score": 2.7825779012238017 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=27.152, mean=40.781, max=56.166, sum=122.344 (3)", - "tab": "General information", - "score": 40.781220657277 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.472, mean=0.491, max=0.5, sum=1.472 (3)", - "tab": "Bias", - "score": 0.49074074074074076 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.213, mean=0.232, max=0.257, sum=0.695 (3)", - "tab": "Bias", - "score": 0.23182834585691858 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.019, max=0.023, sum=0.056 (3)", - "tab": "Toxicity", - "score": 
0.018779342723004692 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615, - "details": { - "description": "min=0.607, mean=0.615, max=0.619, sum=1.845 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.169, mean=0.173, max=0.178, sum=0.52 (3)", - "tab": "Calibration", - "score": 0.17321815784980257 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.365, mean=0.372, max=0.38, sum=1.117 (3)", - "tab": "Calibration", - "score": 0.3723122842871363 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.202, mean=0.208, max=0.213, sum=0.623 (3)", - "tab": "Robustness", - "score": 0.2076699169323979 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.382, mean=0.408, max=0.445, sum=1.224 (3)", - "tab": "Robustness", - "score": 0.40794279599736244 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.244, mean=0.246, max=0.248, sum=0.738 (3)", - "tab": "Fairness", - "score": 0.2461285688311032 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.557, mean=0.561, max=0.566, sum=1.684 (3)", - "tab": "Fairness", - "score": 0.5613201936765554 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=4.226, mean=4.548, max=4.977, sum=13.645 (3)", - "tab": "Efficiency", - "score": 4.5482187833781085 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=6.761, mean=7.78, max=8.516, sum=23.341 (3)", - "tab": "Efficiency", - "score": 7.78018927021878 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=272.695, mean=278.02, max=287.118, sum=834.059 (3)", - "tab": "General information", - "score": 278.01966666666664 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions 
(open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=168.53, mean=194.671, max=213.115, sum=584.014 (3)", - "tab": "General information", - "score": 194.67133333333334 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.279, mean=0.327, max=0.375, sum=0.654 (2)", - "tab": "Bias", - "score": 0.32684426229508196 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.48, mean=0.521, max=0.562, sum=1.563 (3)", - "tab": "Bias", - "score": 0.5211641167340236 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.029, mean=0.081, max=0.119, sum=0.243 (3)", - "tab": "Bias", - "score": 0.0811320308714203 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.433, mean=0.439, max=0.45, sum=1.317 (3)", - "tab": "Bias", - "score": 0.4388888888888889 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.423, mean=0.461, max=0.48, sum=1.384 (3)", - "tab": "Bias", - "score": 0.4612918002748511 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.318, mean=0.325, max=0.332, sum=0.974 (3)", - "tab": "Bias", - "score": 0.324702218997521 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36, - "details": { - "description": "min=0.347, mean=0.36, max=0.369, sum=1.08 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.128, mean=0.148, max=0.173, sum=0.443 (3)", - "tab": "Calibration", - "score": 0.14774672207107284 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.194, mean=0.2, max=0.209, sum=0.6 (3)", - "tab": "Robustness", - "score": 0.2000302607507829 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.255, mean=0.266, max=0.274, sum=0.798 (3)", - "tab": "Fairness", - "score": 0.26591098840755784 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=3.951, mean=4.049, max=4.154, sum=12.147 
(3)", - "tab": "Efficiency", - "score": 4.049007016242971 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=75.972, mean=77.836, max=79.528, sum=233.507 (3)", - "tab": "General information", - "score": 77.83566666666667 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.561, mean=0.591, max=0.614, sum=1.773 (3)", - "tab": "Bias", - "score": 0.5910808767951625 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.376, mean=0.386, max=0.399, sum=1.159 (3)", - "tab": "Bias", - "score": 0.38627685600159944 - }, - "QuAC - Representation (race)": { - "description": "min=0.167, mean=0.243, max=0.304, sum=0.73 (3)", - "tab": "Bias", - "score": 0.2433558772540988 - }, - "QuAC - Representation (gender)": { - "description": "min=0.195, mean=0.207, max=0.218, sum=0.621 (3)", - "tab": "Bias", - "score": 0.2069846056271054 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.003, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=0.791 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.325 (1)", - "tab": "Calibration", - "score": 0.324637159664446 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.744, mean=0.744, max=0.744, sum=0.744 (1)", - "tab": "Robustness", - "score": 0.744 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=0.66 (1)", - "tab": "Fairness", - "score": 0.66 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.71, mean=0.71, max=0.71, sum=0.71 (1)", - "tab": "Efficiency", - "score": 0.7096132577732451 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)", - "tab": "General information", - "score": 87.888 - }, - 
"HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586, - "details": { - "description": "min=0.586, mean=0.586, max=0.586, sum=0.586 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.209, mean=0.209, max=0.209, sum=0.209 (1)", - "tab": "Calibration", - "score": 0.20889829455743214 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.488, mean=0.488, max=0.488, sum=0.488 (1)", - "tab": "Robustness", - "score": 0.488 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Fairness", - "score": 0.5 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.038 (1)", - "tab": "Efficiency", - "score": 0.03760148134353242 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25, - "details": { - "description": "min=0.228, mean=0.25, max=0.269, sum=1.002 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.042, mean=0.054, max=0.061, sum=0.216 (4)", - "tab": "Calibration", - "score": 0.05404322346973557 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.167, mean=0.205, max=0.249, sum=0.818 (4)", - "tab": "Robustness", - "score": 0.20451070336391436 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.165, mean=0.203, max=0.249, sum=0.812 (4)", - "tab": "Fairness", - "score": 0.2029816513761468 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.079, mean=0.141, 
max=0.246, sum=0.563 (4)", - "tab": "Efficiency", - "score": 0.1406602569641055 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=85.121, mean=404.621, max=529.121, sum=1618.483 (4)", - "tab": "General information", - "score": 404.62079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.448, - "details": { - "description": "min=0.425, mean=0.448, max=0.467, sum=1.344 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.198, mean=0.235, max=0.263, sum=0.705 (3)", - "tab": "Robustness", - "score": 0.23496613756613724 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.386, mean=0.408, max=0.422, sum=1.225 (3)", - "tab": "Robustness", - "score": 0.4083455179340017 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.229, mean=0.26, max=0.288, sum=0.779 (3)", - "tab": "Fairness", - "score": 0.25959669312169276 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.4, mean=0.419, max=0.428, sum=1.256 (3)", - "tab": "Fairness", - "score": 0.41868435186381264 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.229, mean=0.241, max=0.262, sum=0.724 (3)", - "tab": "Efficiency", - "score": 0.24148347487755295 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.19, mean=0.226, max=0.254, sum=0.678 (3)", - "tab": "Efficiency", - "score": 0.2261325473631569 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - 
}, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.146, - "details": { - "description": "min=0.132, mean=0.146, max=0.156, sum=0.875 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=4.705, mean=4.729, max=4.742, sum=28.373 (6)", - "tab": "Efficiency", - "score": 4.728843353285813 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": 
"General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=72.006, mean=73.533, max=75.564, sum=441.197 (6)", - "tab": "General information", - "score": 73.53290414878398 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.584, mean=0.591, max=0.602, sum=3.548 (6)", - "tab": "Bias", - "score": 0.5912557147615382 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.389, mean=0.407, max=0.423, sum=2.439 (6)", - "tab": "Bias", - "score": 0.406575836707982 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.258, mean=0.294, max=0.328, sum=1.765 (6)", - "tab": "Bias", - "score": 0.29422007838910086 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.109, mean=0.123, max=0.15, sum=0.74 (6)", - "tab": "Bias", - "score": 0.1233558384477443 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.094, mean=0.202, max=0.259, sum=0.605 (3)", - "tab": "Summarization metrics", - "score": 0.20179927196685032 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.642, mean=4.67, max=4.721, sum=28.022 (6)", - "tab": "Summarization metrics", - "score": 4.67041236939807 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.234, mean=0.276, max=0.301, sum=0.827 (3)", - "tab": "Summarization metrics", - "score": 0.2755570292220846 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.855, mean=0.933, max=0.973, sum=5.599 (6)", - "tab": "Summarization metrics", - "score": 0.9331599358896452 - }, - "CNN/DailyMail - Density": { - "description": "min=28.251, mean=31.307, max=33.584, sum=187.839 (6)", - "tab": "Summarization metrics", - "score": 31.306505459997258 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.442, mean=9.8, max=10.068, sum=58.802 (6)", - "tab": "Summarization metrics", - "score": 9.800322939057557 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=1, mean=1, max=1, sum=6 (6)", - "tab": "Summarization metrics", - "score": 1.0 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=4.333, mean=4.378, max=4.467, sum=26.267 (6)", - "tab": "Summarization metrics", - "score": 4.377777777777777 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=2.833, mean=3.233, max=3.867, sum=19.4 (6)", - "tab": "Summarization metrics", - "score": 3.233333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.155, - "details": { - "description": "min=0.153, mean=0.155, max=0.158, sum=0.929 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - 
"description": "min=2.509, mean=2.523, max=2.545, sum=15.138 (6)", - "tab": "Efficiency", - "score": 2.522969657178858 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=26.037, mean=26.229, max=26.481, sum=157.375 (6)", - "tab": "General information", - "score": 26.22908622908623 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.444, mean=0.449, max=0.459, sum=2.697 (6)", - "tab": "Bias", - "score": 0.44948914431673054 - }, - "XSUM - Representation (race)": { - "description": "min=0.429, mean=0.453, max=0.481, sum=2.719 (6)", - "tab": "Bias", - "score": 0.45310942412391686 - }, - "XSUM - Representation (gender)": { - "description": "min=0.188, mean=0.218, max=0.235, sum=1.309 (6)", - "tab": "Bias", - "score": 0.21820243248814677 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.008 (6)", - "tab": "Toxicity", - "score": 0.001287001287001287 - }, - "XSUM - SummaC": { - "description": "min=-0.271, mean=-0.253, max=-0.224, sum=-0.76 (3)", - "tab": "Summarization metrics", - "score": -0.25337265715073337 - }, - "XSUM - QAFactEval": { - "description": "min=3.343, mean=3.523, max=3.7, sum=21.139 (6)", - "tab": "Summarization metrics", - "score": 3.5231601957035803 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.458, mean=0.46, max=0.461, sum=1.38 (3)", - "tab": "Summarization metrics", - "score": 0.45990517032509515 - }, - "XSUM - Coverage": { - "description": "min=0.792, mean=0.793, max=0.795, sum=4.76 (6)", - "tab": "Summarization metrics", - "score": 0.7933759020774565 - }, - "XSUM - Density": { - "description": "min=2.672, mean=2.732, max=2.852, sum=16.393 (6)", - "tab": "Summarization metrics", - "score": 2.732196710488823 - }, - "XSUM - Compression": { - "description": "min=16.442, mean=16.792, max=17.056, sum=100.753 (6)", - "tab": "Summarization metrics", - "score": 16.79220871639349 - }, - "XSUM - HumanEval-faithfulness": { - "description": "min=0.583, mean=0.798, max=0.944, sum=4.789 (6)", - "tab": "Summarization metrics", - "score": 0.7981481481481479 - }, - "XSUM - HumanEval-relevance": { - "description": "min=4.167, mean=4.3, max=4.4, sum=25.8 (6)", - "tab": "Summarization metrics", - "score": 4.300000000000001 - }, - "XSUM - HumanEval-coherence": { - "description": "min=4.867, mean=4.891, max=4.917, sum=29.344 (6)", - "tab": "Summarization metrics", - "score": 4.890740740740742 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.947, - "details": { - "description": "min=0.932, mean=0.947, max=0.96, sum=2.842 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.164, mean=0.19, max=0.216, sum=0.569 (3)", - "tab": "Calibration", - "score": 0.18962950165784687 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.904, mean=0.919, max=0.937, sum=2.756 (3)", - "tab": "Robustness", - "score": 0.9186666666666667 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.929, mean=0.944, max=0.958, sum=2.831 (3)", - "tab": "Fairness", - "score": 0.9436666666666667 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=1.488, mean=1.575, max=1.732, sum=4.724 (3)", - "tab": "Efficiency", - "score": 1.5747312279142403 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.846, mean=4.933, max=4.986, sum=14.798 (3)", - "tab": "General information", - "score": 4.932666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1152.694, mean=1389.454, max=1744.631, sum=4168.363 (3)", - "tab": "General information", - "score": 1389.4543333333331 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.505, - "details": { - "description": "min=0, mean=0.505, max=1, sum=27.251 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.226, mean=0.462, max=0.633, sum=24.957 (54)", - "tab": "Calibration", - "score": 0.46216217374926066 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.184, max=0.769, sum=9.952 (54)", - "tab": "Robustness", - "score": 0.18428995439708568 - }, - 
"CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.491, max=1, sum=26.489 (54)", - "tab": "Fairness", - "score": 0.4905409716584098 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.299, mean=0.498, max=0.974, sum=26.871 (54)", - "tab": "Efficiency", - "score": 0.4976179389529128 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.606, - "details": { - "description": "min=0.075, mean=0.606, max=0.975, sum=20 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.1, mean=0.352, max=0.74, sum=11.606 (33)", - "tab": "Calibration", - "score": 0.35168585204039804 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.48, max=0.975, sum=15.85 (33)", - "tab": "Robustness", - "score": 0.4803030303030303 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.075, mean=0.58, max=0.975, sum=19.125 (33)", - "tab": "Fairness", - "score": 0.5795454545454547 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.403, mean=0.962, max=1.712, sum=31.76 (33)", - "tab": "Efficiency", - "score": 0.9624239013413396 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 
(33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=5, mean=9.057, max=18.95, sum=298.875 (33)", - "tab": "General information", - "score": 9.056818181818182 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json b/data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json deleted file mode 100644 index 2f3d2ad96..000000000 --- a/data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_OPT-66B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OPT 66B", - "id": "meta/OPT-66B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.448, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.2888771827640159 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.43828848200372117 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.4763117490592463 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.466875 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.6312224376358433 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.3347556764223431 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5785714285714286 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276, - "details": { - "description": "min=0.2, mean=0.276, max=0.37, sum=4.141 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.101, mean=0.135, max=0.172, sum=2.031 (15)", - "tab": "Calibration", - "score": 0.13542563946906333 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.13, mean=0.216, max=0.32, sum=3.242 (15)", - "tab": "Robustness", - "score": 0.21610526315789472 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.18, mean=0.229, max=0.33, sum=3.44 (15)", - "tab": "Fairness", - "score": 0.22935672514619884 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.041, mean=0.055, max=0.081, sum=0.818 (15)", - "tab": "Efficiency", - "score": 0.05452067670741475 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.753, mean=0.76, max=0.764, sum=2.281 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.193, mean=0.2, max=0.206, sum=0.601 (3)", - "tab": "Calibration", - "score": 0.20047176103986394 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.666, mean=0.683, max=0.701, sum=2.049 (3)", - "tab": "Robustness", - "score": 0.6829999999999999 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.696, mean=0.71, max=0.721, sum=2.131 (3)", - "tab": "Fairness", - "score": 0.7103333333333333 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.272, mean=0.834, max=1.907, sum=2.501 (3)", - "tab": "Efficiency", - "score": 0.8336340090708299 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - 
"score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638, - "details": { - "description": "min=0.618, mean=0.638, max=0.655, sum=1.913 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.224, mean=0.245, max=0.264, sum=0.734 (3)", - "tab": "Calibration", - "score": 0.2445466042880168 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.364, mean=0.397, max=0.421, sum=1.19 (3)", - "tab": "Robustness", - "score": 0.39653941552028354 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.497, mean=0.526, max=0.543, sum=1.579 (3)", - "tab": "Fairness", - "score": 0.5262433008374211 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.939, mean=1.98, max=3.714, sum=5.939 (3)", - "tab": "Efficiency", - "score": 1.979606440811339 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=39.707, mean=50.904, max=65.363, sum=152.713 (3)", - "tab": "General information", - "score": 50.90422535211267 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.406, mean=0.416, max=0.425, sum=1.248 (3)", - "tab": "Bias", - "score": 0.41597222222222224 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.556, max=0.667, sum=1.667 (3)", - "tab": "Bias", - "score": 0.5555555555555556 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.164, mean=0.191, 
max=0.207, sum=0.574 (3)", - "tab": "Bias", - "score": 0.1911771437726737 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.02, mean=0.022, max=0.025, sum=0.065 (3)", - "tab": "Toxicity", - "score": 0.0215962441314554 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.582, mean=0.596, max=0.615, sum=1.788 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.134, mean=0.141, max=0.149, sum=0.423 (3)", - "tab": "Calibration", - "score": 0.14107540425227785 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.379, mean=0.384, max=0.387, sum=1.153 (3)", - "tab": "Calibration", - "score": 0.38437204570087863 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.2, mean=0.206, max=0.216, sum=0.619 (3)", - "tab": "Robustness", - "score": 0.20625206311676839 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.419, mean=0.458, max=0.503, sum=1.373 (3)", - "tab": "Robustness", - "score": 0.45767430702477907 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.215, mean=0.218, max=0.221, sum=0.654 (3)", - "tab": "Fairness", - "score": 0.2180459446078801 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.52, mean=0.536, max=0.558, sum=1.607 (3)", - "tab": "Fairness", - "score": 0.5357020972773482 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.471, mean=0.611, max=0.739, sum=1.834 (3)", - "tab": "Efficiency", - "score": 0.611190575244526 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=2.887, mean=3.632, max=4.314, sum=10.896 (3)", - "tab": "Efficiency", - "score": 3.631964569965005 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=128.956, mean=153.231, max=173.545, sum=459.692 (3)", - "tab": "General information", - "score": 153.23066666666668 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - 
}, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=168.231, mean=211.805, max=244.906, sum=635.415 (3)", - "tab": "General information", - "score": 211.80499999999998 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0, mean=0.278, max=0.5, sum=0.833 (3)", - "tab": "Bias", - "score": 0.27777777777777773 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.467, mean=0.481, max=0.491, sum=1.444 (3)", - "tab": "Bias", - "score": 0.481339792158324 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.106, mean=0.156, max=0.233, sum=0.469 (3)", - "tab": "Bias", - "score": 0.156341189674523 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.287, mean=0.338, max=0.395, sum=1.015 (3)", - "tab": "Bias", - "score": 0.33841269841269833 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.4, mean=0.427, max=0.48, sum=1.281 (3)", - "tab": "Bias", - "score": 0.42701178032188486 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.116, mean=0.119, max=0.124, sum=0.357 (3)", - "tab": "Bias", - "score": 0.11888541157186479 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.002, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.357, - "details": { - "description": "min=0.35, mean=0.357, max=0.366, sum=1.07 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.135, mean=0.154, max=0.176, sum=0.461 (3)", - "tab": "Calibration", - "score": 0.15357329550060583 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.177, mean=0.199, max=0.217, sum=0.597 (3)", - "tab": "Robustness", - "score": 0.19914898808715295 - }, - 
"QuAC - F1 (Fairness)": { - "description": "min=0.267, mean=0.268, max=0.27, sum=0.805 (3)", - "tab": "Fairness", - "score": 0.26839685415319225 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=2.636, mean=2.658, max=2.683, sum=7.974 (3)", - "tab": "Efficiency", - "score": 2.6581093871351746 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=89.614, mean=91.909, max=95.996, sum=275.728 (3)", - "tab": "General information", - "score": 91.90933333333334 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.556, mean=0.592, max=0.619, sum=1.775 (3)", - "tab": "Bias", - "score": 0.5915343915343915 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.398, mean=0.413, max=0.424, sum=1.239 (3)", - "tab": "Bias", - "score": 0.41297615039041286 - }, - "QuAC - Representation (race)": { - "description": "min=0.228, mean=0.272, max=0.324, sum=0.816 (3)", - "tab": "Bias", - "score": 0.27205505897640186 - }, - "QuAC - Representation (gender)": { - "description": "min=0.239, mean=0.245, max=0.252, sum=0.734 (3)", - "tab": "Bias", - "score": 0.2445248639131045 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=0.745 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.293 (1)", - "tab": "Calibration", - "score": 0.29326475041918015 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.699, mean=0.699, max=0.699, sum=0.699 (1)", - "tab": "Robustness", - "score": 0.699 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.597, mean=0.597, max=0.597, sum=0.597 (1)", - "tab": "Fairness", - "score": 0.597 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.971, mean=0.971, max=0.971, sum=0.971 (1)", - "tab": "Efficiency", - "score": 0.9708148735597889 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag 
- truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0.2, mean=0.2, max=0.2, sum=0.2 (1)", - "tab": "General information", - "score": 0.2 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534, - "details": { - "description": "min=0.534, mean=0.534, max=0.534, sum=0.534 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.237, mean=0.237, max=0.237, sum=0.237 (1)", - "tab": "Calibration", - "score": 0.2373615873422732 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.45 (1)", - "tab": "Robustness", - "score": 0.45 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.454, mean=0.454, max=0.454, sum=0.454 (1)", - "tab": "Fairness", - "score": 0.454 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.188, mean=0.188, max=0.188, sum=0.188 (1)", - "tab": "Efficiency", - "score": 0.18798254558309685 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.201, - "details": { - "description": "min=0.185, mean=0.201, max=0.22, sum=0.804 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.047, mean=0.073, max=0.084, sum=0.293 (4)", - "tab": "Calibration", - "score": 0.07328356622626138 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.135, mean=0.174, max=0.206, sum=0.694 (4)", - "tab": 
"Robustness", - "score": 0.1735474006116208 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.145, mean=0.173, max=0.206, sum=0.693 (4)", - "tab": "Fairness", - "score": 0.17316513761467892 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.033, mean=0.041, max=0.046, sum=0.163 (4)", - "tab": "Efficiency", - "score": 0.04074840224276806 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=85.121, mean=404.621, max=529.121, sum=1618.483 (4)", - "tab": "General information", - "score": 404.62079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482, - "details": { - "description": "min=0.467, mean=0.482, max=0.511, sum=1.446 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.175, mean=0.179, max=0.187, sum=0.537 (3)", - "tab": "Robustness", - "score": 0.1788788359788358 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.421, mean=0.437, max=0.46, sum=1.31 (3)", - "tab": "Robustness", - "score": 0.436684763137285 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.21, mean=0.214, max=0.221, sum=0.642 (3)", - "tab": "Fairness", - "score": 0.2139329365079363 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.45, mean=0.471, max=0.501, sum=1.412 (3)", - "tab": "Fairness", - "score": 0.4706976603850948 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.066, mean=0.076, max=0.089, sum=0.227 (3)", - "tab": "Efficiency", - "score": 0.07567241383876121 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.061, mean=0.102, max=0.183, sum=0.305 (3)", - "tab": "Efficiency", - "score": 0.10182954292591756 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 
532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136, - "details": { - "description": "min=0.119, mean=0.136, max=0.149, sum=0.816 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.828, mean=1.972, max=2.045, sum=11.831 (6)", - "tab": "Efficiency", - "score": 1.971851329588582 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, 
max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=72.955, mean=77.928, max=83.685, sum=467.567 (6)", - "tab": "General information", - "score": 77.9277539341917 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.589, mean=0.609, max=0.627, sum=3.657 (6)", - "tab": "Bias", - "score": 0.6094903870639165 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.396, mean=0.404, max=0.412, sum=2.424 (6)", - "tab": "Bias", - "score": 0.40393077624581836 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.287, mean=0.337, max=0.37, sum=2.024 (6)", - "tab": "Bias", - "score": 0.33739205476866063 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.121, mean=0.128, max=0.139, sum=0.766 (6)", - "tab": "Bias", - "score": 0.12773227690338504 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.064, mean=0.197, max=0.291, sum=0.592 (3)", - "tab": "Summarization metrics", - "score": 0.19745183659958473 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.708, mean=4.735, max=4.771, sum=28.41 (6)", - "tab": "Summarization metrics", - "score": 4.735075808555843 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.206, mean=0.256, max=0.287, sum=0.769 (3)", - "tab": "Summarization metrics", - "score": 0.2564336767010044 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.829, mean=0.92, max=0.97, sum=5.522 (6)", - "tab": "Summarization metrics", - "score": 0.9202647711974157 - }, - "CNN/DailyMail - Density": { - "description": "min=34.301, mean=41.595, max=46.027, sum=249.573 (6)", - "tab": "Summarization metrics", - "score": 41.59545904426739 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.796, mean=9.759, max=10.302, sum=58.557 (6)", - "tab": "Summarization metrics", - "score": 9.759458553538733 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.126, - "details": { - "description": "min=0.123, mean=0.126, max=0.131, sum=0.757 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.833, mean=0.885, max=0.939, sum=5.309 (6)", - "tab": "Efficiency", - "score": 0.8849094198151292 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=23.931, mean=24.362, max=24.873, sum=146.17 (6)", - "tab": "General information", - "score": 24.361647361647357 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.453, mean=0.469, max=0.478, sum=2.812 (6)", - "tab": "Bias", - "score": 0.46873713991769544 - }, - "XSUM - Representation (race)": { - "description": "min=0.356, mean=0.462, max=0.532, sum=2.769 (6)", - "tab": "Bias", - "score": 0.46156957217464706 - }, - "XSUM - Representation (gender)": { - "description": "min=0.168, mean=0.186, max=0.201, sum=1.118 (6)", - "tab": "Bias", - "score": 0.18640980232047377 - }, - "XSUM - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.015 (6)", - "tab": "Toxicity", - "score": 0.002574002574002574 - }, - "XSUM - SummaC": { - "description": "min=-0.208, mean=-0.189, max=-0.166, sum=-0.566 (3)", - "tab": "Summarization metrics", - "score": -0.18875486064192462 - }, - "XSUM - QAFactEval": { - "description": "min=3.146, mean=3.324, max=3.669, sum=19.946 (6)", - "tab": "Summarization metrics", - "score": 3.3243234460347995 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.416, mean=0.417, max=0.419, sum=1.251 (3)", - "tab": "Summarization metrics", - "score": 0.4169695047035986 - }, - "XSUM - Coverage": { - "description": "min=0.815, mean=0.817, max=0.819, sum=4.904 (6)", - "tab": "Summarization metrics", - "score": 0.8172878337570123 - }, - "XSUM - Density": { - "description": "min=3.708, mean=3.899, max=4.102, sum=23.393 (6)", - "tab": "Summarization metrics", - "score": 3.898863398596404 - }, - "XSUM - Compression": { - "description": "min=18.005, mean=18.414, max=18.872, sum=110.483 (6)", - "tab": "Summarization metrics", - "score": 18.413782867028814 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - 
"additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.906, mean=0.917, max=0.926, sum=2.752 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.289, mean=0.302, max=0.327, sum=0.905 (3)", - "tab": "Calibration", - "score": 0.30155451934186406 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.872, mean=0.886, max=0.901, sum=2.659 (3)", - "tab": "Robustness", - "score": 0.8863333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.898, mean=0.908, max=0.919, sum=2.725 (3)", - "tab": "Fairness", - "score": 0.9083333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.515, mean=0.54, max=0.569, sum=1.62 (3)", - "tab": "Efficiency", - "score": 0.5398914054599924 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.846, mean=4.933, max=4.986, sum=14.798 (3)", - "tab": "General information", - "score": 4.932666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1152.694, mean=1389.454, max=1744.631, sum=4168.363 (3)", - "tab": "General information", - "score": 1389.4543333333331 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506, - "details": { - "description": "min=0, mean=0.506, max=1, sum=27.302 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.299, mean=0.474, max=0.666, sum=25.591 (54)", - "tab": "Calibration", - "score": 0.47391416538592424 - }, - "CivilComments 
- EM (Robustness)": { - "description": "min=0, mean=0.305, max=0.939, sum=16.459 (54)", - "tab": "Robustness", - "score": 0.30478947142198615 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.5, max=1, sum=27.006 (54)", - "tab": "Fairness", - "score": 0.5001070006147802 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.173, mean=0.212, max=0.325, sum=11.459 (54)", - "tab": "Efficiency", - "score": 0.21220531272072915 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.557, - "details": { - "description": "min=0.175, mean=0.557, max=0.975, sum=18.375 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.126, mean=0.468, max=0.975, sum=15.455 (33)", - "tab": "Calibration", - "score": 0.468339884912531 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.405, max=0.85, sum=13.35 (33)", - "tab": "Robustness", - "score": 0.4045454545454546 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.175, mean=0.536, max=0.975, sum=17.7 (33)", - "tab": "Fairness", - "score": 0.5363636363636364 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.069, mean=1.871, max=6.606, sum=61.732 (33)", - "tab": "Efficiency", - "score": 1.8706600076246471 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - 
"tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=5, mean=18.712, max=30, sum=617.5 (33)", - "tab": "General information", - "score": 18.712121212121207 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json b/data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json deleted file mode 100644 index ddcfa82ef..000000000 --- a/data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/microsoft_TNLG-v2-530B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TNLG v2 530B", - "id": "microsoft/TNLG-v2-530B", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.787, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6152996196936993 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.6503510949562118 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7516679834811092 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5308990441173578 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.3298371381704715 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.756578947368421 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - 
"source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.469, - "details": { - "description": "min=0.24, mean=0.469, max=0.78, sum=7.035 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.073, mean=0.127, max=0.202, sum=1.908 (15)", - "tab": "Calibration", - "score": 0.12722994020701678 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.15, mean=0.403, max=0.75, sum=6.051 (15)", - "tab": "Robustness", - "score": 0.40336842105263154 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.17, mean=0.418, max=0.75, sum=6.266 (15)", - "tab": "Fairness", - "score": 0.41770760233918125 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.798, mean=0.809, max=0.829, sum=2.428 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.017, mean=0.048, max=0.088, sum=0.144 (3)", - "tab": "Calibration", - "score": 0.04811928896988451 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.724, mean=0.733, max=0.747, sum=2.198 (3)", - "tab": "Robustness", - "score": 0.7326666666666667 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.756, mean=0.767, max=0.777, sum=2.3 (3)", - "tab": "Fairness", - "score": 0.7666666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", 
- "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "description": "min=0.692, mean=0.722, max=0.743, sum=2.166 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.026, mean=0.05, max=0.075, sum=0.15 (3)", - "tab": "Calibration", - "score": 0.05012197972633472 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.22, mean=0.319, max=0.405, sum=0.957 (3)", - "tab": "Robustness", - "score": 0.31894751591392195 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.601, mean=0.632, max=0.664, sum=1.895 (3)", - "tab": "Fairness", - "score": 0.6318169391667601 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.646, max=2.085, sum=4.938 (3)", - "tab": "General information", - "score": 1.6460093896713615 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1600.366, mean=1651.848, max=1705.003, sum=4955.544 (3)", - "tab": "General information", - "score": 1651.8478873239437 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.113, mean=5.982, max=7.265, sum=17.946 (3)", - "tab": "General information", - "score": 5.982159624413145 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.375, mean=0.395, max=0.436, sum=1.186 (3)", - "tab": "Bias", - "score": 0.3952991452991453 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - 
"NarrativeQA - Representation (gender)": { - "description": "min=0.204, mean=0.221, max=0.239, sum=0.663 (3)", - "tab": "Bias", - "score": 0.22112892189926373 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.012, max=0.014, sum=0.037 (3)", - "tab": "Toxicity", - "score": 0.012206572769953052 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642, - "details": { - "description": "min=0.617, mean=0.642, max=0.656, sum=1.926 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.038, mean=0.04, max=0.041, sum=0.119 (3)", - "tab": "Calibration", - "score": 0.039723290660202144 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.071, mean=0.075, max=0.078, sum=0.225 (3)", - "tab": "Calibration", - "score": 0.07490014228309726 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.291, mean=0.307, max=0.322, sum=0.922 (3)", - "tab": "Robustness", - "score": 0.3074701383832172 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.483, mean=0.525, max=0.549, sum=1.576 (3)", - "tab": "Robustness", - "score": 0.5253631735860874 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.306, mean=0.318, max=0.324, sum=0.953 (3)", - "tab": "Fairness", - "score": 0.3175020164111731 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.575, mean=0.598, max=0.61, sum=1.794 (3)", - "tab": "Fairness", - "score": 0.5979278798197498 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.8, mean=4.569, max=5.632, sum=13.707 (3)", - "tab": "General information", - "score": 4.569 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General 
information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.723, sum=14.072 (3)", - "tab": "General information", - "score": 4.690666666666666 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.056, mean=1419.328, max=1523.222, sum=4257.983 (3)", - "tab": "General information", - "score": 1419.3276666666668 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.953, mean=6.015, max=6.134, sum=18.045 (3)", - "tab": "General information", - "score": 6.015000000000001 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.25, mean=0.342, max=0.443, sum=1.026 (3)", - "tab": "Bias", - "score": 0.342063492063492 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.53, mean=0.559, max=0.573, sum=1.676 (3)", - "tab": "Bias", - "score": 0.5587121212121212 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.206, mean=0.289, max=0.419, sum=0.867 (3)", - "tab": "Bias", - "score": 0.2891147156537034 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.191, mean=0.277, max=0.345, sum=0.83 (3)", - "tab": "Bias", - "score": 0.27656250000000004 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.457, mean=0.469, max=0.484, sum=1.408 (3)", - "tab": "Bias", - "score": 0.4693006584979578 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.254, mean=0.259, max=0.261, sum=0.776 (3)", - "tab": "Bias", - "score": 0.2587447378492154 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.388, mean=0.39, max=0.393, sum=1.171 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.059, mean=0.08, max=0.106, sum=0.241 (3)", - "tab": "Calibration", - "score": 0.08020003145494241 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.183, mean=0.194, max=0.203, sum=0.583 (3)", - "tab": "Robustness", - "score": 
0.19421481147358363 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.304, mean=0.313, max=0.32, sum=0.94 (3)", - "tab": "Fairness", - "score": 0.3132392185201357 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.084, sum=2.831 (3)", - "tab": "General information", - "score": 0.9436666666666667 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1624.371, mean=1644.436, max=1670.589, sum=4933.308 (3)", - "tab": "General information", - "score": 1644.436 - }, - "QuAC - # output tokens": { - "description": "min=25.915, mean=29.956, max=32.756, sum=89.867 (3)", - "tab": "General information", - "score": 29.95566666666667 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.56, mean=0.579, max=0.599, sum=1.738 (3)", - "tab": "Bias", - "score": 0.5794166151309009 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.428, mean=0.435, max=0.448, sum=1.305 (3)", - "tab": "Bias", - "score": 0.43504680341335694 - }, - "QuAC - Representation (race)": { - "description": "min=0.282, mean=0.333, max=0.369, sum=0.999 (3)", - "tab": "Bias", - "score": 0.33315102716024375 - }, - "QuAC - Representation (gender)": { - "description": "min=0.24, mean=0.25, max=0.259, sum=0.75 (3)", - "tab": "Bias", - "score": 0.2499075403684782 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.003, sum=0.008 (3)", - "tab": "Toxicity", - "score": 0.0026666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.799, - "details": { - "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.322 (1)", - "tab": "Calibration", - "score": 0.32242755675811835 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.757, mean=0.757, max=0.757, sum=0.757 (1)", - "tab": "Robustness", - "score": 0.757 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.678, mean=0.678, max=0.678, sum=0.678 (1)", - "tab": "Fairness", - "score": 0.678 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=0.562 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.243, mean=0.243, max=0.243, sum=0.243 (1)", - "tab": "Calibration", - "score": 0.2425759072363007 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.476 (1)", - "tab": "Robustness", - "score": 0.476 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.504, mean=0.504, max=0.504, sum=0.504 (1)", - "tab": "Fairness", - "score": 0.504 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.251, - "details": { - "description": "min=0.22, mean=0.251, max=0.275, sum=0.752 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.174, mean=0.226, max=0.252, sum=0.678 (3)", - "tab": "Calibration", - "score": 0.22594889867402287 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.187, mean=0.202, max=0.217, sum=0.607 (3)", - "tab": "Robustness", - "score": 0.20234454638124363 - }, - "TruthfulQA 
- EM (Fairness)": { - "description": "min=0.177, mean=0.197, max=0.213, sum=0.59 (3)", - "tab": "Fairness", - "score": 0.19673802242609584 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.621, mean=0.643, max=0.662, sum=1.93 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.264, mean=0.287, max=0.315, sum=0.86 (3)", - "tab": "Robustness", - "score": 0.28667883597883553 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.54, mean=0.565, max=0.586, sum=1.696 (3)", - "tab": "Robustness", - "score": 0.5653481865448796 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.332, mean=0.341, max=0.354, sum=1.024 (3)", - "tab": "Fairness", - "score": 0.3414910052910049 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.592, mean=0.612, max=0.629, sum=1.836 (3)", - "tab": "Fairness", - "score": 0.6120938886543282 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.004, mean=1.011, max=1.02, sum=3.034 (3)", - "tab": 
"General information", - "score": 1.0113333333333334 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1.016, max=1.023, sum=3.047 (3)", - "tab": "General information", - "score": 1.0155038759689923 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.161, - "details": { - "description": "min=0.151, mean=0.161, max=0.166, sum=0.966 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - 
"description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=64.44, mean=66.904, max=70.5, sum=401.425 (6)", - "tab": "General information", - "score": 66.9041487839771 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.601, mean=0.629, max=0.647, sum=3.773 (6)", - "tab": "Bias", - "score": 0.6288257738993034 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.377, mean=0.398, max=0.411, sum=2.388 (6)", - "tab": "Bias", - "score": 0.3980717194410541 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.135, mean=0.227, max=0.309, sum=1.359 (6)", - "tab": "Bias", - "score": 0.22651255675216078 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.114, mean=0.12, max=0.124, sum=0.721 (6)", - "tab": "Bias", - "score": 0.12013592572007394 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.017 (6)", - "tab": "Toxicity", - "score": 0.002861230329041488 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.553, mean=0.573, max=0.595, sum=1.718 (3)", - "tab": "Summarization metrics", - "score": 0.5727510890981916 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.296, mean=0.316, max=0.326, sum=0.947 (3)", - "tab": "Summarization metrics", - "score": 0.3157002201673737 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.975, mean=0.977, max=0.981, sum=5.862 (6)", - "tab": "Summarization metrics", - "score": 0.9770276969879915 - }, - "CNN/DailyMail - Density": { - "description": "min=25.944, mean=26.968, max=27.893, sum=161.808 (6)", - "tab": "Summarization metrics", - "score": 26.967920888770376 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.708, mean=10.317, max=10.928, sum=61.905 (6)", - "tab": "Summarization metrics", - "score": 10.317434111699901 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.169, - "details": { - "description": "min=0.162, mean=0.169, max=0.172, sum=1.013 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=27.172, mean=27.501, max=27.815, sum=165.008 (6)", - "tab": "General information", - "score": 27.501287001287 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.443, mean=0.449, max=0.459, sum=2.696 (6)", - "tab": "Bias", - "score": 0.4493607590885817 - }, - "XSUM - Representation (race)": { - "description": "min=0.362, mean=0.486, max=0.567, sum=2.914 (6)", - "tab": "Bias", - "score": 0.4857302118171683 - }, - "XSUM - Representation (gender)": { - "description": "min=0.195, mean=0.204, max=0.217, sum=1.223 (6)", - "tab": "Bias", - "score": 0.2037662889603199 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.003, max=0.004, sum=0.015 (6)", - "tab": "Toxicity", - "score": 0.002574002574002574 - }, - "XSUM - SummaC": { - "description": "min=-0.297, mean=-0.281, max=-0.266, sum=-0.842 (3)", - "tab": "Summarization metrics", - "score": -0.2807751739040458 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.472, mean=0.473, max=0.476, sum=1.42 (3)", - "tab": "Summarization metrics", - "score": 0.4734549353569219 - }, - "XSUM - Coverage": { - "description": "min=0.772, mean=0.774, max=0.777, sum=4.641 (6)", - "tab": "Summarization metrics", - "score": 0.7735373951395458 - }, - "XSUM - Density": { - "description": "min=2.174, mean=2.322, max=2.471, sum=13.929 (6)", - "tab": "Summarization metrics", - "score": 2.321577703631062 - }, - "XSUM - Compression": { - "description": "min=15.596, mean=15.776, max=15.931, sum=94.655 (6)", - "tab": "Summarization metrics", - "score": 15.775903485860036 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.939, mean=0.941, max=0.942, sum=2.822 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.065, mean=0.087, max=0.106, sum=0.262 (3)", - "tab": "Calibration", - "score": 0.08729270886734875 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.92, mean=0.921, max=0.922, sum=2.763 (3)", - "tab": "Robustness", - "score": 0.9210000000000002 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.933, mean=0.936, max=0.94, sum=2.807 (3)", - "tab": "Fairness", - "score": 0.9356666666666666 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.845, mean=4.932, max=4.985, sum=14.796 (3)", - "tab": "General information", - "score": 4.9319999999999995 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1152.524, mean=1389.183, max=1743.988, sum=4167.55 (3)", - "tab": "General information", - "score": 1389.1833333333332 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.601, - "details": { - "description": "min=0.171, mean=0.601, max=0.983, sum=32.472 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.058, mean=0.213, max=0.447, sum=11.516 (54)", - "tab": "Calibration", - "score": 0.2132557883443423 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.069, mean=0.409, max=0.689, sum=22.106 (54)", - "tab": "Robustness", - "score": 0.4093704023963013 - }, - 
"CivilComments - EM (Fairness)": { - "description": "min=0.047, mean=0.48, max=0.97, sum=25.944 (54)", - "tab": "Fairness", - "score": 0.48044223702694133 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "description": "min=0.025, mean=0.679, max=0.975, sum=22.4 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.089, mean=0.244, max=0.908, sum=8.049 (33)", - "tab": "Calibration", - "score": 0.24392205141094134 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.545, max=0.85, sum=17.975 (33)", - "tab": "Robustness", - "score": 0.5446969696969698 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.025, mean=0.644, max=0.975, sum=21.25 (33)", - "tab": "Fairness", - "score": 0.6439393939393939 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", 
- "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=0.15, mean=3.023, max=6.625, sum=99.75 (33)", - "tab": "General information", - "score": 3.022727272727273 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json b/data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json deleted file mode 100644 index b3f527a04..000000000 --- a/data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/microsoft_TNLG-v2-6.7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TNLG v2 6.7B", - "id": "microsoft/TNLG-v2-6.7B", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.309, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.60170195635043 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.2395553093550869 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.2912077355347656 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.43656162406269206 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4445961445961446 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.611842105263158 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] 
- }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.242, - "details": { - "description": "min=0.2, mean=0.242, max=0.35, sum=3.627 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.103, mean=0.132, max=0.175, sum=1.983 (15)", - "tab": "Calibration", - "score": 0.13220035950695058 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.09, mean=0.169, max=0.24, sum=2.542 (15)", - "tab": "Robustness", - "score": 0.1694970760233918 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.17, mean=0.212, max=0.31, sum=3.186 (15)", - "tab": "Fairness", - "score": 0.2124327485380117 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.685, mean=0.698, max=0.709, sum=2.095 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.063, mean=0.065, max=0.067, sum=0.195 (3)", - "tab": "Calibration", - "score": 0.06514212406382298 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.623, mean=0.638, max=0.653, sum=1.914 (3)", - "tab": "Robustness", - "score": 0.638 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.649, mean=0.665, max=0.674, sum=1.996 (3)", - "tab": "Fairness", - "score": 0.6653333333333333 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.631, - "details": { - "description": "min=0.612, mean=0.631, max=0.644, sum=1.893 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.045, mean=0.046, max=0.047, sum=0.138 (3)", - "tab": "Calibration", - "score": 0.0461090042242735 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.314, mean=0.352, max=0.375, sum=1.056 (3)", - "tab": "Robustness", - "score": 0.35196743378602896 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.492, mean=0.517, max=0.532, sum=1.552 (3)", - "tab": "Fairness", - "score": 0.5173113464127798 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.646, max=2.085, sum=4.938 (3)", - "tab": "General information", - "score": 1.6460093896713615 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1600.366, mean=1651.848, max=1705.003, sum=4955.544 (3)", - "tab": "General information", - "score": 1651.8478873239437 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.189, mean=6.499, max=7.989, sum=19.496 (3)", - "tab": "General information", - "score": 6.498591549295774 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.46, mean=0.476, max=0.5, sum=1.429 (3)", - "tab": "Bias", - "score": 0.47625 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.203, mean=0.212, max=0.221, sum=0.637 (3)", - "tab": "Bias", - "score": 0.21227319042207152 - }, - 
"NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.011, max=0.014, sum=0.034 (3)", - "tab": "Toxicity", - "score": 0.011267605633802816 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.532, mean=0.561, max=0.585, sum=1.683 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.028, mean=0.031, max=0.033, sum=0.093 (3)", - "tab": "Calibration", - "score": 0.031006448164221535 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.071, mean=0.089, max=0.108, sum=0.266 (3)", - "tab": "Calibration", - "score": 0.08866228023213817 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.144, mean=0.149, max=0.159, sum=0.448 (3)", - "tab": "Robustness", - "score": 0.149387882661448 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.215, mean=0.299, max=0.355, sum=0.896 (3)", - "tab": "Robustness", - "score": 0.2985499982493553 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.152, mean=0.162, max=0.17, sum=0.485 (3)", - "tab": "Fairness", - "score": 0.16163226517271406 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.463, mean=0.501, max=0.532, sum=1.502 (3)", - "tab": "Fairness", - "score": 0.5005776676014201 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.189, mean=5.6, max=5.896, sum=16.8 (3)", - "tab": "General information", - "score": 5.6000000000000005 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.723, sum=14.072 (3)", - "tab": 
"General information", - "score": 4.690666666666666 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.056, mean=1419.328, max=1523.222, sum=4257.983 (3)", - "tab": "General information", - "score": 1419.3276666666668 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.244, mean=8.369, max=10.389, sum=25.107 (3)", - "tab": "General information", - "score": 8.369 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.494, mean=0.498, max=0.5, sum=1.494 (3)", - "tab": "Bias", - "score": 0.4981481481481482 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.32, mean=0.479, max=0.588, sum=1.437 (3)", - "tab": "Bias", - "score": 0.47890062007709067 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.179, mean=0.274, max=0.437, sum=0.821 (3)", - "tab": "Bias", - "score": 0.2737208807573663 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.167, mean=0.333, max=0.417, sum=1.0 (3)", - "tab": "Bias", - "score": 0.3333333333333333 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.399, mean=0.446, max=0.489, sum=1.338 (3)", - "tab": "Bias", - "score": 0.4460824634464231 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.115, mean=0.228, max=0.345, sum=0.684 (3)", - "tab": "Bias", - "score": 0.22804989848201077 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.003, sum=0.007 (3)", - "tab": "Toxicity", - "score": 0.0023333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.345, - "details": { - "description": "min=0.334, mean=0.345, max=0.365, sum=1.034 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.046, mean=0.056, max=0.064, sum=0.169 (3)", - "tab": "Calibration", - "score": 0.056431419773363155 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.143, mean=0.159, max=0.17, sum=0.477 (3)", - "tab": "Robustness", - "score": 0.1590786964332521 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.26, mean=0.267, max=0.281, sum=0.801 (3)", - "tab": "Fairness", - "score": 
0.26693937921563893 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.084, sum=2.831 (3)", - "tab": "General information", - "score": 0.9436666666666667 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1624.371, mean=1644.436, max=1670.589, sum=4933.308 (3)", - "tab": "General information", - "score": 1644.436 - }, - "QuAC - # output tokens": { - "description": "min=17.622, mean=19.574, max=21.058, sum=58.723 (3)", - "tab": "General information", - "score": 19.574333333333332 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.598, mean=0.618, max=0.639, sum=1.855 (3)", - "tab": "Bias", - "score": 0.6181852538995397 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.451, mean=0.472, max=0.486, sum=1.416 (3)", - "tab": "Bias", - "score": 0.47198334521620583 - }, - "QuAC - Representation (race)": { - "description": "min=0.32, mean=0.351, max=0.412, sum=1.054 (3)", - "tab": "Bias", - "score": 0.35120217651448443 - }, - "QuAC - Representation (gender)": { - "description": "min=0.213, mean=0.232, max=0.259, sum=0.695 (3)", - "tab": "Bias", - "score": 0.23164076323994623 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.704, mean=0.704, max=0.704, sum=0.704 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.268, mean=0.268, max=0.268, sum=0.268 (1)", - "tab": "Calibration", - "score": 0.2676753668258396 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)", - "tab": "Robustness", - "score": 0.656 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=0.53 (1)", - "tab": "Fairness", - "score": 0.53 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": 
"min=87.888, mean=87.888, max=87.888, sum=87.888 (1)", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478, - "details": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.478 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.282 (1)", - "tab": "Calibration", - "score": 0.28175565698884514 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.408 (1)", - "tab": "Robustness", - "score": 0.408 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)", - "tab": "Fairness", - "score": 0.412 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.167, - "details": { - "description": "min=0.156, mean=0.167, max=0.173, sum=0.5 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.101, mean=0.117, max=0.128, sum=0.35 (3)", - "tab": "Calibration", - "score": 0.11656099093897697 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.128, mean=0.136, max=0.148, sum=0.408 (3)", - "tab": "Robustness", - "score": 0.13608562691131498 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.133, mean=0.144, max=0.162, sum=0.431 (3)", - "tab": "Fairness", - "score": 0.1437308868501529 
- }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.332, - "details": { - "description": "min=0.273, mean=0.332, max=0.382, sum=0.997 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.074, mean=0.105, max=0.125, sum=0.315 (3)", - "tab": "Robustness", - "score": 0.1048433862433863 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.227, mean=0.278, max=0.312, sum=0.835 (3)", - "tab": "Robustness", - "score": 0.2783978738136928 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.109, mean=0.14, max=0.166, sum=0.419 (3)", - "tab": "Fairness", - "score": 0.13970383597883587 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.256, mean=0.317, max=0.363, sum=0.95 (3)", - "tab": "Fairness", - "score": 0.31652617829212154 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.028, mean=1.067, max=1.136, sum=3.2 (3)", - "tab": "General information", - "score": 1.0666666666666667 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 
(3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1.047, mean=1.047, max=1.047, sum=3.14 (3)", - "tab": "General information", - "score": 1.0465116279069768 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.146, - "details": { - "description": "min=0.139, mean=0.146, max=0.157, sum=0.877 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=70.732, mean=83.556, max=100.29, sum=501.335 (6)", - "tab": "General information", - "score": 83.55579399141631 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.605, mean=0.616, max=0.623, sum=3.698 (6)", - "tab": "Bias", - "score": 0.6163696620441931 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.387, mean=0.404, max=0.42, sum=2.422 (6)", - "tab": "Bias", - "score": 0.4036032258152607 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.306, mean=0.326, max=0.352, sum=1.955 (6)", - "tab": "Bias", - "score": 0.32584352768289004 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.125, mean=0.146, max=0.173, sum=0.878 (6)", - "tab": "Bias", - "score": 0.1463963556163381 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.487, mean=0.493, max=0.501, sum=1.48 (3)", - "tab": "Summarization metrics", - "score": 0.4933195613927493 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.278, mean=0.282, max=0.284, sum=0.845 (3)", - "tab": "Summarization metrics", - "score": 0.2815425075266347 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.973, mean=0.976, max=0.981, sum=5.857 (6)", - "tab": "Summarization metrics", - "score": 0.9761546866038108 - }, - "CNN/DailyMail - Density": { - "description": "min=38.053, mean=48.951, max=68.464, sum=293.707 (6)", - "tab": "Summarization metrics", - "score": 48.951173188846475 - }, - "CNN/DailyMail - Compression": { - "description": "min=7.327, mean=9.598, max=11.919, sum=57.585 (6)", - "tab": "Summarization metrics", - "score": 9.59754128304669 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.11, - "details": { - "description": "min=0.107, mean=0.11, max=0.113, sum=0.661 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 
matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=23.276, mean=23.579, max=24.127, sum=141.471 (6)", - "tab": "General information", - "score": 23.578507078507084 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.451, mean=0.462, max=0.473, sum=2.775 (6)", - "tab": "Bias", - "score": 0.46245791245791246 - }, - "XSUM - Representation (race)": { - "description": "min=0.373, mean=0.489, max=0.579, sum=2.933 (6)", - "tab": "Bias", - "score": 0.4888826343934703 - }, - "XSUM - Representation (gender)": { - "description": "min=0.136, mean=0.182, max=0.23, sum=1.089 (6)", - "tab": "Bias", - "score": 0.18150391082886233 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.217, mean=-0.203, max=-0.192, sum=-0.61 (3)", - "tab": "Summarization metrics", - "score": -0.20340532606019324 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.38, mean=0.385, max=0.394, sum=1.156 (3)", - "tab": "Summarization metrics", - "score": 0.3853545238949662 - }, - "XSUM - Coverage": { - "description": "min=0.786, mean=0.793, max=0.801, sum=4.757 (6)", - "tab": "Summarization metrics", - "score": 0.792833262373014 - }, - "XSUM - Density": { - "description": "min=3.215, mean=3.286, max=3.34, sum=19.716 (6)", - "tab": "Summarization metrics", - "score": 3.2859287054515427 - }, - "XSUM - Compression": { - "description": "min=17.984, mean=18.428, max=18.968, sum=110.571 (6)", - "tab": "Summarization metrics", - "score": 18.428451341381788 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.923, mean=0.927, max=0.934, sum=2.782 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.093, mean=0.118, max=0.136, sum=0.355 (3)", - "tab": "Calibration", - "score": 0.11832833491942714 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.883, mean=0.896, max=0.909, sum=2.687 (3)", - "tab": "Robustness", - "score": 0.8956666666666667 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.904, mean=0.912, max=0.922, sum=2.737 (3)", - "tab": "Fairness", - "score": 0.9123333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.845, mean=4.932, max=4.985, sum=14.796 (3)", - "tab": "General information", - "score": 4.9319999999999995 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1152.524, mean=1389.183, max=1743.988, sum=4167.55 (3)", - "tab": "General information", - "score": 1389.1833333333332 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532, - "details": { - "description": "min=0.053, mean=0.532, max=0.955, sum=28.701 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.052, mean=0.248, max=0.54, sum=13.38 (54)", - "tab": "Calibration", - "score": 0.24778001352805415 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.022, mean=0.336, max=0.831, sum=18.169 (54)", - "tab": "Robustness", - "score": 0.336456419012055 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.042, mean=0.473, max=0.947, sum=25.533 (54)", - "tab": "Fairness", - "score": 0.4728366689674401 - }, - "CivilComments - Denoised inference time (s)": { - 
"description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525, - "details": { - "description": "min=0.025, mean=0.525, max=0.975, sum=17.325 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.103, mean=0.314, max=0.912, sum=10.346 (33)", - "tab": "Calibration", - "score": 0.31351556505949635 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.445, max=0.95, sum=14.675 (33)", - "tab": "Robustness", - "score": 0.4446969696969697 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.025, mean=0.502, max=0.975, sum=16.55 (33)", - "tab": "Fairness", - "score": 0.5015151515151516 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=0.15, mean=2.76, max=6.175, sum=91.075 (33)", - "tab": "General information", - "score": 2.7598484848484848 - }, - "RAFT - # 
trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json b/data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json deleted file mode 100644 index 1fd56a99f..000000000 --- a/data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/mistralai_Mistral-v0.1-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral v0.1 7B", - "id": "mistralai/Mistral-v0.1-7B", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.8963869463869464 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.8611188811188811 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5247457047269077 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4297202797202797 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - 
"details": { - "description": "min=0.28, mean=0.572, max=0.84, sum=2.861 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.24, mean=0.533, max=0.82, sum=2.666 (5)", - "tab": "Robustness", - "score": 0.5332280701754385 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.27, mean=0.542, max=0.83, sum=2.709 (5)", - "tab": "Fairness", - "score": 0.541719298245614 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.874, - "details": { - "description": "min=0.874, mean=0.874, max=0.874, sum=0.874 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.837, mean=0.837, max=0.837, sum=0.837 (1)", - "tab": "Robustness", - "score": 0.837 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.842, mean=0.842, max=0.842, sum=0.842 (1)", - "tab": "Fairness", - "score": 0.842 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1418.259, mean=1418.259, max=1418.259, sum=1418.259 (1)", - "tab": "General information", - "score": 1418.259 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - 
"score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.716, - "details": { - "description": "min=0.716, mean=0.716, max=0.716, sum=0.716 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.649, mean=0.649, max=0.649, sum=0.649 (1)", - "tab": "Robustness", - "score": 0.6485445694648198 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.644, mean=0.644, max=0.644, sum=0.644 (1)", - "tab": "Fairness", - "score": 0.6436697691254157 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.575, mean=4.575, max=4.575, sum=4.575 (1)", - "tab": "General information", - "score": 4.574647887323944 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3627.715, mean=3627.715, max=3627.715, sum=3627.715 (1)", - "tab": "General information", - "score": 3627.7154929577464 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.173, mean=0.173, max=0.173, sum=0.173 (1)", - "tab": "Bias", - "score": 0.1730769230769231 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.008, max=0.008, sum=0.008 (1)", - "tab": "Toxicity", - "score": 0.008450704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687, - "details": { - "description": "min=0.687, mean=0.687, max=0.687, sum=0.687 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.305 (1)", - "tab": "Robustness", - "score": 0.3052498746141498 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.631, mean=0.631, max=0.631, sum=0.631 (1)", - "tab": "Robustness", - "score": 0.6314234953832969 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)", - "tab": "Fairness", - "score": 0.30018094571517623 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)", - "tab": "Fairness", - "score": 0.6249254915559919 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.832, mean=4.832, max=4.832, sum=4.832 (1)", - "tab": "General information", - "score": 4.832 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2268.728, mean=2268.728, max=2268.728, sum=2268.728 (1)", - "tab": "General information", - "score": 2268.728 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.987, 
mean=0.987, max=0.987, sum=0.987 (1)", - "tab": "General information", - "score": 0.987 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.25 (1)", - "tab": "Bias", - "score": 0.25 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.287 (1)", - "tab": "Bias", - "score": 0.28746177370030584 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.065, mean=0.065, max=0.065, sum=0.065 (1)", - "tab": "Bias", - "score": 0.06521739130434784 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.439 (1)", - "tab": "Bias", - "score": 0.4385964912280702 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.48 (1)", - "tab": "Bias", - "score": 0.48000000000000004 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423, - "details": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.423 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.31 (1)", - "tab": "Robustness", - "score": 0.3098633908730089 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)", - "tab": "Fairness", - "score": 0.3528008659962099 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=3.44, mean=3.44, max=3.44, sum=3.44 (1)", - "tab": "General information", - "score": 3.44 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General 
information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=3680.143, mean=3680.143, max=3680.143, sum=3680.143 (1)", - "tab": "General information", - "score": 3680.143 - }, - "QuAC - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.621, mean=0.621, max=0.621, sum=0.621 (1)", - "tab": "Bias", - "score": 0.6213450292397661 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)", - "tab": "Bias", - "score": 0.4119047619047619 - }, - "QuAC - Representation (race)": { - "description": "min=0.274, mean=0.274, max=0.274, sum=0.274 (1)", - "tab": "Bias", - "score": 0.27356321839080466 - }, - "QuAC - Representation (gender)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.248 (1)", - "tab": "Bias", - "score": 0.2479564032697547 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "Toxicity", - "score": 0.003 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - 
"details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422, - "details": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.422 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.339 (1)", - "tab": "Robustness", - "score": 0.3394495412844037 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)", - "tab": "Fairness", - "score": 0.3318042813455658 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - 
"score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No 
matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=0.962 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=0.954 (1)", - "tab": "Robustness", - "score": 0.954 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.952, mean=0.952, max=0.952, sum=0.952 (1)", - "tab": "Fairness", - "score": 0.952 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=2811.31, mean=2811.31, max=2811.31, sum=2811.31 (1)", - "tab": "General information", - "score": 2811.31 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624, - "details": { - "description": "min=0.219, mean=0.624, max=0.874, sum=11.24 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.123, mean=0.521, max=0.842, sum=9.37 (18)", - "tab": "Robustness", - "score": 0.5205335787071343 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.06, mean=0.52, max=0.863, sum=9.357 (18)", - "tab": "Fairness", - "score": 0.5198588163222009 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, 
sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=390.28, mean=831.904, max=1394.234, sum=14974.265 (18)", - "tab": "General information", - "score": 831.9036212109548 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.707, - "details": { - "description": "min=0.1, mean=0.707, max=0.975, sum=7.775 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.652, max=0.975, sum=7.175 (11)", - "tab": "Robustness", - "score": 0.6522727272727272 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.1, mean=0.664, max=0.975, sum=7.3 (11)", - "tab": "Fairness", - "score": 0.6636363636363636 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=2.675, mean=4.789, max=5, sum=52.675 (11)", - "tab": "General information", - "score": 4.788636363636363 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=0, mean=328.595, max=3614.55, sum=3614.55 (11)", - "tab": "General information", - "score": 328.5954545454546 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes 
(gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json b/data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json deleted file mode 100644 index b0d1817b0..000000000 --- a/data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/mosaicml_MPT-30B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MPT 30B", - "id": "mosaicml/MPT-30B", - "developer": "mosaicml", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.6966666666666667 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7464102564102564 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.2946998974900761 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.44918414918414923 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437, - "details": { - "description": "min=0.25, mean=0.437, max=0.68, sum=2.183 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": 
"min=0.25, mean=0.381, max=0.6, sum=1.904 (5)", - "tab": "Robustness", - "score": 0.38087719298245615 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.24, mean=0.41, max=0.64, sum=2.049 (5)", - "tab": "Fairness", - "score": 0.40989473684210526 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.704, mean=0.704, max=0.704, sum=0.704 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)", - "tab": "Robustness", - "score": 0.656 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.631, mean=0.631, max=0.631, sum=0.631 (1)", - "tab": "Fairness", - "score": 0.631 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation 
(gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.732, - "details": { - "description": "min=0.732, mean=0.732, max=0.732, sum=0.732 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.584, mean=0.584, max=0.584, sum=0.584 (1)", - "tab": "Robustness", - "score": 0.5840358182644836 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.653, mean=0.653, max=0.653, sum=0.653 (1)", - "tab": "Fairness", - "score": 0.6525810359656932 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.238 (1)", - "tab": "Bias", - "score": 0.2377049180327869 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "Toxicity", - "score": 0.01971830985915493 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.673, mean=0.673, max=0.673, sum=0.673 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)", - "tab": "Robustness", - "score": 0.2720121639433268 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.609, mean=0.609, max=0.609, sum=0.609 (1)", - "tab": "Robustness", - "score": 0.6094875286076354 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.287 (1)", - "tab": "Fairness", - "score": 0.28717918481295357 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)", - "tab": "Fairness", - "score": 0.6239999868788104 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=0.993 (1)", - "tab": "General information", - "score": 0.993 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - 
"NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.3333333333333333 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.088, mean=0.088, max=0.088, sum=0.088 (1)", - "tab": "Bias", - "score": 0.08823529411764708 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.527, mean=0.527, max=0.527, sum=0.527 (1)", - "tab": "Bias", - "score": 0.5268817204301075 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.18, mean=0.18, max=0.18, sum=0.18 (1)", - "tab": "Bias", - "score": 0.17999999999999997 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393, - "details": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.393 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.231 (1)", - "tab": "Robustness", - "score": 0.23071567735549398 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.318 (1)", - "tab": "Fairness", - "score": 0.3176438145195143 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=0.997, mean=0.997, max=0.997, sum=0.997 (1)", - "tab": "General information", - "score": 
0.997 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.413 (1)", - "tab": "Bias", - "score": 0.4133540372670807 - }, - "QuAC - Representation (race)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.443 (1)", - "tab": "Bias", - "score": 0.4433656957928802 - }, - "QuAC - Representation (gender)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.279 (1)", - "tab": "Bias", - "score": 0.27914110429447847 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - 
"description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.231, - "details": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.231 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.177, mean=0.177, max=0.177, sum=0.177 (1)", - "tab": "Robustness", - "score": 0.17737003058103976 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.19, mean=0.19, max=0.19, sum=0.19 (1)", - "tab": "Fairness", - "score": 0.18960244648318042 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - 
"description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.959, - "details": { - "description": "min=0.959, mean=0.959, max=0.959, sum=0.959 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": 
"Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)", - "tab": "Robustness", - "score": 0.942 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.955, mean=0.955, max=0.955, sum=0.955 (1)", - "tab": "Fairness", - "score": 0.955 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599, - "details": { - "description": "min=0.121, mean=0.599, max=0.951, sum=10.782 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.112, mean=0.484, max=0.81, sum=8.708 (18)", - "tab": "Robustness", - "score": 0.4837936253587437 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.073, mean=0.553, max=0.939, sum=9.947 (18)", - "tab": "Fairness", - "score": 0.5526050039546541 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - 
"CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.45, mean=0.723, max=0.975, sum=7.95 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.075, mean=0.58, max=0.975, sum=6.375 (11)", - "tab": "Robustness", - "score": 0.5795454545454546 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.35, mean=0.68, max=0.975, sum=7.475 (11)", - "tab": "Fairness", - "score": 0.6795454545454546 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=0.725, mean=0.975, max=1, sum=10.725 (11)", - "tab": "General information", - "score": 0.975 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation 
(gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json b/data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json deleted file mode 100644 index 771c4ac02..000000000 --- a/data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/mosaicml_MPT-Instruct-30B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MPT-Instruct 30B", - "id": "mosaicml/MPT-Instruct-30B", - "developer": "mosaicml", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.716, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.6561072261072262 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6874125874125874 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.3616994955593857 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.2453962703962704 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "description": "min=0.3, mean=0.444, max=0.64, sum=2.222 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.22, mean=0.383, max=0.59, sum=1.913 (5)", - "tab": "Robustness", - "score": 0.3826315789473684 - }, - "MMLU - EM (Fairness)": { - 
"description": "min=0.24, mean=0.4, max=0.61, sum=2.002 (5)", - "tab": "Fairness", - "score": 0.40038596491228073 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=0.85 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.77, mean=0.77, max=0.77, sum=0.77 (1)", - "tab": "Robustness", - "score": 0.77 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.807, mean=0.807, max=0.807, sum=0.807 (1)", - "tab": "Fairness", - "score": 0.807 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no 
matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=0.733 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.623, mean=0.623, max=0.623, sum=0.623 (1)", - "tab": "Robustness", - "score": 0.6233490338408667 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.633, mean=0.633, max=0.633, sum=0.633 (1)", - "tab": "Fairness", - "score": 0.6330893045624563 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.224, mean=0.224, max=0.224, sum=0.224 (1)", - "tab": "Bias", - "score": 0.22357723577235772 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.017, max=0.017, sum=0.017 (1)", - "tab": "Toxicity", - "score": 0.016901408450704224 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.697, - "details": { - "description": "min=0.697, mean=0.697, max=0.697, sum=0.697 (1)", - "tab": 
"Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.202, mean=0.202, max=0.202, sum=0.202 (1)", - "tab": "Robustness", - "score": 0.20213849058578032 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.607, mean=0.607, max=0.607, sum=0.607 (1)", - "tab": "Robustness", - "score": 0.6065652552159236 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.233 (1)", - "tab": "Fairness", - "score": 0.23301952773256637 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.639, mean=0.639, max=0.639, sum=0.639 (1)", - "tab": "Fairness", - "score": 0.6392400021633227 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=0.994 (1)", - "tab": "General information", - "score": 0.994 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes 
(gender)": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Bias", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.542, mean=0.542, max=0.542, sum=0.542 (1)", - "tab": "Bias", - "score": 0.5416666666666667 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.091, mean=0.091, max=0.091, sum=0.091 (1)", - "tab": "Bias", - "score": 0.09090909090909088 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.493 (1)", - "tab": "Bias", - "score": 0.4931129476584022 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.286 (1)", - "tab": "Bias", - "score": 0.2857142857142857 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327, - "details": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.327 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.204 (1)", - "tab": "Robustness", - "score": 0.20366013650654988 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.252, mean=0.252, max=0.252, sum=0.252 (1)", - "tab": "Fairness", - "score": 0.2519147363869601 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=0.998, mean=0.998, max=0.998, sum=0.998 (1)", - "tab": "General information", - "score": 0.998 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - 
"score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.426 (1)", - "tab": "Bias", - "score": 0.42553763440860215 - }, - "QuAC - Representation (race)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.407 (1)", - "tab": "Bias", - "score": 0.4074074074074074 - }, - "QuAC - Representation (gender)": { - "description": "min=0.232, mean=0.232, max=0.232, sum=0.232 (1)", - "tab": "Bias", - "score": 0.23239436619718312 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "Toxicity", - "score": 0.003 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": 
{ - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.234, - "details": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.234 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.177, mean=0.177, max=0.177, sum=0.177 (1)", - "tab": "Robustness", - "score": 0.17737003058103976 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.18, mean=0.18, max=0.18, sum=0.18 (1)", - "tab": "Fairness", - "score": 0.18042813455657492 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 
(Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": 
{} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on 
XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.956, - "details": { - "description": "min=0.956, mean=0.956, max=0.956, sum=0.956 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)", - "tab": "Robustness", - "score": 0.942 - }, - 
"IMDB - EM (Fairness)": { - "description": "min=0.944, mean=0.944, max=0.944, sum=0.944 (1)", - "tab": "Fairness", - "score": 0.944 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.573, - "details": { - "description": "min=0.119, mean=0.573, max=0.967, sum=10.316 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.042, mean=0.408, max=0.867, sum=7.353 (18)", - "tab": "Robustness", - "score": 0.40848129232892094 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.085, mean=0.527, max=0.95, sum=9.488 (18)", - "tab": "Fairness", - "score": 0.5271340155324973 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": 
"min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "details": { - "description": "min=0.425, mean=0.68, max=0.9, sum=7.475 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.075, mean=0.548, max=0.875, sum=6.025 (11)", - "tab": "Robustness", - "score": 0.5477272727272727 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.4, mean=0.636, max=0.825, sum=7 (11)", - "tab": "Fairness", - "score": 0.6363636363636364 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - 
"generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json b/data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json deleted file mode 100644 index 20a0f0d63..000000000 --- a/data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_GPT-J-6B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-J 6B", - "id": "openai/GPT-J-6B", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.273, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.4640964584689531 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.29051104623963353 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.2899930436637889 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6008771929824561 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4572430192172563 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.24521373688040354 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5489557226399332 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.249, - "details": { - "description": "min=0.14, mean=0.249, max=0.3, sum=3.728 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.062, mean=0.115, max=0.149, sum=1.732 (15)", - "tab": "Calibration", - "score": 0.11546362297486105 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.11, mean=0.217, max=0.28, sum=3.262 (15)", - "tab": "Robustness", - "score": 0.2174502923976608 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.13, mean=0.22, max=0.27, sum=3.294 (15)", - "tab": "Fairness", - "score": 0.21961403508771932 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.066, mean=0.07, max=0.072, 
sum=1.05 (15)", - "tab": "Efficiency", - "score": 0.06997480863135229 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.646, mean=0.649, max=0.65, sum=1.946 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.043, mean=0.062, max=0.086, sum=0.187 (3)", - "tab": "Calibration", - "score": 0.062432673938629946 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.608, mean=0.621, max=0.631, sum=1.863 (3)", - "tab": "Robustness", - "score": 0.621 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.638, mean=0.639, max=0.64, sum=1.916 (3)", - "tab": "Fairness", - "score": 0.6386666666666666 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.354, mean=0.499, max=0.575, sum=1.497 (3)", - "tab": "Efficiency", - "score": 0.49915384031836946 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": 
{} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.545, - "details": { - "description": "min=0.54, mean=0.545, max=0.554, sum=1.634 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.189, mean=0.199, max=0.211, sum=0.596 (3)", - "tab": "Calibration", - "score": 0.19883043691040034 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.099, mean=0.135, max=0.156, sum=0.405 (3)", - "tab": "Robustness", - "score": 0.1349521611222693 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.417, mean=0.433, max=0.448, sum=1.3 (3)", - "tab": "Fairness", - "score": 0.43317656281615613 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.988, mean=1.311, max=1.513, sum=3.934 (3)", - "tab": "Efficiency", - "score": 1.311420011868712 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=42.766, mean=56.052, max=70.845, sum=168.155 (3)", - "tab": "General information", - "score": 56.05164319248826 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.416, mean=0.451, max=0.5, sum=1.353 (3)", - "tab": "Bias", - "score": 0.4510416666666666 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.204, mean=0.217, max=0.229, sum=0.651 (3)", - "tab": "Bias", - "score": 0.21710889248239795 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.021, max=0.025, sum=0.062 (3)", - "tab": "Toxicity", - "score": 0.020657276995305163 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.559, - "details": { - "description": "min=0.548, mean=0.559, max=0.57, sum=1.677 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.069, mean=0.075, max=0.079, sum=0.224 (3)", - "tab": "Calibration", - "score": 0.07464671252737104 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.346, mean=0.354, max=0.358, sum=1.062 (3)", - "tab": "Calibration", - "score": 0.3539383109024162 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.09, mean=0.099, max=0.109, sum=0.298 (3)", - "tab": "Robustness", - "score": 0.09933930594531819 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.185, mean=0.228, max=0.265, sum=0.683 (3)", - "tab": "Robustness", - "score": 0.22767804828628146 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.112, mean=0.122, max=0.128, sum=0.365 (3)", - "tab": "Fairness", - "score": 0.12161534757794057 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.475, mean=0.493, max=0.505, sum=1.479 (3)", - "tab": "Fairness", - "score": 0.4930833990161269 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=1.626, mean=1.777, max=1.998, sum=5.331 (3)", - "tab": "Efficiency", - "score": 1.77691167926379 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=3.687, mean=3.866, max=4.016, sum=11.599 (3)", - "tab": "Efficiency", - "score": 3.8663324384530373 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=273.408, mean=282.837, max=296.556, sum=848.512 (3)", - "tab": "General information", - "score": 282.83733333333333 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=234.154, mean=247.23, max=261.681, sum=741.689 (3)", - "tab": "General information", - "score": 247.22966666666665 - }, - "NaturalQuestions (open-book) - # trials": { - 
"description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.373, mean=0.49, max=0.553, sum=1.47 (3)", - "tab": "Bias", - "score": 0.49013920663848926 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.071, mean=0.192, max=0.38, sum=0.576 (3)", - "tab": "Bias", - "score": 0.19214285714285717 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.484, mean=0.524, max=0.561, sum=1.571 (3)", - "tab": "Bias", - "score": 0.5236086934551658 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.289, mean=0.317, max=0.333, sum=0.95 (3)", - "tab": "Bias", - "score": 0.3167977414801371 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.322, mean=0.33, max=0.335, sum=0.989 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.104, mean=0.13, max=0.169, sum=0.391 (3)", - "tab": "Calibration", - "score": 0.13037730069459044 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.14, mean=0.147, max=0.155, sum=0.44 (3)", - "tab": "Robustness", - "score": 0.14672783806116493 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.245, mean=0.249, max=0.258, sum=0.748 (3)", - "tab": "Fairness", - "score": 0.2494842989068126 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.354, mean=1.389, max=1.411, sum=4.166 (3)", - "tab": "Efficiency", - "score": 1.3887290514336688 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { 
- "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=64.208, mean=68.54, max=71.626, sum=205.621 (3)", - "tab": "General information", - "score": 68.54033333333334 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.568, mean=0.613, max=0.641, sum=1.838 (3)", - "tab": "Bias", - "score": 0.6126959460292795 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.41, mean=0.43, max=0.447, sum=1.29 (3)", - "tab": "Bias", - "score": 0.4301368170697724 - }, - "QuAC - Representation (race)": { - "description": "min=0.232, mean=0.266, max=0.294, sum=0.798 (3)", - "tab": "Bias", - "score": 0.2658629278217009 - }, - "QuAC - Representation (gender)": { - "description": "min=0.211, mean=0.23, max=0.241, sum=0.69 (3)", - "tab": "Bias", - "score": 0.2300432286449244 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.004, max=0.005, sum=0.011 (3)", - "tab": "Toxicity", - "score": 0.0036666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.663, - "details": { - "description": "min=0.663, mean=0.663, max=0.663, sum=0.663 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.233 (1)", - "tab": "Calibration", - "score": 0.2332919292558098 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.619, mean=0.619, max=0.619, sum=0.619 (1)", - "tab": "Robustness", - "score": 0.619 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.486 (1)", - "tab": "Fairness", - "score": 0.486 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.03, mean=0.03, max=0.03, sum=0.03 (1)", - "tab": "Efficiency", - "score": 0.030294155851006508 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514, - "details": { - "description": "min=0.514, mean=0.514, max=0.514, sum=0.514 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.235 (1)", - "tab": "Calibration", - "score": 0.2353362549897216 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.398 (1)", - "tab": "Robustness", - "score": 0.398 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.416 (1)", - "tab": "Fairness", - "score": 0.416 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.019, mean=0.019, max=0.019, sum=0.019 (1)", - "tab": "Efficiency", - "score": 0.019339164675618026 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.199, - "details": { - "description": "min=0.187, mean=0.199, max=0.213, sum=0.797 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.056, mean=0.078, max=0.103, sum=0.311 (4)", - "tab": "Calibration", - "score": 0.07772735423117484 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.157, mean=0.181, max=0.209, sum=0.725 (4)", - "tab": "Robustness", - "score": 0.1811926605504587 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.156, mean=0.18, max=0.209, sum=0.72 (4)", - "tab": "Fairness", - "score": 0.18004587155963303 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.018, mean=0.044, max=0.053, sum=0.175 (4)", - "tab": "Efficiency", - "score": 0.043782452828866295 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": 
"General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=85.121, mean=404.621, max=529.121, sum=1618.483 (4)", - "tab": "General information", - "score": 404.62079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.345, - "details": { - "description": "min=0.315, mean=0.345, max=0.362, sum=1.035 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.094, mean=0.116, max=0.131, sum=0.349 (3)", - "tab": "Robustness", - "score": 0.11636587301587299 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.29, mean=0.319, max=0.336, sum=0.957 (3)", - "tab": "Robustness", - "score": 0.3190834142643501 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.106, mean=0.129, max=0.144, sum=0.387 (3)", - "tab": "Fairness", - "score": 0.12886375661375657 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.303, mean=0.332, max=0.348, sum=0.997 (3)", - "tab": "Fairness", - "score": 0.3321982457704417 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.081, mean=0.084, max=0.088, sum=0.252 (3)", - "tab": "Efficiency", - "score": 0.08407480907713127 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.078, mean=0.081, max=0.083, sum=0.242 (3)", - "tab": "Efficiency", - "score": 0.08053553836682271 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt 
tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.131, - "details": { - "description": "min=0.127, mean=0.131, max=0.135, sum=0.787 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.997, mean=2.076, max=2.172, sum=12.455 (6)", - "tab": "Efficiency", - "score": 2.0758840914959578 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=76.916, mean=83.931, max=91.68, sum=503.584 (6)", - "tab": "General information", - "score": 83.93061516452074 - }, - "CNN/DailyMail - # trials": { - "description": 
"min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.602, mean=0.63, max=0.655, sum=3.78 (6)", - "tab": "Bias", - "score": 0.6299677400199846 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.398, mean=0.402, max=0.41, sum=2.415 (6)", - "tab": "Bias", - "score": 0.40247728320483095 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.23, mean=0.293, max=0.359, sum=1.759 (6)", - "tab": "Bias", - "score": 0.2931668421996429 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.131, mean=0.146, max=0.169, sum=0.875 (6)", - "tab": "Bias", - "score": 0.14576217898261626 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.013 (6)", - "tab": "Toxicity", - "score": 0.002145922746781116 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.172, mean=0.208, max=0.236, sum=0.623 (3)", - "tab": "Summarization metrics", - "score": 0.20780144742590156 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.648, mean=4.704, max=4.739, sum=28.226 (6)", - "tab": "Summarization metrics", - "score": 4.704313539792442 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.241, mean=0.247, max=0.25, sum=0.74 (3)", - "tab": "Summarization metrics", - "score": 0.2466254745716148 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.902, mean=0.948, max=0.97, sum=5.685 (6)", - "tab": "Summarization metrics", - "score": 0.9475541325972495 - }, - "CNN/DailyMail - Density": { - "description": "min=41.364, mean=48.284, max=57.69, sum=289.703 (6)", - "tab": "Summarization metrics", - "score": 48.283839374824815 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.117, mean=9.864, max=11.439, sum=59.186 (6)", - "tab": "Summarization metrics", - "score": 9.864391531990323 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.096, - "details": { - "description": "min=0.093, mean=0.096, max=0.097, sum=0.573 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.73, mean=0.742, max=0.758, sum=4.455 (6)", - "tab": "Efficiency", - "score": 0.7424737962465443 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=24.919, mean=25.529, max=26.187, sum=153.174 (6)", - "tab": "General information", - "score": 25.52895752895753 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.393, mean=0.435, max=0.466, sum=2.612 (6)", - "tab": "Bias", - "score": 0.43535525321239604 - }, - "XSUM - Representation (race)": { - "description": "min=0.467, mean=0.513, max=0.565, sum=3.08 (6)", - "tab": "Bias", - "score": 0.5133548156104547 - }, - "XSUM - Representation (gender)": { - "description": "min=0.141, mean=0.165, max=0.179, sum=0.988 (6)", - "tab": "Bias", - "score": 0.1646512031093765 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.229, mean=-0.198, max=-0.176, sum=-0.593 (3)", - "tab": "Summarization metrics", - "score": -0.1976111372976741 - }, - "XSUM - QAFactEval": { - "description": "min=3.59, mean=3.813, max=4.142, sum=22.877 (6)", - "tab": "Summarization metrics", - "score": 3.8128682530109397 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.379, mean=0.381, max=0.384, sum=1.142 (3)", - "tab": "Summarization metrics", - "score": 0.3808147712365148 - }, - "XSUM - Coverage": { - "description": "min=0.824, mean=0.829, max=0.831, sum=4.972 (6)", - "tab": "Summarization metrics", - "score": 0.8286466360730634 - }, - "XSUM - Density": { - "description": "min=3.796, mean=4.043, max=4.434, sum=24.256 (6)", - "tab": "Summarization metrics", - "score": 4.042629935538992 - }, - "XSUM - Compression": { - "description": "min=17.57, mean=17.942, max=18.398, sum=107.65 (6)", - "tab": "Summarization metrics", - "score": 17.941696288315352 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.939, - "details": { - "description": "min=0.932, mean=0.939, max=0.946, sum=2.816 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.285, mean=0.295, max=0.311, sum=0.884 (3)", - "tab": "Calibration", - "score": 0.2945110955018834 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.895, 
mean=0.903, max=0.908, sum=2.709 (3)", - "tab": "Robustness", - "score": 0.903 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.92, mean=0.927, max=0.932, sum=2.782 (3)", - "tab": "Fairness", - "score": 0.9273333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.63, mean=0.701, max=0.761, sum=2.104 (3)", - "tab": "Efficiency", - "score": 0.7011672212481499 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.846, mean=4.933, max=4.986, sum=14.798 (3)", - "tab": "General information", - "score": 4.932666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1152.694, mean=1389.454, max=1744.631, sum=4168.363 (3)", - "tab": "General information", - "score": 1389.4543333333331 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching run, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching run, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching run, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching run, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching run, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.002, mean=0.52, max=1, sum=28.06 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.075, mean=0.409, max=0.626, sum=22.076 (54)", - "tab": "Calibration", - "score": 0.40880926893677766 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.418, max=1, sum=22.597 (54)", - "tab": "Robustness", - "score": 0.4184575354873046 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.488, max=1, sum=26.356 (54)", - "tab": "Fairness", - "score": 0.4880679688031825 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.154, mean=0.307, max=0.494, sum=16.591 (54)", - "tab": "Efficiency", - "score": 0.30723795570455475 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "min=0.333, mean=0.5, max=0.667, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "CivilComments - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (54)", - "tab": "Toxicity", - "score": 0.000027763895829862844 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.619, - "details": { - "description": "min=0.275, mean=0.619, max=0.975, sum=20.425 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.116, mean=0.389, max=0.975, sum=12.832 (33)", - "tab": "Calibration", - "score": 0.3888407166022056 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.1, mean=0.53, max=0.975, sum=17.5 (33)", - "tab": "Robustness", - "score": 0.5303030303030303 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.275, mean=0.594, max=0.975, sum=19.6 (33)", - "tab": "Fairness", - "score": 0.593939393939394 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.107, mean=0.628, max=1.382, sum=20.733 (33)", - "tab": "Efficiency", - "score": 0.6282604447639349 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=5, mean=14.276, max=30, sum=471.1 (33)", - "tab": "General information", - "score": 14.275757575757577 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - 
}, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json b/data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json deleted file mode 100644 index 0c00ea05c..000000000 --- a/data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_GPT-NeoX-20B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-NeoX 20B", - "id": "openai/GPT-NeoX-20B", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.351, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.4215761012322838 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.3361523348731358 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.3311530516202374 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5141337719298246 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.46836548983528487 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.36547434047434046 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.4456349206349206 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276, - "details": { - "description": "min=0.21, mean=0.276, max=0.351, sum=4.146 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.094, mean=0.122, max=0.145, sum=1.831 (15)", - "tab": "Calibration", - "score": 0.12205035764205192 - }, - "MMLU - EM (Robustness)": { - "description": 
"min=0.149, mean=0.189, max=0.24, sum=2.833 (15)", - "tab": "Robustness", - "score": 0.1888421052631579 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.175, mean=0.215, max=0.26, sum=3.228 (15)", - "tab": "Fairness", - "score": 0.21518128654970764 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.093, mean=0.133, max=0.275, sum=1.995 (15)", - "tab": "Efficiency", - "score": 0.1330090104470642 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=7019.035 (15)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.659, mean=0.683, max=0.714, sum=2.048 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.168, mean=0.195, max=0.238, sum=0.585 (3)", - "tab": "Calibration", - "score": 0.19500535688345313 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.548, mean=0.551, max=0.556, sum=1.653 (3)", - "tab": "Robustness", - "score": 0.551 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.594, mean=0.609, max=0.629, sum=1.827 (3)", - "tab": "Fairness", - "score": 0.609 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.515, mean=0.773, max=1.206, sum=2.318 (3)", - "tab": "Efficiency", - "score": 0.772616056262233 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=656.897, mean=913.897, max=1251.897, sum=2741.691 (3)", - "tab": "General information", - "score": 913.8969999999999 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { 
- "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599, - "details": { - "description": "min=0.558, mean=0.599, max=0.623, sum=1.797 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.2, mean=0.224, max=0.244, sum=0.672 (3)", - "tab": "Calibration", - "score": 0.2239646545151891 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.378, mean=0.421, max=0.443, sum=1.263 (3)", - "tab": "Robustness", - "score": 0.4211068794456416 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.419, mean=0.461, max=0.485, sum=1.382 (3)", - "tab": "Fairness", - "score": 0.46066534756418576 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.904, mean=1.468, max=1.998, sum=4.404 (3)", - "tab": "Efficiency", - "score": 1.4680144681286658 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.989, mean=1.568, max=1.969, sum=4.704 (3)", - "tab": "General information", - "score": 1.568075117370892 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1607.893, mean=1641.033, max=1691.082, sum=4923.099 (3)", - "tab": "General information", - "score": 1641.0328638497651 - }, - "NarrativeQA - # output tokens": { - "description": "min=24.282, mean=40.047, max=54.028, sum=120.141 (3)", - "tab": "General information", - "score": 40.04694835680751 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.396, mean=0.449, max=0.5, sum=1.346 (3)", - "tab": "Bias", - "score": 0.44861111111111107 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.159, mean=0.186, max=0.206, sum=0.557 (3)", - "tab": "Bias", - "score": 0.18579713036394171 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.022, max=0.025, sum=0.065 (3)", - "tab": "Toxicity", - "score": 0.0215962441314554 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.581, mean=0.596, max=0.608, sum=1.788 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.099, mean=0.103, max=0.106, sum=0.309 (3)", - "tab": "Calibration", - "score": 0.10315653555419742 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.371, mean=0.373, max=0.375, sum=1.118 (3)", - "tab": "Calibration", - "score": 0.37278118995003706 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.125, mean=0.133, max=0.14, sum=0.398 (3)", - "tab": "Robustness", - "score": 0.1325934362402064 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.429, mean=0.452, max=0.48, sum=1.357 (3)", - "tab": "Robustness", - "score": 0.4524359199313521 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.147, mean=0.154, max=0.158, sum=0.461 (3)", - "tab": "Fairness", - "score": 0.15381312093617092 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.509, mean=0.525, max=0.537, sum=1.574 (3)", - "tab": "Fairness", - "score": 0.524698076718683 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.381, mean=0.482, max=0.655, sum=1.447 (3)", - "tab": "Efficiency", - "score": 0.4823250982166127 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=1.913, mean=2.137, max=2.288, sum=6.411 (3)", - "tab": "Efficiency", - "score": 2.1369374864319965 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.299, mean=112.966, max=117.299, sum=338.897 (3)", - "tab": "General information", - "score": 112.96566666666668 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=77.379, mean=90.195, max=107.541, sum=270.584 (3)", - "tab": "General information", - "score": 90.19466666666666 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.685, mean=4.704, max=4.723, sum=14.112 (3)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.111 (3)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1247.862, mean=1394.229, max=1495.552, 
sum=4182.688 (3)", - "tab": "General information", - "score": 1394.2293333333334 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=73.671, mean=87.693, max=98.984, sum=263.078 (3)", - "tab": "General information", - "score": 87.69266666666665 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.309, mean=0.362, max=0.444, sum=1.086 (3)", - "tab": "Bias", - "score": 0.3621399176954732 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.233, mean=0.318, max=0.382, sum=0.954 (3)", - "tab": "Bias", - "score": 0.31784137078254726 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.538, mean=0.57, max=0.59, sum=1.709 (3)", - "tab": "Bias", - "score": 0.5695499220251695 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0, mean=0.094, max=0.241, sum=0.283 (3)", - "tab": "Bias", - "score": 0.09428104575163399 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326, - "details": { - "description": "min=0.32, mean=0.326, max=0.335, sum=0.979 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.105, mean=0.115, max=0.129, sum=0.345 (3)", - "tab": "Calibration", - "score": 0.11494333135422596 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.176, mean=0.191, max=0.202, sum=0.574 (3)", - "tab": "Robustness", - "score": 0.19141062427574787 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.224, mean=0.232, max=0.243, sum=0.695 (3)", - "tab": "Fairness", - "score": 0.23177797124335245 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.906, mean=2.025, max=2.127, sum=6.075 (3)", - "tab": "Efficiency", - "score": 2.024874148220674 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.804, mean=0.889, max=0.979, sum=2.666 
(3)", - "tab": "General information", - "score": 0.8886666666666666 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.063 (3)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1602.026, mean=1640.361, max=1663.349, sum=4921.083 (3)", - "tab": "General information", - "score": 1640.3609999999999 - }, - "QuAC - # output tokens": { - "description": "min=73.99, mean=77.489, max=80.665, sum=232.466 (3)", - "tab": "General information", - "score": 77.48866666666667 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.606, mean=0.626, max=0.639, sum=1.877 (3)", - "tab": "Bias", - "score": 0.6257674787086551 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.436, mean=0.448, max=0.455, sum=1.344 (3)", - "tab": "Bias", - "score": 0.4481503328194676 - }, - "QuAC - Representation (race)": { - "description": "min=0.319, mean=0.334, max=0.354, sum=1.003 (3)", - "tab": "Bias", - "score": 0.3344046827039365 - }, - "QuAC - Representation (gender)": { - "description": "min=0.258, mean=0.268, max=0.282, sum=0.804 (3)", - "tab": "Bias", - "score": 0.26793463346025864 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=0.718 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.277 (1)", - "tab": "Calibration", - "score": 0.2773372160584027 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.661, mean=0.661, max=0.661, sum=0.661 (1)", - "tab": "Robustness", - "score": 0.661 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.552, mean=0.552, max=0.552, sum=0.552 (1)", - "tab": "Fairness", - "score": 0.552 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.025, mean=0.025, max=0.025, sum=0.025 (1)", - "tab": "Efficiency", - "score": 0.025470768198370932 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.806, mean=88.806, max=88.806, sum=88.806 (1)", - "tab": "General information", - "score": 88.806 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - 
}, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "description": "min=0.524, mean=0.524, max=0.524, sum=0.524 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.232, mean=0.232, max=0.232, sum=0.232 (1)", - "tab": "Calibration", - "score": 0.23249621701719156 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.414 (1)", - "tab": "Robustness", - "score": 0.414 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)", - "tab": "Fairness", - "score": 0.438 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.024, mean=0.024, max=0.024, sum=0.024 (1)", - "tab": "Efficiency", - "score": 0.023963596328905958 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.346, mean=5.346, max=5.346, sum=5.346 (1)", - "tab": "General information", - "score": 5.346 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.216, - "details": { - "description": "min=0.205, mean=0.216, max=0.225, sum=0.864 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.051, mean=0.058, max=0.068, sum=0.232 (4)", - "tab": "Calibration", - "score": 0.057891800582365614 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.144, mean=0.175, max=0.225, sum=0.7 (4)", - "tab": "Robustness", - "score": 0.17507645259938837 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.161, mean=0.179, max=0.225, sum=0.714 (4)", - "tab": "Fairness", - "score": 0.17851681957186544 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.029, mean=0.084, max=0.133, sum=0.335 (4)", - "tab": "Efficiency", - "score": 0.08375055263898766 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - 
"description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=86.352, mean=406.102, max=532.352, sum=1624.407 (4)", - "tab": "General information", - "score": 406.10168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398, - "details": { - "description": "min=0.37, mean=0.398, max=0.436, sum=1.195 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.082, mean=0.096, max=0.107, sum=0.288 (3)", - "tab": "Robustness", - "score": 0.09600105820105831 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.338, mean=0.351, max=0.365, sum=1.053 (3)", - "tab": "Robustness", - "score": 0.3510422646487042 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.137, mean=0.148, max=0.163, sum=0.445 (3)", - "tab": "Fairness", - "score": 0.1483276455026454 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.347, mean=0.381, max=0.416, sum=1.144 (3)", - "tab": "Fairness", - "score": 0.38125183165300675 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.111, mean=0.118, max=0.128, sum=0.355 (3)", - "tab": "Efficiency", - "score": 0.11821914517316674 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.105, mean=0.116, max=0.127, sum=0.349 (3)", - "tab": "Efficiency", - "score": 0.11621723726407733 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=499.575, mean=537.908, max=583.575, sum=1613.725 (3)", - "tab": "General information", - "score": 537.9083333333334 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General 
information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=481.14, mean=519.473, max=565.14, sum=1558.419 (3)", - "tab": "General information", - "score": 519.4728682170543 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.123, - "details": { - "description": "min=0.108, mean=0.123, max=0.138, sum=0.738 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=2.104, mean=2.133, max=2.168, sum=12.798 (6)", - "tab": "Efficiency", - "score": 2.133056901521097 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1561.275, mean=1582.608, max=1612.275, sum=9495.648 (6)", - "tab": "General information", - "score": 1582.6080114449214 - }, - "CNN/DailyMail - # 
output tokens": { - "description": "min=80.197, mean=80.409, max=80.588, sum=482.455 (6)", - "tab": "General information", - "score": 80.40915593705294 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.612, mean=0.616, max=0.62, sum=3.697 (6)", - "tab": "Bias", - "score": 0.6162431158667614 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.386, mean=0.41, max=0.431, sum=2.46 (6)", - "tab": "Bias", - "score": 0.4099353286102709 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.182, mean=0.289, max=0.35, sum=1.732 (6)", - "tab": "Bias", - "score": 0.288716873622534 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.127, mean=0.149, max=0.168, sum=0.896 (6)", - "tab": "Bias", - "score": 0.14933277507884896 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.009, mean=0.165, max=0.255, sum=0.494 (3)", - "tab": "Summarization metrics", - "score": 0.16465107490254738 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.591, mean=4.69, max=4.763, sum=28.138 (6)", - "tab": "Summarization metrics", - "score": 4.689614935266213 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.175, mean=0.226, max=0.262, sum=0.677 (3)", - "tab": "Summarization metrics", - "score": 0.2255769362361307 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.786, mean=0.91, max=0.973, sum=5.46 (6)", - "tab": "Summarization metrics", - "score": 0.910005755446767 - }, - "CNN/DailyMail - Density": { - "description": "min=35.834, mean=37.149, max=38.818, sum=222.893 (6)", - "tab": "Summarization metrics", - "score": 37.14890205441478 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.164, mean=9.676, max=9.978, sum=58.057 (6)", - "tab": "Summarization metrics", - "score": 9.676104726319009 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102, - "details": { - "description": "min=0.098, mean=0.102, max=0.105, sum=0.61 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.104, mean=1.116, max=1.135, sum=6.698 (6)", - "tab": "Efficiency", - "score": 1.1163698516910754 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - 
"description": "min=4.996, mean=4.997, max=5, sum=29.985 (6)", - "tab": "General information", - "score": 4.997425997425997 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1487.131, mean=1545.148, max=1574.17, sum=9270.888 (6)", - "tab": "General information", - "score": 1545.148005148005 - }, - "XSUM - # output tokens": { - "description": "min=24.871, mean=25.402, max=26.143, sum=152.413 (6)", - "tab": "General information", - "score": 25.402187902187904 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.449, mean=0.449, max=0.449, sum=2.694 (6)", - "tab": "Bias", - "score": 0.4490600226000671 - }, - "XSUM - Representation (race)": { - "description": "min=0.483, mean=0.526, max=0.565, sum=3.158 (6)", - "tab": "Bias", - "score": 0.5263835263835264 - }, - "XSUM - Representation (gender)": { - "description": "min=0.132, mean=0.162, max=0.184, sum=0.972 (6)", - "tab": "Bias", - "score": 0.16191706040214252 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.22, mean=-0.208, max=-0.2, sum=-0.625 (3)", - "tab": "Summarization metrics", - "score": -0.2082928215061222 - }, - "XSUM - QAFactEval": { - "description": "min=3.048, mean=3.303, max=3.621, sum=19.818 (6)", - "tab": "Summarization metrics", - "score": 3.302964744932122 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.385, mean=0.391, max=0.395, sum=1.174 (3)", - "tab": "Summarization metrics", - "score": 0.39129907447599627 - }, - "XSUM - Coverage": { - "description": "min=0.822, mean=0.825, max=0.83, sum=4.948 (6)", - "tab": "Summarization metrics", - "score": 0.8247285888112758 - }, - "XSUM - Density": { - "description": "min=3.228, mean=3.371, max=3.613, sum=20.226 (6)", - "tab": "Summarization metrics", - "score": 3.3710531876366 - }, - "XSUM - Compression": { - "description": "min=17.631, mean=18.238, max=18.621, sum=109.428 (6)", - "tab": "Summarization metrics", - "score": 18.23798025069092 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.948, - "details": { - "description": "min=0.946, mean=0.948, max=0.95, sum=2.844 (3)", - "tab": "Accuracy", - "IMDB - ECE 
(10-bin)": { - "description": "min=0.189, mean=0.23, max=0.269, sum=0.69 (3)", - "tab": "Calibration", - "score": 0.22988586030197733 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.906, mean=0.912, max=0.921, sum=2.736 (3)", - "tab": "Robustness", - "score": 0.9119999999999999 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.925, mean=0.928, max=0.933, sum=2.785 (3)", - "tab": "Fairness", - "score": 0.9283333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.748, mean=0.862, max=1.078, sum=2.586 (3)", - "tab": "Efficiency", - "score": 0.862092325799332 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.842, mean=4.93, max=4.981, sum=14.789 (3)", - "tab": "General information", - "score": 4.929666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1162.003, mean=1398.09, max=1750.717, sum=4194.271 (3)", - "tab": "General information", - "score": 1398.0903333333333 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.516, - "details": { - "description": "min=0, mean=0.516, max=1, sum=27.878 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.26, mean=0.444, max=0.593, sum=23.994 (54)", - "tab": "Calibration", - "score": 0.4443373993811643 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.48, max=1, sum=25.9 (54)", - "tab": "Robustness", - "score": 0.4796354739742704 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.491, max=1, sum=26.497 (54)", - "tab": "Fairness", - "score": 0.4906931444587031 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.253, mean=0.408, max=0.906, sum=22.04 (54)", - "tab": "Efficiency", - "score": 0.4081493504712871 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General 
information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=726.728, max=1282.4, sum=39243.315 (54)", - "tab": "General information", - "score": 726.7280588093369 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.505, - "details": { - "description": "min=0.025, mean=0.505, max=0.975, sum=16.65 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.067, mean=0.324, max=0.975, sum=10.705 (33)", - "tab": "Calibration", - "score": 0.3243919141625793 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.399, max=0.975, sum=13.175 (33)", - "tab": "Robustness", - "score": 0.39924242424242423 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.025, mean=0.475, max=0.975, sum=15.675 (33)", - "tab": "Fairness", - "score": 0.47500000000000003 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.16, mean=1.156, max=2.589, sum=38.155 (33)", - "tab": "Efficiency", - "score": 1.1562087950381366 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.56, max=5, sum=150.475 (33)", - "tab": "General information", - "score": 4.5598484848484855 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=269.35, mean=807.97, max=1764, sum=26663.0 (33)", - "tab": "General information", - "score": 807.9696969696969 - }, - "RAFT - # output tokens": { - "description": "min=5, mean=13.945, max=30, sum=460.2 (33)", - "tab": "General information", - "score": 13.945454545454545 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": 
null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json b/data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json deleted file mode 100644 index 5355ce78b..000000000 --- a/data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_ada-350M/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ada 350M", - "id": "openai/ada-350M", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6164902182478501 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.10196623917424807 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.10483119031506129 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7698300438596491 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4272126112641924 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.30052416719083386 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.23114035087719298 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.243, - "details": { - "description": "min=0.132, mean=0.243, max=0.32, sum=3.641 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.049, mean=0.128, max=0.186, sum=1.923 (15)", - "tab": "Calibration", 
- "score": 0.1282115692539908 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.105, mean=0.204, max=0.28, sum=3.054 (15)", - "tab": "Robustness", - "score": 0.20357894736842103 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.053, mean=0.21, max=0.31, sum=3.155 (15)", - "tab": "Fairness", - "score": 0.2103157894736842 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.14, mean=0.14, max=0.141, sum=2.103 (15)", - "tab": "Efficiency", - "score": 0.1402282775493421 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581, - "details": { - "description": "min=0.525, mean=0.581, max=0.627, sum=1.743 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.049, mean=0.067, max=0.09, sum=0.2 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.06655133808072823 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.349, mean=0.461, max=0.549, sum=1.383 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.461 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.421, mean=0.507, max=0.575, sum=1.52 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.5066666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.14, mean=0.141, max=0.141, sum=0.422 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.14052770182291666 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1.004, max=1.008, sum=3.012 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.004 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326, - "details": { - "description": "min=0.311, mean=0.326, max=0.35, sum=0.978 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.034, mean=0.046, max=0.064, sum=0.138 (3)", - "tab": "Calibration", - "score": 0.04605131521940172 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.094, mean=0.104, max=0.11, sum=0.312 (3)", - "tab": "Robustness", - "score": 0.10413260236022294 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.191, mean=0.205, max=0.221, sum=0.616 (3)", - "tab": "Fairness", - "score": 0.20535614023925777 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.203, mean=0.211, max=0.224, sum=0.632 (3)", - "tab": "Efficiency", - "score": 0.21074192341549294 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=11.13, mean=12.381, max=14.623, sum=37.144 (3)", - "tab": "General information", - "score": 12.381220657276996 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.415, mean=0.444, max=0.464, sum=1.333 (3)", - "tab": "Bias", - "score": 0.44422611988401467 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.074, mean=0.132, max=0.198, sum=0.397 (3)", - "tab": "Bias", - "score": 0.13244266197852694 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.025, mean=0.03, max=0.037, sum=0.09 (3)", - "tab": "Toxicity", - "score": 0.030046948356807508 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on 
NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365, - "details": { - "description": "min=0.35, mean=0.365, max=0.379, sum=1.095 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.024, mean=0.028, max=0.034, sum=0.083 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.02767630939495112 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.168, mean=0.18, max=0.188, sum=0.539 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.17953919898525875 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.029, mean=0.031, max=0.033, sum=0.092 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.030523107267064337 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.042, mean=0.043, max=0.044, sum=0.129 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.04293332221345858 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.054, mean=0.057, max=0.061, sum=0.171 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.057147528877813734 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.269, mean=0.273, max=0.278, sum=0.82 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.2734675120722885 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.162, mean=0.167, max=0.171, sum=0.5 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.16660095312500048 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.259, mean=0.271, max=0.277, sum=0.812 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.27051720963541687 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.865, mean=5.656, max=6.378, sum=16.969 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.656333333333333 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=20.643, mean=22.436, max=23.53, sum=67.308 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 22.436000000000003 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.067, mean=0.284, max=0.429, sum=0.852 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.2838533114395183 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.167, mean=0.281, max=0.404, sum=0.843 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.2809020267563887 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.487, mean=0.496, max=0.5, sum=1.487 (3)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4955194805194805 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.401, mean=0.466, max=0.574, sum=1.399 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.46622237638437936 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.308, mean=0.333, max=0.361, sum=0.998 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.33253136409012896 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0006666666666666666 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.004, sum=0.007 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0023333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.242, - "details": { - "description": "min=0.226, mean=0.242, max=0.267, sum=0.725 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.022, mean=0.039, max=0.059, sum=0.118 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.039442503431989094 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.082, mean=0.092, max=0.098, sum=0.275 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.09165527832991893 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.15, mean=0.166, max=0.187, sum=0.497 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.16579958101328882 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.253, mean=0.27, max=0.28, sum=0.811 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.2701784687500001 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=19.431, mean=22.281, max=23.851, sum=66.844 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 22.281333333333333 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.437, mean=0.452, max=0.465, sum=1.355 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4515937058073862 - }, - "QuAC - Representation (race)": { - "description": "min=0.269, mean=0.341, max=0.377, sum=1.022 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.3407089337701805 - }, - "QuAC - Representation (gender)": { - "description": "min=0.195, mean=0.209, max=0.237, sum=0.627 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.2091296383711505 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.008 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0026666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435, - "details": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.435 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.057, mean=0.057, max=0.057, sum=0.057 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.057406609088416535 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.37 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.37 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.294 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.294 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.138, mean=0.138, max=0.138, sum=0.138 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.13805987500000028 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.38 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.346 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.3457887658657961 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.27 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.27 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.318 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.318 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.136, mean=0.136, max=0.136, sum=0.136 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.13612351562500047 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.215, - "details": { - "description": "min=0.206, mean=0.215, max=0.222, sum=0.645 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.06, mean=0.071, max=0.086, sum=0.213 (3)", - "tab": "Calibration", - "score": 0.07105251349575469 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.154, mean=0.167, max=0.179, sum=0.502 (3)", - "tab": "Robustness", - "score": 0.1671763506625892 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.18, mean=0.185, max=0.187, sum=0.554 (3)", - "tab": "Fairness", - "score": 0.18450560652395517 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.14, mean=0.141, max=0.141, sum=0.422 (3)", - "tab": "Efficiency", - "score": 0.14062155366016812 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.184, mean=0.29, max=0.427, sum=0.871 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.039, mean=0.072, max=0.111, sum=0.215 (3)", - "tab": "Robustness", - "score": 0.07152063492063503 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.148, mean=0.247, max=0.358, sum=0.741 (3)", - "tab": "Robustness", - "score": 0.24715427563243078 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.051, mean=0.086, max=0.134, sum=0.258 (3)", - "tab": "Fairness", - "score": 0.08609259259259262 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.17, mean=0.268, max=0.399, sum=0.804 (3)", - "tab": 
"Fairness", - "score": 0.267882893215826 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.14, mean=0.142, max=0.143, sum=0.425 (3)", - "tab": "Efficiency", - "score": 0.14154662890625005 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.14, mean=0.142, max=0.142, sum=0.425 (3)", - "tab": "Efficiency", - "score": 0.14153152252906978 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.059, mean=1.219, max=1.379, sum=3.656 (3)", - "tab": "General information", - "score": 1.2186666666666666 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1.093, mean=1.171, max=1.209, sum=3.512 (3)", - "tab": "General information", - "score": 1.1705426356589146 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": 
"Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.09, - "details": { - "description": "min=0.046, mean=0.09, max=0.116, sum=0.541 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=0.494, mean=0.598, max=0.669, sum=3.587 (6)", - "tab": "Efficiency", - "score": 0.5978011528746431 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=59.695, mean=76.958, max=88.815, sum=461.747 (6)", - "tab": "General information", - "score": 76.95779685264664 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.598, mean=0.628, max=0.667, sum=3.769 (6)", - "tab": "Bias", - "score": 0.6280987623495909 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.361, mean=0.403, max=0.447, sum=2.416 (6)", - "tab": "Bias", - "score": 0.4025937932369326 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.275, mean=0.297, max=0.329, sum=1.782 (6)", - "tab": "Bias", - "score": 0.2969968830498775 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.109, mean=0.134, max=0.15, sum=0.804 (6)", - "tab": "Bias", - "score": 0.13397007527013516 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.007, mean=0.169, max=0.28, sum=0.506 (3)", - "tab": "Summarization metrics", - "score": 0.1685268875223913 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=3.028, mean=3.742, max=4.119, sum=22.454 (6)", - "tab": "Summarization metrics", - "score": 3.742251717543341 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=-0.233, mean=0.026, max=0.191, sum=0.079 (3)", - "tab": "Summarization metrics", - "score": 0.02646359689379031 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.551, mean=0.773, max=0.886, sum=4.64 (6)", - "tab": "Summarization metrics", - "score": 0.7733298424406031 - }, - "CNN/DailyMail - Density": { - "description": "min=18.265, mean=36.596, max=52.461, sum=219.577 (6)", - "tab": "Summarization metrics", - 
"score": 36.59619529550019 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.827, mean=12.07, max=15.425, sum=72.42 (6)", - "tab": "Summarization metrics", - "score": 12.070019676025145 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.022, - "details": { - "description": "min=0.012, mean=0.022, max=0.034, sum=0.134 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.194, mean=0.237, max=0.271, sum=1.423 (6)", - "tab": "Efficiency", - "score": 0.23717034165862286 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=9.643, mean=16.878, max=22.542, sum=101.27 (6)", - "tab": "General information", - "score": 16.878378378378375 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.383, mean=0.412, max=0.438, sum=2.474 (6)", - "tab": "Bias", - "score": 0.4122685185185186 - }, - "XSUM - Representation (race)": { - "description": "min=0.467, mean=0.558, max=0.667, sum=3.35 (6)", - "tab": "Bias", - "score": 0.5583333333333335 - }, - "XSUM - Representation (gender)": { - "description": "min=0.158, mean=0.222, max=0.264, sum=1.335 (6)", - "tab": "Bias", - "score": 0.22244262246907046 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.151, mean=-0.115, max=-0.086, sum=-0.345 (3)", - "tab": "Summarization metrics", - "score": -0.11515867019712234 - }, - "XSUM - QAFactEval": { - "description": "min=0, mean=0.009, max=0.028, sum=0.056 (6)", - "tab": "Summarization metrics", - "score": 0.009336465575789038 - }, - "XSUM - BERTScore (F1)": { - "description": "min=-0.509, mean=-0.232, max=-0.002, sum=-0.695 (3)", - "tab": "Summarization 
metrics", - "score": -0.23174258205917408 - }, - "XSUM - Coverage": { - "description": "min=0.208, mean=0.407, max=0.566, sum=2.442 (6)", - "tab": "Summarization metrics", - "score": 0.40704982952261465 - }, - "XSUM - Density": { - "description": "min=1.129, mean=2.653, max=3.54, sum=15.917 (6)", - "tab": "Summarization metrics", - "score": 2.652801659570502 - }, - "XSUM - Compression": { - "description": "min=4.395, mean=8.023, max=11.123, sum=48.138 (6)", - "tab": "Summarization metrics", - "score": 8.022940864769765 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.834, mean=0.849, max=0.861, sum=2.547 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.223, mean=0.274, max=0.332, sum=0.821 (3)", - "tab": "Calibration", - "score": 0.2737600797307666 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.663, mean=0.701, max=0.737, sum=2.102 (3)", - "tab": "Robustness", - "score": 0.7006666666666668 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.787, mean=0.806, max=0.819, sum=2.417 (3)", - "tab": "Fairness", - "score": 0.8056666666666666 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.141, mean=0.142, max=0.143, sum=0.426 (3)", - "tab": "Efficiency", - "score": 0.14206914127604175 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no 
matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517, - "details": { - "description": "min=0, mean=0.517, max=1, sum=27.9 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.096, mean=0.355, max=0.704, sum=19.19 (54)", - "tab": "Calibration", - "score": 0.35537087067123496 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.421, max=1, sum=22.752 (54)", - "tab": "Robustness", - "score": 0.42132444064350366 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.436, max=1, sum=23.537 (54)", - "tab": "Fairness", - "score": 0.435870046986927 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.14, mean=0.141, max=0.141, sum=7.587 (54)", - "tab": "Efficiency", - "score": 0.14050017531142125 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423, - "details": { - "description": "min=0, mean=0.423, max=0.975, sum=13.975 (33)", - "tab": "Accuracy", - 
"RAFT - ECE (10-bin)": { - "description": "min=0.066, mean=0.268, max=0.696, sum=8.86 (33)", - "tab": "Calibration", - "score": 0.2684712140450576 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.345, max=0.975, sum=11.375 (33)", - "tab": "Robustness", - "score": 0.3446969696969697 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.395, max=0.975, sum=13.05 (33)", - "tab": "Fairness", - "score": 0.3954545454545455 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.142, mean=0.154, max=0.17, sum=5.08 (33)", - "tab": "Efficiency", - "score": 0.15395451290246212 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=1.275, mean=3.125, max=5.85, sum=103.125 (33)", - "tab": "General information", - "score": 3.125 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json b/data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json deleted file mode 100644 index d3977fc36..000000000 --- a/data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_babbage-1.3B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "babbage 1.3B", - "id": "openai/babbage-1.3B", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114, - "details": { - "tab": "Accuracy", - "Mean win rate - 
Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5876917234841996 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.11687598645329457 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.13375380644568632 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.860531798245614 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.47969140134405086 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5128371628371629 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.19609440267335004 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.235, - "details": { - "description": "min=0.17, mean=0.235, max=0.35, sum=3.518 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.095, mean=0.14, max=0.179, sum=2.093 (15)", - "tab": "Calibration", - "score": 0.13954639548632583 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.09, mean=0.166, max=0.24, sum=2.489 (15)", - "tab": "Robustness", - "score": 0.165906432748538 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.14, mean=0.206, max=0.28, sum=3.085 (15)", - "tab": "Fairness", - "score": 0.20567251461988303 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.118, mean=0.119, max=0.12, sum=1.785 (15)", - "tab": "Efficiency", - "score": 0.11896953947368419 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.574, - "details": { - "description": "min=0.52, mean=0.574, 
max=0.623, sum=1.723 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.036, mean=0.068, max=0.089, sum=0.203 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.06758031979129187 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.432, mean=0.477, max=0.522, sum=1.431 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.47700000000000004 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.404, mean=0.436, max=0.457, sum=1.307 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.43566666666666665 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.119, mean=0.121, max=0.125, sum=0.364 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.12137238953993056 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.491, - "details": { - "description": "min=0.468, mean=0.491, max=0.525, sum=1.474 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.025, mean=0.027, max=0.03, sum=0.081 (3)", - "tab": "Calibration", - "score": 0.027162479976532598 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.232, mean=0.255, max=0.266, sum=0.764 (3)", - "tab": "Robustness", - "score": 0.2547490737014401 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.334, mean=0.367, max=0.396, sum=1.101 (3)", - "tab": "Fairness", - "score": 0.3669650821225828 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.164, mean=0.176, max=0.194, sum=0.529 (3)", - "tab": "Efficiency", - "score": 0.1762964825410799 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.659, mean=8.835, max=11.769, sum=26.504 (3)", - "tab": "General information", - "score": 8.83474178403756 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.404, mean=0.445, max=0.5, sum=1.335 (3)", - "tab": "Bias", - "score": 0.44511511879932936 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.142, 
mean=0.191, max=0.246, sum=0.574 (3)", - "tab": "Bias", - "score": 0.1912053369170701 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.016, max=0.023, sum=0.048 (3)", - "tab": "Toxicity", - "score": 0.01596244131455399 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451, - "details": { - "description": "min=0.435, mean=0.451, max=0.47, sum=1.354 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.012, mean=0.016, max=0.023, sum=0.048 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.01603851394023659 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.141, mean=0.147, max=0.153, sum=0.44 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.14681748032197228 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.063, mean=0.068, max=0.072, sum=0.205 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.06829400341950241 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.211, mean=0.212, max=0.214, sum=0.637 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.21249077319847984 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.079, mean=0.084, max=0.088, sum=0.252 (3)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.08399089853474369 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.365, mean=0.381, max=0.403, sum=1.144 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.381423207180998 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.15, mean=0.152, max=0.152, sum=0.455 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.15162744531249991 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.228, mean=0.232, max=0.235, sum=0.696 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.23211142730034728 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.994, mean=7.258, max=7.401, sum=21.773 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 7.257666666666666 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=18.158, mean=18.539, max=18.902, sum=55.617 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 18.539 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.578, mean=0.624, max=0.667, sum=1.871 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6236303630363037 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0, mean=0.015, max=0.038, sum=0.046 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.015466015466015476 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.445, mean=0.479, max=0.5, sum=1.436 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.47855712855712856 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.422, mean=0.441, max=0.46, sum=1.323 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.44113329919781535 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.257, mean=0.349, max=0.419, sum=1.046 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.34872771165606054 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.002 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0006666666666666666 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.273, - "details": { - "description": "min=0.263, mean=0.273, max=0.282, sum=0.818 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.03, mean=0.045, max=0.065, sum=0.136 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.04533749534838898 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.141, mean=0.149, max=0.156, sum=0.448 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.14927279809816305 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.198, mean=0.202, max=0.205, sum=0.607 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.20229238580626874 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.245, mean=0.261, max=0.27, sum=0.782 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.2607369557291667 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=20.236, mean=22.916, max=24.512, sum=68.749 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 22.91633333333333 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.643, mean=0.659, max=0.667, sum=1.976 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6587301587301589 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.434, mean=0.445, max=0.452, sum=1.336 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4452529926214137 - }, - "QuAC - Representation (race)": { - "description": "min=0.311, mean=0.339, max=0.382, sum=1.016 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.33878845629358273 - }, - "QuAC - Representation (gender)": { - "description": "min=0.251, mean=0.258, max=0.264, sum=0.775 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.25817229310554 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.002, sum=0.005 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555, - "details": { - "description": "min=0.555, mean=0.555, max=0.555, sum=0.555 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.144, mean=0.144, max=0.144, sum=0.144 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.14430034567571584 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.489 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.489 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.401 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.401 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.113, mean=0.113, max=0.113, sum=0.113 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.1134031874999998 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438, - "details": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.3000308921028506 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.314 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.314 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.326 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.326 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.111 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.11114410156249971 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.188, - "details": { - "description": "min=0.174, mean=0.188, max=0.196, sum=0.563 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.13, mean=0.142, max=0.164, sum=0.426 (3)", - "tab": "Calibration", - "score": 0.14198207765086143 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.141, mean=0.162, max=0.183, sum=0.486 (3)", - "tab": "Robustness", - "score": 0.1620795107033639 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.159, mean=0.178, max=0.19, sum=0.534 (3)", - "tab": "Fairness", - "score": 0.17787971457696228 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.119, mean=0.12, max=0.12, sum=0.359 (3)", - "tab": "Efficiency", - "score": 0.11970087223655701 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.317, - "details": { - "description": "min=0.291, mean=0.317, max=0.362, sum=0.95 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.055, mean=0.073, max=0.086, sum=0.219 (3)", - "tab": "Robustness", - "score": 0.07291031746031752 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.206, mean=0.246, max=0.285, sum=0.739 (3)", - "tab": "Robustness", - "score": 0.24641961891165112 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.082, mean=0.105, max=0.123, sum=0.316 (3)", - "tab": "Fairness", - "score": 0.10532936507936512 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.275, mean=0.301, max=0.346, sum=0.902 (3)", - "tab": 
"Fairness", - "score": 0.300592144197253 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.119, mean=0.122, max=0.126, sum=0.367 (3)", - "tab": "Efficiency", - "score": 0.12232188151041663 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.118, mean=0.122, max=0.128, sum=0.367 (3)", - "tab": "Efficiency", - "score": 0.12249798631298452 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.128, mean=1.537, max=2.075, sum=4.612 (3)", - "tab": "General information", - "score": 1.5373333333333334 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1.496, max=2.302, sum=4.488 (3)", - "tab": "General information", - "score": 1.4961240310077522 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": 
"Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.079, - "details": { - "description": "min=0.016, mean=0.079, max=0.147, sum=0.472 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=0.293, mean=0.533, max=0.795, sum=3.197 (6)", - "tab": "Efficiency", - "score": 0.5327935382950345 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=28.479, mean=68.44, max=112.258, sum=410.639 (6)", - "tab": "General information", - "score": 68.43991416309014 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.5, mean=0.568, max=0.611, sum=3.41 (6)", - "tab": "Bias", - "score": 0.5683358120009704 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.403, mean=0.418, max=0.435, sum=2.509 (6)", - "tab": "Bias", - "score": 0.4181282755076701 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.321, mean=0.327, max=0.333, sum=1.962 (6)", - "tab": "Bias", - "score": 0.32700197854837026 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.131, mean=0.146, max=0.165, sum=0.879 (6)", - "tab": "Bias", - "score": 0.14643429372740835 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.024, mean=0.194, max=0.404, sum=0.582 (3)", - "tab": "Summarization metrics", - "score": 0.19395910509097278 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=1.208, mean=3.207, max=4.672, sum=19.24 (6)", - "tab": "Summarization metrics", - "score": 3.206720080183251 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=-0.533, mean=-0.129, max=0.256, sum=-0.388 (3)", - "tab": "Summarization metrics", - "score": -0.12942978993545518 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.243, mean=0.606, max=0.942, sum=3.637 (6)", - "tab": "Summarization metrics", - "score": 0.6061106279492011 - }, - "CNN/DailyMail - Density": { - "description": "min=7.213, mean=43.534, max=84.961, sum=261.202 (6)", - "tab": "Summarization metrics", - "score": 
43.533595505945534 - }, - "CNN/DailyMail - Compression": { - "description": "min=5.569, mean=6.733, max=8.376, sum=40.398 (6)", - "tab": "Summarization metrics", - "score": 6.733051993966683 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.045, - "details": { - "description": "min=0.041, mean=0.045, max=0.054, sum=0.273 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.264, mean=0.272, max=0.286, sum=1.632 (6)", - "tab": "Efficiency", - "score": 0.27202574924254597 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=23.645, mean=25.051, max=27.259, sum=150.309 (6)", - "tab": "General information", - "score": 25.051480051480052 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.389, mean=0.42, max=0.46, sum=2.52 (6)", - "tab": "Bias", - "score": 0.42004149135109864 - }, - "XSUM - Representation (race)": { - "description": "min=0.417, mean=0.458, max=0.542, sum=2.75 (6)", - "tab": "Bias", - "score": 0.4583333333333333 - }, - "XSUM - Representation (gender)": { - "description": "min=0.105, mean=0.148, max=0.182, sum=0.89 (6)", - "tab": "Bias", - "score": 0.14837887499687488 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.221, mean=-0.188, max=-0.16, sum=-0.564 (3)", - "tab": "Summarization metrics", - "score": -0.18805348402642733 - }, - "XSUM - QAFactEval": { - "description": "min=0.003, mean=0.195, max=0.546, sum=1.171 (6)", - "tab": "Summarization metrics", - "score": 0.19517962440346606 - }, - "XSUM - BERTScore (F1)": { - "description": "min=-0.047, mean=0.02, max=0.139, sum=0.059 (3)", - "tab": "Summarization metrics", - 
"score": 0.01972435572139075 - }, - "XSUM - Coverage": { - "description": "min=0.538, mean=0.604, max=0.715, sum=3.622 (6)", - "tab": "Summarization metrics", - "score": 0.6037080043294082 - }, - "XSUM - Density": { - "description": "min=3.597, mean=4.386, max=5.935, sum=26.316 (6)", - "tab": "Summarization metrics", - "score": 4.385950410054523 - }, - "XSUM - Compression": { - "description": "min=10.355, mean=11.716, max=13.636, sum=70.293 (6)", - "tab": "Summarization metrics", - "score": 11.71557516895029 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.597, - "details": { - "description": "min=0.5, mean=0.597, max=0.646, sum=1.792 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.161, mean=0.212, max=0.289, sum=0.637 (3)", - "tab": "Calibration", - "score": 0.2122386190139247 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.476, mean=0.5, max=0.512, sum=1.5 (3)", - "tab": "Robustness", - "score": 0.5 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.489, mean=0.534, max=0.558, sum=1.602 (3)", - "tab": "Fairness", - "score": 0.534 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.125, mean=0.128, max=0.131, sum=0.385 (3)", - "tab": "Efficiency", - "score": 0.12819260763888898 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null 
- }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.005, mean=0.519, max=0.996, sum=28.025 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.063, mean=0.31, max=0.598, sum=16.723 (54)", - "tab": "Calibration", - "score": 0.30968147474692964 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.4, max=0.996, sum=21.618 (54)", - "tab": "Robustness", - "score": 0.40032672585199003 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.474, max=0.994, sum=25.57 (54)", - "tab": "Fairness", - "score": 0.4735149158411243 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.118, mean=0.12, max=0.125, sum=6.485 (54)", - "tab": "Efficiency", - "score": 0.12008918109610113 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455, - "details": { - "description": "min=0.025, mean=0.455, max=0.975, sum=15.025 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - 
"description": "min=0.1, mean=0.286, max=0.455, sum=9.428 (33)", - "tab": "Calibration", - "score": 0.28570502706051176 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.409, max=0.975, sum=13.5 (33)", - "tab": "Robustness", - "score": 0.40909090909090906 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.438, max=0.975, sum=14.45 (33)", - "tab": "Fairness", - "score": 0.43787878787878787 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.117, mean=0.137, max=0.182, sum=4.525 (33)", - "tab": "Efficiency", - "score": 0.13711408420138893 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=3.511, max=10.6, sum=115.85 (33)", - "tab": "General information", - "score": 3.5106060606060603 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "Bias", - "score": 0.0 - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json b/data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json deleted file mode 100644 index fe011ca06..000000000 --- a/data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_curie-6.7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "curie 6.7B", - "id": "openai/curie-6.7B", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.247, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - 
"tab": "Calibration", - "score": 0.6031752149929763 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.23139443056017028 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.23055057660174458 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.8951315789473684 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.36598228279277495 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4175808759142092 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.32471804511278196 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.243, - "details": { - "description": "min=0.19, mean=0.243, max=0.29, sum=3.642 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.069, mean=0.138, max=0.238, sum=2.071 (15)", - "tab": "Calibration", - "score": 0.1380385889615569 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.1, mean=0.19, max=0.263, sum=2.854 (15)", - "tab": "Robustness", - "score": 0.1902923976608187 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.15, mean=0.218, max=0.281, sum=3.266 (15)", - "tab": "Fairness", - "score": 0.21771929824561406 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.091, mean=0.092, max=0.095, sum=1.387 (15)", - "tab": "Efficiency", - "score": 0.09245237979714913 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "description": "min=0.597, mean=0.656, max=0.704, sum=1.969 (3)\nâš  Brown et 
al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.051, mean=0.079, max=0.115, sum=0.236 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.07881150352718548 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.484, mean=0.545, max=0.599, sum=1.635 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.545 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.535, mean=0.594, max=0.631, sum=1.782 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.594 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.096, mean=0.1, max=0.104, sum=0.3 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.09988102712673615 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.604, - "details": { - "description": "min=0.588, mean=0.604, max=0.632, sum=1.813 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.031, mean=0.045, max=0.056, sum=0.135 (3)", - "tab": "Calibration", - "score": 0.044936394093581626 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.352, mean=0.367, max=0.39, sum=1.1 (3)", - "tab": "Robustness", - "score": 0.36665112128820915 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.453, mean=0.482, max=0.515, sum=1.445 (3)", - "tab": "Fairness", - "score": 0.48150959406800437 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.14, mean=0.152, max=0.166, sum=0.455 (3)", - "tab": "Efficiency", - "score": 0.15159477332746474 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.775, mean=6.607, max=8.732, sum=19.82 (3)", - "tab": "General information", - "score": 6.606572769953051 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.431, mean=0.455, max=0.5, sum=1.364 (3)", - "tab": "Bias", - "score": 0.45462962962962966 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.209, mean=0.229, max=0.267, sum=0.688 (3)", - "tab": "Bias", - "score": 0.2292955082742317 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.017, max=0.017, sum=0.051 (3)", - "tab": "Toxicity", - "score": 0.016901408450704224 - }
- } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.552, - "details": { - "description": "min=0.521, mean=0.552, max=0.568, sum=1.655 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.014, mean=0.017, max=0.022, sum=0.052 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.01724854000741595 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.123, mean=0.134, max=0.149, sum=0.403 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.13427394452181574 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.118, mean=0.126, max=0.133, sum=0.379 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.1262678947150161 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.28, mean=0.338, max=0.381, sum=1.015 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.33838638278361 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.139, mean=0.147, max=0.151, sum=0.44 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.14670404179376148 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.446, mean=0.479, max=0.506, sum=1.436 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.47851717891712475 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.116, mean=0.122, max=0.128, sum=0.367 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.12234622395833335 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.166, mean=0.189, max=0.21, sum=0.566 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.18882224978298598 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.376, mean=6.313, max=7.104, sum=18.94 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 6.3133333333333335 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=9.89, mean=12.581, max=15.337, sum=37.742 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 12.580666666666668 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.291, mean=0.415, max=0.509, sum=1.245 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4150858887700994 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.119, mean=0.203, max=0.25, sum=0.608 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.20272601794340928 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.407, mean=0.469, max=0.5, sum=1.407 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.469047619047619 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.441, mean=0.453, max=0.467, sum=1.359 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4528357579590976 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.361, mean=0.379, max=0.397, sum=1.136 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.3786428074398272 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.005 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321, - "details": { - "description": "min=0.312, mean=0.321, max=0.335, sum=0.963 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.033, mean=0.043, max=0.055, sum=0.129 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.04303687950629059 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.164, mean=0.171, max=0.178, sum=0.513 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.1711623480279509 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.241, mean=0.243, max=0.245, sum=0.728 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.24255939370982219 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.31, mean=0.323, max=0.34, sum=0.968 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.32252038281250045 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=29.104, mean=31.034, max=33.548, sum=93.102 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 31.034000000000002 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.633, mean=0.645, max=0.667, sum=1.936 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6454545454545455 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.426, mean=0.439, max=0.452, sum=1.317 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4390862600512319 - }, - "QuAC - Representation (race)": { - "description": "min=0.2, mean=0.246, max=0.271, sum=0.738 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.24599483204134365 - }, - "QuAC - Representation (gender)": { - "description": "min=0.226, mean=0.231, max=0.234, sum=0.693 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.23109052551695608 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.003, sum=0.008 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0026666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.682, mean=0.682, max=0.682, sum=0.682 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.25 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.24965148877506194 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.632, mean=0.632, max=0.632, sum=0.632 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.632 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.522, mean=0.522, max=0.522, sum=0.522 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.522 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.084, mean=0.084, max=0.084, sum=0.084 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.08380637499999992 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "description": "min=0.502, mean=0.502, max=0.502, sum=0.502 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.26 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.25956257561884827 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.396 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.396 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.43 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.43 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.079, mean=0.079, max=0.079, sum=0.079 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.07928820312499986 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.232, - "details": { - "description": "min=0.222, mean=0.232, max=0.251, sum=0.696 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.05, mean=0.062, max=0.072, sum=0.186 (3)", - "tab": "Calibration", - "score": 0.06204978796421436 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.167, mean=0.186, max=0.214, sum=0.557 (3)", - "tab": "Robustness", - "score": 0.1855249745158002 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.165, mean=0.186, max=0.216, sum=0.558 (3)", - "tab": "Fairness", - "score": 0.18603465851172274 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.093, mean=0.094, max=0.094, sum=0.281 (3)", - "tab": "Efficiency", - "score": 0.09360438168960249 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3, - "details": { - "description": "min=0.279, mean=0.3, max=0.31, sum=0.899 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.086, mean=0.11, max=0.14, sum=0.33 (3)", - "tab": "Robustness", - "score": 0.10991481481481481 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.25, mean=0.253, max=0.254, sum=0.759 (3)", - "tab": "Robustness", - "score": 0.25287196320995325 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.119, mean=0.14, max=0.167, sum=0.42 (3)", - "tab": "Fairness", - "score": 0.14012791005291 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.266, mean=0.284, max=0.295, sum=0.852 (3)", - "tab": "Fairness", - 
"score": 0.2838824123845733 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.094, mean=0.094, max=0.095, sum=0.283 (3)", - "tab": "Efficiency", - "score": 0.09442029557291665 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.094, mean=0.095, max=0.097, sum=0.286 (3)", - "tab": "Efficiency", - "score": 0.09531934350775194 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.035, mean=1.112, max=1.183, sum=3.336 (3)", - "tab": "General information", - "score": 1.112 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1.093, mean=1.248, max=1.488, sum=3.744 (3)", - "tab": "General information", - "score": 1.248062015503876 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": 
null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.113, - "details": { - "description": "min=0.038, mean=0.113, max=0.141, sum=0.789 (7)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=0.559, mean=0.623, max=0.691, sum=4.363 (7)", - "tab": "Efficiency", - "score": 0.6232588631080115 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=3262 (7)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=0, mean=4.286, max=5, sum=30 (7)", - "tab": "General information", - "score": 4.285714285714286 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=583.586, mean=1411.872, max=1567.586, sum=9883.101 (7)", - "tab": "General information", - "score": 1411.8715511955854 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=65.127, mean=74.606, max=84.073, sum=522.245 (7)", - "tab": "General information", - "score": 74.60637645616187 - }, - "CNN/DailyMail - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.619, mean=0.642, max=0.667, sum=4.492 (7)", - "tab": "Bias", - "score": 0.6416796928441896 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.383, mean=0.409, max=0.43, sum=2.86 (7)", - "tab": "Bias", - "score": 0.40861926379951435 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.238, mean=0.295, max=0.417, sum=2.068 (7)", - "tab": "Bias", - "score": 0.29545894187058713 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.109, mean=0.129, max=0.144, sum=0.9 (7)", - "tab": "Bias", - "score": 0.12851266312443646 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (7)", - "tab": "Toxicity", - "score": 0.0006131207847946045 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.108, mean=0.354, max=0.557, sum=1.415 (4)", - "tab": "Summarization metrics", - "score": 0.3538436304603978 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=1.248, mean=4.204, max=4.78, sum=29.431 (7)", - "tab": "Summarization metrics", - "score": 4.20445410382703 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=-0.343, mean=0.089, max=0.264, sum=0.355 (4)", - "tab": "Summarization metrics", - "score": 0.08867060792677807 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.425, mean=0.89, max=0.973, sum=6.231 (7)", - "tab": "Summarization metrics", - "score": 0.8901263761958778 - }, - "CNN/DailyMail - Density": { - "description": "min=11.471, mean=23.472, max=34.455, sum=164.303 (7)", - "tab": 
"Summarization metrics", - "score": 23.471817181725523 - }, - "CNN/DailyMail - Compression": { - "description": "min=5.037, mean=9.495, max=12.229, sum=66.463 (7)", - "tab": "Summarization metrics", - "score": 9.494670330829432 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.287 (1)", - "tab": "Summarization metrics", - "score": 0.2866666666666666 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=1.933, mean=1.933, max=1.933, sum=1.933 (1)", - "tab": "Summarization metrics", - "score": 1.9333333333333333 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=1.767, mean=1.767, max=1.767, sum=1.767 (1)", - "tab": "Summarization metrics", - "score": 1.7666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.091, - "details": { - "description": "min=0.035, mean=0.091, max=0.104, sum=0.636 (7)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.274, mean=0.294, max=0.41, sum=2.059 (7)", - "tab": "Efficiency", - "score": 0.29416145294688817 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3626 (7)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=0, mean=4.285, max=5, sum=29.992 (7)", - "tab": "General information", - "score": 4.284611141753999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=388.402, mean=1350.13, max=1538.921, sum=9450.911 (7)", - "tab": "General information", - "score": 1350.1301709873137 - }, - "XSUM - # output tokens": { - "description": "min=24.405, mean=27.757, max=46.521, sum=194.297 (7)", - "tab": "General information", - "score": 27.75675675675676 - }, - "XSUM - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=3.333 (5)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.409, mean=0.449, max=0.488, sum=3.143 (7)", - "tab": "Bias", - "score": 0.44897893078382667 - }, - "XSUM - Representation (race)": { - "description": "min=0.446, mean=0.599, max=0.667, sum=4.196 (7)", - "tab": "Bias", - "score": 0.5994124922696351 - }, - "XSUM - Representation (gender)": { - "description": "min=0.169, mean=0.205, max=0.268, sum=1.435 (7)", - "tab": "Bias", - "score": 0.20496360887910145 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (7)", - "tab": "Toxicity", - "score": 0.0005515719801434088 - }, - "XSUM - SummaC": { - "description": "min=-0.237, mean=-0.143, max=0.073, sum=-0.574 (4)", - "tab": "Summarization metrics", - "score": -0.14346265436541167 - }, - "XSUM - QAFactEval": { - "description": "min=2.914, mean=3.922, max=4.204, sum=27.454 (7)", - "tab": "Summarization metrics", - "score": 
3.9220091164391953 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.091, mean=0.313, max=0.388, sum=1.251 (4)", - "tab": "Summarization metrics", - "score": 0.312644368874429 - }, - "XSUM - Coverage": { - "description": "min=0.795, mean=0.815, max=0.823, sum=5.707 (7)", - "tab": "Summarization metrics", - "score": 0.8152742026902194 - }, - "XSUM - Density": { - "description": "min=2.849, mean=5.57, max=19.82, sum=38.989 (7)", - "tab": "Summarization metrics", - "score": 5.569907111767537 - }, - "XSUM - Compression": { - "description": "min=10.146, mean=17.018, max=18.474, sum=119.123 (7)", - "tab": "Summarization metrics", - "score": 17.01754099745573 - }, - "XSUM - HumanEval-faithfulness": { - "description": "min=0.773, mean=0.924, max=1, sum=2.773 (3)", - "tab": "Summarization metrics", - "score": 0.9244444444444445 - }, - "XSUM - HumanEval-relevance": { - "description": "min=3.387, mean=3.573, max=3.667, sum=10.72 (3)", - "tab": "Summarization metrics", - "score": 3.573333333333333 - }, - "XSUM - HumanEval-coherence": { - "description": "min=3.163, mean=4.166, max=4.667, sum=12.497 (3)", - "tab": "Summarization metrics", - "score": 4.165555555555556 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.831, mean=0.889, max=0.939, sum=2.668 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.231, mean=0.259, max=0.285, sum=0.776 (3)", - "tab": "Calibration", - "score": 0.25871248887630766 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.716, mean=0.803, max=0.892, sum=2.41 (3)", - "tab": "Robustness", - "score": 0.8033333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.792, mean=0.86, max=0.922, sum=2.581 (3)", - "tab": "Fairness", - "score": 0.8603333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.105, mean=0.11, max=0.115, sum=0.331 (3)", - "tab": "Efficiency", - "score": 0.11035393728298622 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - 
Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539, - "details": { - "description": "min=0.012, mean=0.539, max=1, sum=29.083 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.042, mean=0.293, max=0.601, sum=15.826 (54)", - "tab": "Calibration", - "score": 0.29307434802498333 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.002, mean=0.347, max=1, sum=18.748 (54)", - "tab": "Robustness", - "score": 0.3471901723680723 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.412, max=1, sum=22.222 (54)", - "tab": "Fairness", - "score": 0.41152337126555366 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.09, mean=0.097, max=0.105, sum=5.259 (54)", - "tab": "Efficiency", - "score": 0.09739228545773865 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0, mean=0.49, max=0.975, sum=16.175 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.054, mean=0.319, max=0.977, sum=10.54 (33)", - "tab": "Calibration", - "score": 0.31939577693629423 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.413, max=0.975, sum=13.625 (33)", - "tab": "Robustness", - "score": 0.4128787878787879 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.473, max=0.975, sum=15.625 (33)", - "tab": "Fairness", - "score": 0.4734848484848485 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.094, mean=0.112, max=0.139, sum=3.696 (33)", - "tab": "Efficiency", - "score": 0.11198840159406566 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=0.025, mean=2.867, max=6.375, sum=94.6 (33)", - "tab": "General information", - "score": 2.8666666666666667 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json b/data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json deleted file mode 100644 index b376d2873..000000000 --- a/data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_davinci-175B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "davinci 175B", - "id": "openai/davinci-175B", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5745594499834401 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5094878610451469 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.5578754949166518 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.557938596491228 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.44460142486244675 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.42202673869340535 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.3600250626566416 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422, - "details": { - "description": "min=0.26, mean=0.422, max=0.7, sum=6.336 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.093, mean=0.132, max=0.18, sum=1.976 (15)", - "tab": "Calibration", - "score": 0.13175836488041992 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.17, mean=0.34, max=0.6, sum=5.102 (15)", - "tab": "Robustness", - "score": 0.3401169590643275 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.24, mean=0.38, max=0.61, sum=5.705 (15)", - "tab": "Fairness", - "score": 0.3803040935672514 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.203, mean=0.212, max=0.221, sum=3.181 (15)", - "tab": "Efficiency", - "score": 0.21209971402138156 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "description": "min=0.679, mean=0.722, max=0.77, sum=2.167 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.047, mean=0.072, max=0.103, sum=0.215 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.07164645838795872 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.592, mean=0.639, max=0.677, sum=1.918 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.6393333333333334 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.635, mean=0.682, max=0.729, sum=2.046 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.682 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.204, mean=0.21, max=0.217, sum=0.631 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.21022733463541673 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\n⚠ Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687, - "details": { - "description": "min=0.664, mean=0.687, max=0.706, sum=2.061 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.041, mean=0.067, max=0.109, sum=0.202 (3)", - "tab": "Calibration", - "score": 0.06738212205854943 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.476, mean=0.498, max=0.52, sum=1.493 (3)", - "tab": "Robustness", - "score": 0.4976057829109271 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.556, mean=0.597, max=0.634, sum=1.791 (3)", - "tab": "Fairness", - "score": 0.5970096000459133 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.36, mean=0.369, max=0.384, sum=1.108 (3)", - "tab": "Efficiency", - "score": 0.3694498019366194 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.338, mean=5.709, max=6.197, sum=17.127 (3)", - "tab": "General information", - "score": 5.708920187793427 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.4, mean=0.443, max=0.5, sum=1.329 (3)", - "tab": "Bias", - "score": 0.44285714285714284 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": 
"min=0.199, mean=0.208, max=0.221, sum=0.623 (3)", - "tab": "Bias", - "score": 0.2075773756101625 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.012, max=0.014, sum=0.037 (3)", - "tab": "Toxicity", - "score": 0.012206572769953052 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.599, mean=0.625, max=0.65, sum=1.874 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.054, mean=0.061, max=0.07, sum=0.182 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.06060614220397647 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.06, mean=0.079, max=0.1, sum=0.236 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.07854855230782792 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.251, mean=0.256, max=0.264, sum=0.769 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.2562420226045557 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.48, mean=0.521, max=0.561, sum=1.563 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.5211614334906893 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.271, mean=0.276, max=0.282, sum=0.828 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.2760483569290458 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.537, mean=0.567, max=0.594, sum=1.702 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.5674897299434086 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.304, mean=0.327, max=0.357, sum=0.981 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.32700476562499997 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.378, mean=0.462, max=0.583, sum=1.386 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.462036467447917 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.601, mean=5.361, max=6.345, sum=16.082 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.360666666666667 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.369, mean=8.992, max=12.931, sum=26.977 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 8.992333333333333 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.342, mean=0.447, max=0.5, sum=1.342 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4472502805836139 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.286, mean=0.382, max=0.439, sum=1.147 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.382401229992038 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.032, mean=0.247, max=0.4, sum=0.742 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.24726062467997953 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.293, mean=0.365, max=0.412, sum=1.096 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.3654871847728991 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.422, mean=0.435, max=0.447, sum=1.304 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4346811201445348 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.222, mean=0.244, max=0.271, sum=0.733 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.24420285420364105 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0003333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.002 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36, - "details": { - "description": "min=0.354, mean=0.36, max=0.367, sum=1.081 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.066, mean=0.068, max=0.071, sum=0.204 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.06797808745527684 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.197, mean=0.208, max=0.217, sum=0.623 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.20766668147064418 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.264, mean=0.279, max=0.288, sum=0.836 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.27860575089348755 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.01, mean=1.085, max=1.233, sum=3.256 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 1.085224210937499 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=27.082, mean=29.572, max=34.534, sum=88.717 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 29.572333333333333 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.636, mean=0.65, max=0.667, sum=1.949 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6495628554452085 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.435, mean=0.445, max=0.455, sum=1.335 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4451588893133011 - }, - "QuAC - Representation (race)": { - "description": "min=0.354, mean=0.367, max=0.375, sum=1.1 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.366690749431994 - }, - "QuAC - Representation (gender)": { - "description": "min=0.244, mean=0.251, max=0.256, sum=0.754 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.25124249915688174 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.775, mean=0.775, max=0.775, sum=0.775 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.31 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.30968673998386337 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.738, mean=0.738, max=0.738, sum=0.738 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.738 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.641, mean=0.641, max=0.641, sum=0.641 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.641 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.193, mean=0.193, max=0.193, sum=0.193 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.19329937499999997 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586, - "details": { - "description": "min=0.586, mean=0.586, max=0.586, sum=0.586 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.204 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.20443749582919374 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.474 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.474 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.502, mean=0.502, max=0.502, sum=0.502 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.502 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.184, mean=0.184, max=0.184, sum=0.184 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.18361757812499943 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.194, - "details": { - "description": "min=0.182, mean=0.194, max=0.213, sum=0.581 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.186, mean=0.211, max=0.224, sum=0.632 (3)", - "tab": "Calibration", - "score": 0.21061421693460983 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.131, mean=0.145, max=0.162, sum=0.434 (3)", - "tab": "Robustness", - "score": 0.14475025484199797 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.136, mean=0.155, max=0.185, sum=0.466 (3)", - "tab": "Fairness", - "score": 0.15545361875637104 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.208, mean=0.215, max=0.219, sum=0.645 (3)", - "tab": "Efficiency", - "score": 0.21492536613627675 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378, - "details": { - "description": "min=0.343, mean=0.378, max=0.397, sum=1.135 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.15, mean=0.154, max=0.157, sum=0.462 (3)", - "tab": "Robustness", - "score": 0.15391111111111108 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.31, mean=0.332, max=0.352, sum=0.996 (3)", - "tab": "Robustness", - "score": 0.3320850067305285 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.179, mean=0.185, max=0.192, sum=0.554 (3)", - "tab": "Fairness", - "score": 0.18462896825396802 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.324, mean=0.357, max=0.375, sum=1.072 (3)", - "tab": 
"Fairness", - "score": 0.35718542292055805 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.202, mean=0.211, max=0.218, sum=0.632 (3)", - "tab": "Efficiency", - "score": 0.21074697460937475 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.201, mean=0.214, max=0.221, sum=0.641 (3)", - "tab": "Efficiency", - "score": 0.2137389625726744 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - 
Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.127, - "details": { - "description": "min=0.087, mean=0.127, max=0.14, sum=0.889 (7)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.919, mean=2.256, max=3.967, sum=15.789 (7)", - "tab": "Efficiency", - "score": 2.255577085568669 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=3262 (7)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=0, mean=4.286, max=5, sum=30 (7)", - "tab": "General information", - "score": 4.285714285714286 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=583.586, mean=1411.872, max=1567.586, sum=9883.101 (7)", - "tab": "General information", - "score": 1411.8715511955854 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=57.459, mean=68.76, max=126.343, sum=481.322 (7)", - "tab": "General information", - "score": 68.76026977314531 - }, - "CNN/DailyMail - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.579, mean=0.619, max=0.641, sum=4.33 (7)", - "tab": "Bias", - "score": 0.618631744195654 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.373, mean=0.401, max=0.418, sum=2.804 (7)", - "tab": "Bias", - "score": 0.4005751850408633 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.235, mean=0.301, max=0.378, sum=2.105 (7)", - "tab": "Bias", - "score": 0.3007554818500092 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.111, mean=0.125, max=0.16, sum=0.876 (7)", - "tab": "Bias", - "score": 0.12511140031093898 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.002, sum=0.011 (7)", - "tab": "Toxicity", - "score": 0.0015328019619865114 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.08, mean=0.321, max=0.532, sum=1.284 (4)", - "tab": "Summarization metrics", - "score": 0.321074205166444 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=2.929, mean=4.062, max=4.888, sum=28.435 (7)", - "tab": "Summarization metrics", - "score": 4.062076530805548 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.023, mean=0.182, max=0.25, sum=0.729 (4)", - "tab": "Summarization metrics", - "score": 0.18232803102041212 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.72, mean=0.873, max=0.944, sum=6.111 (7)", - "tab": "Summarization metrics", - "score": 0.87307141297806 - }, - "CNN/DailyMail - Density": { - "description": "min=15.056, mean=17.914, max=20.184, sum=125.396 (7)", - "tab": "Summarization metrics", - "score": 
17.913710646412884 - }, - "CNN/DailyMail - Compression": { - "description": "min=4.761, mean=9.843, max=11.282, sum=68.899 (7)", - "tab": "Summarization metrics", - "score": 9.842721706219109 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=0.763, mean=0.953, max=1, sum=4.763 (5)", - "tab": "Summarization metrics", - "score": 0.9526666666666668 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=3.503, mean=4.501, max=5, sum=22.503 (5)", - "tab": "Summarization metrics", - "score": 4.500666666666667 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=2.647, mean=3.863, max=4.667, sum=19.313 (5)", - "tab": "Summarization metrics", - "score": 3.862666666666667 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.126, - "details": { - "description": "min=0.045, mean=0.126, max=0.144, sum=0.884 (7)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.958, mean=1.148, max=2.074, sum=8.038 (7)", - "tab": "Efficiency", - "score": 1.1482822034007862 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3626 (7)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=0, mean=4.285, max=5, sum=29.992 (7)", - "tab": "General information", - "score": 4.284611141753999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=388.402, mean=1350.13, max=1538.921, sum=9450.911 (7)", - "tab": "General information", - "score": 1350.1301709873137 - }, - "XSUM - # output tokens": { - "description": "min=25.444, mean=31.877, max=63.193, sum=223.139 (7)", - "tab": "General information", - "score": 31.87699944842802 - }, - "XSUM - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.667 (7)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.427, mean=0.444, max=0.469, sum=3.111 (7)", - "tab": "Bias", - "score": 0.44436594684493835 - }, - "XSUM - Representation (race)": { - "description": "min=0.473, mean=0.564, max=0.667, sum=3.948 (7)", - "tab": "Bias", - "score": 0.5639808220453382 - }, - "XSUM - Representation (gender)": { - "description": "min=0.189, mean=0.217, max=0.251, sum=1.521 (7)", - "tab": "Bias", - "score": 0.21723674492179154 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.003, max=0.015, sum=0.019 (7)", - "tab": "Toxicity", - "score": 0.0027578599007170436 - }, - "XSUM - SummaC": { - "description": "min=-0.317, mean=-0.267, max=-0.218, sum=-1.068 (4)", - "tab": "Summarization metrics", - "score": -0.2669066513504126 - }, - "XSUM - QAFactEval": { - "description": "min=1.878, mean=2.338, max=2.635, sum=16.363 (7)", - "tab": "Summarization metrics", - "score": 2.337582859954366 - }, - "XSUM - BERTScore (F1)": { - 
"description": "min=0.063, mean=0.318, max=0.423, sum=1.272 (4)", - "tab": "Summarization metrics", - "score": 0.3179425085241978 - }, - "XSUM - Coverage": { - "description": "min=0.698, mean=0.751, max=0.774, sum=5.255 (7)", - "tab": "Summarization metrics", - "score": 0.7506856271565006 - }, - "XSUM - Density": { - "description": "min=2.081, mean=3.351, max=10.076, sum=23.459 (7)", - "tab": "Summarization metrics", - "score": 3.3513024292310853 - }, - "XSUM - Compression": { - "description": "min=7.668, mean=14.08, max=15.293, sum=98.56 (7)", - "tab": "Summarization metrics", - "score": 14.079969364330754 - }, - "XSUM - HumanEval-faithfulness": { - "description": "min=0.5, mean=0.829, max=1, sum=5.803 (7)", - "tab": "Summarization metrics", - "score": 0.8290476190476191 - }, - "XSUM - HumanEval-relevance": { - "description": "min=2.833, mean=4.075, max=5, sum=28.523 (7)", - "tab": "Summarization metrics", - "score": 4.074761904761905 - }, - "XSUM - HumanEval-coherence": { - "description": "min=2.167, mean=3.398, max=5, sum=23.783 (7)", - "tab": "Summarization metrics", - "score": 3.397619047619048 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.933, - "details": { - "description": "min=0.925, mean=0.933, max=0.942, sum=2.8 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.104, mean=0.126, max=0.166, sum=0.378 (3)", - "tab": "Calibration", - "score": 0.12610548329130192 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.855, mean=0.873, max=0.89, sum=2.62 (3)", - "tab": "Robustness", - "score": 0.8733333333333334 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.917, mean=0.921, max=0.923, sum=2.762 (3)", - "tab": "Fairness", - "score": 0.9206666666666669 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.218, mean=0.225, max=0.231, sum=0.676 (3)", - "tab": "Efficiency", - "score": 0.22547806217447905 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching 
metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532, - "details": { - "description": "min=0.006, mean=0.532, max=1, sum=28.723 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.083, mean=0.396, max=0.664, sum=21.389 (54)", - "tab": "Calibration", - "score": 0.3960964912577608 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.461, max=1, sum=24.899 (54)", - "tab": "Robustness", - "score": 0.461098863197608 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.003, mean=0.478, max=1, sum=25.83 (54)", - "tab": "Fairness", - "score": 0.4783299102254815 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.203, mean=0.21, max=0.218, sum=11.326 (54)", - "tab": "Efficiency", - "score": 0.20974755918568705 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=0.998, mean=1.0, max=1.001, sum=54.0 (54)", - "tab": "General information", - "score": 0.9999957802714455 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.642, - "details": { - "description": "min=0.1, mean=0.642, max=0.975, sum=21.2 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.073, mean=0.222, max=0.806, sum=7.328 (33)", - "tab": "Calibration", - "score": 0.22206849861217967 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.505, max=0.975, sum=16.65 (33)", - "tab": "Robustness", - "score": 0.5045454545454545 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.05, mean=0.605, max=0.975, sum=19.95 (33)", - "tab": "Fairness", - "score": 0.6045454545454545 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.213, mean=0.279, max=0.378, sum=9.22 (33)", - "tab": "Efficiency", - "score": 0.2793995279947917 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=0.3, mean=3.056, max=6.575, sum=100.85 (33)", - "tab": "General information", - "score": 3.056060606060606 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json b/data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json deleted file mode 100644 index 8051b9b3e..000000000 --- a/data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0301/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-3.5-turbo-0301", - "id": "openai/gpt-3.5-turbo-0301", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.8156643356643357 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6617249417249418 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5128923320135726 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.8050116550116551 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.3, mean=0.59, max=0.85, sum=2.949 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.23, mean=0.525, max=0.79, sum=2.627 (5)", - "tab": "Robustness", - "score": 0.5254736842105263 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.26, mean=0.53, max=0.8, sum=2.65 (5)", - "tab": "Fairness", - "score": 0.5299649122807017 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)", - "tab": "General information", - "score": 460.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1.012, max=1.06, sum=5.06 (5)", - "tab": "General information", - "score": 1.012 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=0.74 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=0.66 (1)", - "tab": "Robustness", - "score": 0.66 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.666, mean=0.666, max=0.666, sum=0.666 (1)", - "tab": "Fairness", - "score": 0.666 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1220.329, mean=1220.329, max=1220.329, sum=1220.329 (1)", - "tab": "General information", - "score": 1220.329 - }, - "BoolQ - # output tokens": { - "description": "min=1.932, mean=1.932, max=1.932, sum=1.932 (1)", - "tab": "General information", - "score": 1.932 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.663, - "details": { - "description": "min=0.663, mean=0.663, max=0.663, sum=0.663 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.602, mean=0.602, max=0.602, sum=0.602 (1)", - "tab": "Robustness", - "score": 0.6017866194784781 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=0.585 (1)", - "tab": "Fairness", - "score": 0.5846601621436455 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.966, mean=4.966, 
max=4.966, sum=4.966 (1)", - "tab": "General information", - "score": 4.966197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3443.349, mean=3443.349, max=3443.349, sum=3443.349 (1)", - "tab": "General information", - "score": 3443.349295774648 - }, - "NarrativeQA - # output tokens": { - "description": "min=11.186, mean=11.186, max=11.186, sum=11.186 (1)", - "tab": "General information", - "score": 11.185915492957747 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.479, mean=0.479, max=0.479, sum=0.479 (1)", - "tab": "Bias", - "score": 0.4789473684210526 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.33333333333333337 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.216, mean=0.216, max=0.216, sum=0.216 (1)", - "tab": "Bias", - "score": 0.21590909090909088 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.011, max=0.011, sum=0.011 (1)", - "tab": "Toxicity", - "score": 0.011267605633802818 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624, - "details": { - "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.327 (1)", - "tab": "Robustness", - "score": 0.32682585209770315 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.556, mean=0.556, max=0.556, sum=0.556 (1)", - "tab": "Robustness", - "score": 0.5559619230719722 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.331 (1)", - "tab": "Fairness", - "score": 0.3309794595447127 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.559, mean=0.559, max=0.559, sum=0.559 (1)", - "tab": "Fairness", - "score": 0.5593911419045751 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - 
"NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=112.127, mean=112.127, max=112.127, sum=112.127 (1)", - "tab": "General information", - "score": 112.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=16.241, mean=16.241, max=16.241, sum=16.241 (1)", - "tab": "General information", - "score": 16.241 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.887, mean=4.887, max=4.887, sum=4.887 (1)", - "tab": "General information", - "score": 4.887 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.019, mean=0.019, max=0.019, sum=0.019 (1)", - "tab": "General information", - "score": 0.019 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1590.821, mean=1590.821, max=1590.821, sum=1590.821 (1)", - "tab": "General information", - "score": 1590.821 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=12.998, mean=12.998, max=12.998, sum=12.998 (1)", - "tab": "General information", - "score": 12.998 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)", - "tab": "Bias", - "score": 0.35333333333333333 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.364 (1)", - "tab": "Bias", - "score": 0.3643410852713178 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.167 (1)", - "tab": "Bias", - "score": 0.16666666666666669 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.408 (1)", - "tab": "Bias", - "score": 0.4083885209713024 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.236, mean=0.236, max=0.236, sum=0.236 (1)", - "tab": "Bias", - "score": 0.23584905660377362 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions 
(open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512, - "details": { - "description": "min=0.512, mean=0.512, max=0.512, sum=0.512 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.411 (1)", - "tab": "Robustness", - "score": 0.41122249859183385 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.417 (1)", - "tab": "Fairness", - "score": 0.4167691534016683 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=3.871, mean=3.871, max=3.871, sum=3.871 (1)", - "tab": "General information", - "score": 3.871 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=3461.981, mean=3461.981, max=3461.981, sum=3461.981 (1)", - "tab": "General information", - "score": 3461.981 - }, - "QuAC - # output tokens": { - "description": "min=23.136, mean=23.136, max=23.136, sum=23.136 (1)", - "tab": "General information", - "score": 23.136 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.639, mean=0.639, max=0.639, sum=0.639 (1)", - "tab": "Bias", - "score": 0.638888888888889 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.403 (1)", - "tab": "Bias", - "score": 0.40322916666666675 - }, - "QuAC - Representation (race)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.436 (1)", - "tab": "Bias", - "score": 0.43589743589743585 - }, - "QuAC - Representation (gender)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.229 (1)", - "tab": "Bias", - "score": 0.22941176470588232 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE 
(10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609, - "details": { - "description": "min=0.609, mean=0.609, 
max=0.609, sum=0.609 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=0.566 (1)", - "tab": "Robustness", - "score": 0.5657492354740061 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.514, mean=0.514, max=0.514, sum=0.514 (1)", - "tab": "Fairness", - "score": 0.5137614678899083 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=464.434, mean=464.434, max=464.434, sum=464.434 (1)", - "tab": "General information", - "score": 464.434250764526 - }, - "TruthfulQA - # output tokens": { - "description": "min=1.047, mean=1.047, max=1.047, sum=1.047 (1)", - "tab": "General information", - "score": 1.047400611620795 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - 
"score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.899, - "details": { - "description": "min=0.899, mean=0.899, max=0.899, sum=0.899 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.857, mean=0.857, max=0.857, sum=0.857 (1)", - "tab": "Robustness", - "score": 0.857 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.844, mean=0.844, max=0.844, sum=0.844 (1)", - "tab": "Fairness", - "score": 0.844 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=2543.665, mean=2543.665, max=2543.665, sum=2543.665 (1)", - "tab": "General information", - "score": 2543.665 - }, - "IMDB - # output tokens": { - "description": "min=1.006, mean=1.006, max=1.006, sum=1.006 (1)", - "tab": "General information", - "score": 1.006 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching 
metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674, - "details": { - "description": "min=0.528, mean=0.674, max=0.824, sum=12.134 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.394, mean=0.605, max=0.824, sum=10.882 (18)", - "tab": "Robustness", - "score": 0.6045521523734413 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.024, mean=0.422, max=0.824, sum=7.597 (18)", - "tab": "Fairness", - "score": 0.4220761773099496 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=333.915, mean=733.362, max=1226.723, sum=13200.513 (18)", - "tab": "General information", - "score": 733.3618295565135 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1.023, max=1.103, sum=18.406 (18)", - "tab": "General information", - "score": 1.0225713328901465 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.3, mean=0.768, max=0.975, sum=8.45 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.2, mean=0.705, max=0.975, sum=7.75 (11)", - "tab": "Robustness", - "score": 0.7045454545454546 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.025, mean=0.689, max=0.975, sum=7.575 (11)", - "tab": "Fairness", - "score": 0.6886363636363636 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=3, mean=4.818, max=5, sum=53 (11)", - "tab": "General information", - "score": 4.818181818181818 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=252.275, mean=1002.239, max=3545.1, sum=11024.625 (11)", - "tab": "General information", - "score": 1002.2386363636365 - }, - "RAFT - # output tokens": { - "description": "min=1.325, mean=2.982, max=5, sum=32.8 (11)", - "tab": "General information", - "score": 2.9818181818181815 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json b/data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json deleted file mode 100644 index b2682e6f7..000000000 --- a/data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-3.5-turbo-0613", - "id": "openai/gpt-3.5-turbo-0613", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How 
many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.7622144522144523 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7175058275058275 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5232317557148765 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.7166083916083916 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391, - "details": { - "description": "min=0.2, mean=0.391, max=0.73, sum=1.955 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.1, mean=0.262, max=0.49, sum=1.312 (5)", - "tab": "Robustness", - "score": 0.2623859649122807 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.12, mean=0.313, max=0.66, sum=1.566 (5)", - "tab": "Fairness", - "score": 0.31312280701754386 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)", - "tab": "General information", - "score": 460.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1.19, mean=1.371, max=1.61, sum=6.857 (5)", - "tab": "General information", - "score": 1.3714035087719298 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", -
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=0.87 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.845, mean=0.845, max=0.845, sum=0.845 (1)", - "tab": "Robustness", - "score": 0.845 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)", - "tab": "Fairness", - "score": 0.817 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1220.329, mean=1220.329, max=1220.329, sum=1220.329 (1)", - "tab": "General information", - "score": 1220.329 - }, - "BoolQ - # output tokens": { - "description": "min=1.057, mean=1.057, max=1.057, sum=1.057 (1)", - "tab": "General information", - "score": 1.057 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=0.566 (1)", - "tab": "Robustness", - "score": 0.5658549915417233 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.547, mean=0.547, max=0.547, sum=0.547 (1)", - "tab": "Fairness", - "score": 0.546599991762967 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 
355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.966, mean=4.966, max=4.966, sum=4.966 (1)", - "tab": "General information", - "score": 4.966197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3443.349, mean=3443.349, max=3443.349, sum=3443.349 (1)", - "tab": "General information", - "score": 3443.349295774648 - }, - "NarrativeQA - # output tokens": { - "description": "min=12.194, mean=12.194, max=12.194, sum=12.194 (1)", - "tab": "General information", - "score": 12.194366197183099 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.455 (1)", - "tab": "Bias", - "score": 0.45454545454545453 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.429 (1)", - "tab": "Bias", - "score": 0.42857142857142855 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.169, mean=0.169, max=0.169, sum=0.169 (1)", - "tab": "Bias", - "score": 0.16860465116279072 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.011, max=0.011, sum=0.011 (1)", - "tab": "Toxicity", - "score": 0.011267605633802818 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=0.675 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.284 (1)", - "tab": "Robustness", - "score": 0.28373438775512194 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.606, mean=0.606, max=0.606, sum=0.606 (1)", - "tab": "Robustness", - "score": 0.6060594363127481 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.287 (1)", - "tab": "Fairness", - "score": 0.2871379631388369 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.627, mean=0.627, max=0.627, sum=0.627 (1)", - "tab": "Fairness", - "score": 0.6270354958497198 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised 
inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=112.127, mean=112.127, max=112.127, sum=112.127 (1)", - "tab": "General information", - "score": 112.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=18.876, mean=18.876, max=18.876, sum=18.876 (1)", - "tab": "General information", - "score": 18.876 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.887, mean=4.887, max=4.887, sum=4.887 (1)", - "tab": "General information", - "score": 4.887 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.019, mean=0.019, max=0.019, sum=0.019 (1)", - "tab": "General information", - "score": 0.019 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1590.821, mean=1590.821, max=1590.821, sum=1590.821 (1)", - "tab": "General information", - "score": 1590.821 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=11.901, mean=11.901, max=11.901, sum=11.901 (1)", - "tab": "General information", - "score": 11.901 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.382 (1)", - "tab": "Bias", - "score": 0.38211382113821135 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.104, mean=0.104, max=0.104, sum=0.104 (1)", - "tab": "Bias", - "score": 0.10377358490566038 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.415 (1)", - "tab": "Bias", - "score": 0.41463414634146334 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.233 (1)", - "tab": "Bias", - "score": 0.23333333333333336 - }, - 
"NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.485, - "details": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.485 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.371 (1)", - "tab": "Robustness", - "score": 0.3712446607257685 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.398 (1)", - "tab": "Fairness", - "score": 0.3977545370248786 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=3.871, mean=3.871, max=3.871, sum=3.871 (1)", - "tab": "General information", - "score": 3.871 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=3461.981, mean=3461.981, max=3461.981, sum=3461.981 (1)", - "tab": "General information", - "score": 3461.981 - }, - "QuAC - # output tokens": { - "description": "min=25.691, mean=25.691, max=25.691, sum=25.691 (1)", - "tab": "General information", - "score": 25.691 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.589, mean=0.589, max=0.589, sum=0.589 (1)", - "tab": "Bias", - "score": 0.5889724310776943 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.403 (1)", - "tab": "Bias", - "score": 0.4030096483037659 - }, - "QuAC - Representation (race)": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.378 (1)", - "tab": "Bias", - "score": 0.3782051282051282 - }, - "QuAC - Representation (gender)": { - "description": "min=0.223, mean=0.223, max=0.223, sum=0.223 (1)", - "tab": "Bias", - "score": 0.22334293948126804 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.339, - "details": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.339 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.187, mean=0.187, max=0.187, sum=0.187 (1)", - "tab": "Robustness", - "score": 0.18654434250764526 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.255, mean=0.255, max=0.255, sum=0.255 (1)", - "tab": "Fairness", - "score": 0.25535168195718655 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=464.434, mean=464.434, max=464.434, sum=464.434 (1)", - "tab": "General information", - "score": 464.434250764526 - }, - "TruthfulQA - # output tokens": { - "description": "min=1.517, mean=1.517, max=1.517, sum=1.517 (1)", - "tab": "General information", - "score": 1.5168195718654434 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt 
tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM 
- Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.943, - "details": { - "description": "min=0.943, mean=0.943, max=0.943, sum=0.943 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.916, mean=0.916, max=0.916, sum=0.916 (1)", - "tab": "Robustness", - "score": 0.916 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.912, mean=0.912, max=0.912, sum=0.912 (1)", - "tab": "Fairness", - "score": 0.912 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=2543.665, mean=2543.665, max=2543.665, sum=2543.665 (1)", - "tab": "General information", - "score": 2543.665 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - 
"description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.48, mean=0.696, max=0.874, sum=12.534 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.206, mean=0.564, max=0.863, sum=10.15 (18)", - "tab": "Robustness", - "score": 0.5638779146224463 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.133, mean=0.525, max=0.863, sum=9.458 (18)", - "tab": "Fairness", - "score": 0.5254285459217098 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=333.915, mean=733.362, max=1226.723, sum=13200.513 (18)", - "tab": "General information", - "score": 733.3618295565135 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1.001, max=1.01, sum=18.025 (18)", - "tab": "General information", - "score": 1.0013947024944874 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.275, mean=0.748, max=0.95, sum=8.225 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.15, mean=0.677, max=0.95, sum=7.45 (11)", - "tab": "Robustness", - "score": 0.6772727272727272 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.2, mean=0.641, max=0.95, sum=7.05 (11)", - "tab": "Fairness", - "score": 0.640909090909091 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=3, mean=4.818, max=5, sum=53 (11)", - "tab": "General information", - "score": 4.818181818181818 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=252.275, mean=1002.239, max=3545.1, sum=11024.625 (11)", - "tab": "General information", - "score": 1002.2386363636365 - }, - "RAFT - # output tokens": { - "description": "min=1.275, mean=2.955, max=5.05, sum=32.5 (11)", - "tab": "General information", - "score": 2.9545454545454546 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json b/data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json deleted file mode 100644 index 43f728bf2..000000000 --- a/data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_text-ada-001/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "text-ada-001", - "id": "openai/text-ada-001", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.107, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.17139908178298557 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.10508470024599056 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.10817286162113748 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.937796052631579 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4261942744755245 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5531715198381865 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.48596491228070177 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.238, - "details": { - "description": "min=0.14, mean=0.238, max=0.31, sum=3.566 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.357, mean=0.506, max=0.666, sum=7.594 (15)", - "tab": "Calibration", - "score": 0.5062965949265723 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.08, mean=0.178, max=0.28, sum=2.665 (15)", - "tab": "Robustness", - "score": 0.17768421052631578 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.11, mean=0.202, max=0.28, sum=3.026 (15)", - "tab": "Fairness", - "score": 0.201766081871345 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.086, mean=0.088, max=0.089, sum=1.314 (15)", - "tab": "Efficiency", - "score": 0.08760755934758772 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - 
"dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.464, - "details": { - "description": "min=0.405, mean=0.464, max=0.503, sum=1.392 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.257, mean=0.346, max=0.483, sum=1.039 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.34632807207915267 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.316, mean=0.332, max=0.362, sum=0.997 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.33233333333333337 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.364, mean=0.378, max=0.397, sum=1.134 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.37799999999999995 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.09, mean=0.096, max=0.103, sum=0.287 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.09557654231770833 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=0.995, mean=1.003, max=1.009, sum=3.009 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.003 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.238, - "details": { - "description": "min=0.22, mean=0.238, max=0.273, sum=0.714 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.248, mean=0.319, max=0.386, sum=0.956 (3)", - "tab": "Calibration", - "score": 0.318718698868713 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.049, mean=0.058, max=0.075, sum=0.175 (3)", - "tab": "Robustness", - "score": 0.05828828370185365 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.11, mean=0.119, max=0.126, sum=0.356 (3)", - "tab": "Fairness", - "score": 0.1187630501762329 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.16, mean=0.171, max=0.186, sum=0.513 (3)", - "tab": "Efficiency", - "score": 0.1710890294894365 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=9.054, mean=10.756, max=13.293, sum=32.268 (3)", - "tab": "General information", - "score": 10.755868544600938 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.382, mean=0.403, max=0.438, sum=1.21 (3)", - "tab": "Bias", - "score": 0.40317130936696155 - }, - "NarrativeQA - Representation 
(race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.151, mean=0.203, max=0.252, sum=0.609 (3)", - "tab": "Bias", - "score": 0.20287726757892108 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.003, mean=0.006, max=0.008, sum=0.017 (3)", - "tab": "Toxicity", - "score": 0.005633802816901408 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.149, - "details": { - "description": "min=0.06, mean=0.149, max=0.193, sum=0.446 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.751, mean=0.764, max=0.789, sum=2.292 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.7640868917536278 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.6, mean=0.691, max=0.866, sum=2.072 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.6905918803748641 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.007, mean=0.008, max=0.009, sum=0.023 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.007711173104376766 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.01, mean=0.034, max=0.062, sum=0.102 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.033837452909760764 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.009, mean=0.012, max=0.018, sum=0.036 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.012133718750385417 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.026, mean=0.083, max=0.115, sum=0.249 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.08303504557607948 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.083, mean=0.085, max=0.087, sum=0.255 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.08484092187500009 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.119, mean=0.128, max=0.133, sum=0.383 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.12779065299479173 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.729, mean=1.04, max=1.418, sum=3.12 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0399999999999998 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1.801, mean=3.933, max=5.648, sum=11.799 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.933 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.167 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.16666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.567, mean=0.633, max=0.667, sum=1.9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6333333333333334 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.1, mean=0.217, max=0.318, sum=0.652 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.21717171717171715 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176, - "details": { - "description": "min=0.14, mean=0.176, max=0.203, sum=0.527 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.16, mean=0.268, max=0.362, sum=0.803 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.2675195450588613 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.054, mean=0.067, max=0.074, sum=0.201 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.06713428098997175 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.063, mean=0.091, max=0.113, sum=0.273 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.09086419903543015 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.194, mean=0.21, max=0.221, sum=0.629 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.20979015885416655 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=14.536, mean=17.274, max=19.327, sum=51.821 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 17.273666666666667 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.625, mean=0.653, max=0.667, sum=1.958 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6527777777777778 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.415, mean=0.433, max=0.448, sum=1.3 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4333686045042254 - }, - "QuAC - Representation (race)": { - "description": "min=0.308, mean=0.345, max=0.387, sum=1.034 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.34482454482454483 - }, - "QuAC - Representation (gender)": { - "description": "min=0.223, mean=0.244, max=0.269, sum=0.732 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.24387920564334062 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429, - "details": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.429 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.103, mean=0.103, max=0.103, sum=0.103 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.1034689985203878 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.32 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.32 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.27 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.27 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.079, mean=0.079, max=0.079, sum=0.079 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.07943312500000001 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346, - "details": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.346 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.487 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.4870210553256142 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.248 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.248 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.266 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.266 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.076, mean=0.076, max=0.076, sum=0.076 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.07620585937499988 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.232, - "details": { - "description": "min=0.216, mean=0.232, max=0.263, sum=0.696 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.418, mean=0.465, max=0.495, sum=1.395 (3)", - "tab": "Calibration", - "score": 0.46507296315502505 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.165, mean=0.175, max=0.194, sum=0.526 (3)", - "tab": "Robustness", - "score": 0.17533129459734964 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.18, mean=0.191, max=0.213, sum=0.573 (3)", - "tab": "Fairness", - "score": 0.191131498470948 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.088, mean=0.089, max=0.089, sum=0.266 (3)", - "tab": "Efficiency", - "score": 0.08860781608371561 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302, - "details": { - "description": "min=0.21, mean=0.302, max=0.353, sum=0.905 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.044, mean=0.069, max=0.091, sum=0.207 (3)", - "tab": "Robustness", - "score": 0.06911044973544983 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.172, mean=0.252, max=0.302, sum=0.757 (3)", - "tab": "Robustness", - "score": 0.2521954718959493 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.071, mean=0.107, max=0.133, sum=0.32 (3)", - "tab": "Fairness", - "score": 0.10653478835978836 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.177, mean=0.276, max=0.327, sum=0.827 (3)", - "tab": 
"Fairness", - "score": 0.2757254036023355 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.089, mean=0.09, max=0.091, sum=0.27 (3)", - "tab": "Efficiency", - "score": 0.08991796223958341 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.089, mean=0.09, max=0.09, sum=0.269 (3)", - "tab": "Efficiency", - "score": 0.08954472504844961 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.014, mean=1.123, max=1.303, sum=3.369 (3)", - "tab": "General information", - "score": 1.123 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=0.953, mean=1.101, max=1.326, sum=3.302 (3)", - "tab": "General information", - "score": 1.1007751937984496 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - 
"score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136, - "details": { - "description": "min=0.134, mean=0.136, max=0.137, sum=0.813 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=0.791, mean=0.793, max=0.796, sum=4.758 (6)", - "tab": "Efficiency", - "score": 0.7929256541152537 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=114.727, mean=114.938, max=115.313, sum=689.627 (6)", - "tab": "General information", - "score": 114.93776824034335 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.585, mean=0.603, max=0.618, sum=3.62 (6)", - "tab": "Bias", - "score": 0.6033209686988849 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.366, mean=0.376, max=0.394, sum=2.258 (6)", - "tab": "Bias", - "score": 0.376337569695528 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.32, mean=0.327, max=0.336, sum=1.964 (6)", - "tab": "Bias", - "score": 0.3273411562788524 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.118, mean=0.135, max=0.151, sum=0.81 (6)", - "tab": "Bias", - "score": 0.13502681064518518 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.202, mean=0.223, max=0.237, sum=0.67 (3)", - "tab": "Summarization metrics", - "score": 0.22335669413101697 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=2.69, mean=3.369, max=3.833, sum=20.217 (6)", - "tab": "Summarization metrics", - "score": 3.3694626717468696 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.244, mean=0.247, max=0.25, sum=0.741 (3)", - "tab": "Summarization metrics", - "score": 0.2468463296383967 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.923, mean=0.929, max=0.933, sum=5.574 (6)", - "tab": "Summarization metrics", - "score": 0.9289690481394134 - }, - "CNN/DailyMail - Density": { - "description": "min=28.745, mean=31.424, max=35.767, sum=188.544 (6)", - "tab": "Summarization metrics", - "score": 
31.424005422737114 - }, - "CNN/DailyMail - Compression": { - "description": "min=5.334, mean=5.461, max=5.548, sum=32.769 (6)", - "tab": "Summarization metrics", - "score": 5.461465024583634 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034, - "details": { - "description": "min=0.034, mean=0.034, max=0.036, sum=0.206 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.304, mean=0.311, max=0.318, sum=1.868 (6)", - "tab": "Efficiency", - "score": 0.31128436946991633 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=33.533, mean=34.806, max=36.037, sum=208.834 (6)", - "tab": "General information", - "score": 34.805662805662806 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.0 (6)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.387, mean=0.403, max=0.414, sum=2.418 (6)", - "tab": "Bias", - "score": 0.4030736615819075 - }, - "XSUM - Representation (race)": { - "description": "min=0.547, mean=0.597, max=0.623, sum=3.579 (6)", - "tab": "Bias", - "score": 0.5965455454885051 - }, - "XSUM - Representation (gender)": { - "description": "min=0.087, mean=0.19, max=0.25, sum=1.142 (6)", - "tab": "Bias", - "score": 0.19037429957632912 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.132, mean=-0.102, max=-0.078, sum=-0.305 (3)", - "tab": "Summarization metrics", - "score": -0.10168572979799827 - }, - "XSUM - QAFactEval": { - "description": "min=4.849, mean=4.929, max=5.055, sum=29.572 (6)", - "tab": "Summarization metrics", - "score": 4.92859074878104 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.237, mean=0.245, max=0.254, sum=0.734 (3)", - "tab": "Summarization metrics", - 
"score": 0.24476258912195994 - }, - "XSUM - Coverage": { - "description": "min=0.834, mean=0.847, max=0.866, sum=5.08 (6)", - "tab": "Summarization metrics", - "score": 0.8466942307223615 - }, - "XSUM - Density": { - "description": "min=7.289, mean=7.626, max=8.299, sum=45.753 (6)", - "tab": "Summarization metrics", - "score": 7.625570347216255 - }, - "XSUM - Compression": { - "description": "min=12.7, mean=13.08, max=13.496, sum=78.483 (6)", - "tab": "Summarization metrics", - "score": 13.080494860928995 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.776, mean=0.822, max=0.853, sum=2.466 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.053, mean=0.09, max=0.142, sum=0.269 (3)", - "tab": "Calibration", - "score": 0.08977338148861268 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.663, mean=0.716, max=0.744, sum=2.148 (3)", - "tab": "Robustness", - "score": 0.7160000000000001 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.724, mean=0.769, max=0.808, sum=2.308 (3)", - "tab": "Fairness", - "score": 0.7693333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.104, mean=0.109, max=0.114, sum=0.328 (3)", - "tab": "Efficiency", - "score": 0.109459033203125 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=1.006, mean=1.013, max=1.021, sum=3.039 (3)", - "tab": "General information", - "score": 1.013 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no 
matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503, - "details": { - "description": "min=0, mean=0.503, max=1, sum=27.18 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.012, mean=0.479, max=0.985, sum=25.845 (54)", - "tab": "Calibration", - "score": 0.47860750507636396 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.491, max=1, sum=26.518 (54)", - "tab": "Robustness", - "score": 0.4910745197871521 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.497, max=1, sum=26.82 (54)", - "tab": "Fairness", - "score": 0.49665917233754203 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.086, mean=0.092, max=0.103, sum=4.964 (54)", - "tab": "Efficiency", - "score": 0.0919244734885576 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406, - "details": { - "description": "min=0.05, mean=0.406, max=0.975, sum=13.4 (33)", - "tab": "Accuracy", 
- "RAFT - ECE (10-bin)": { - "description": "min=0.018, mean=0.473, max=0.891, sum=15.613 (33)", - "tab": "Calibration", - "score": 0.47311876061285835 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.335, max=0.925, sum=11.05 (33)", - "tab": "Robustness", - "score": 0.3348484848484849 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.05, mean=0.376, max=0.975, sum=12.4 (33)", - "tab": "Fairness", - "score": 0.3757575757575758 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.084, mean=0.107, max=0.14, sum=3.527 (33)", - "tab": "Efficiency", - "score": 0.10687999526515152 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=0.15, mean=2.997, max=6.925, sum=98.9 (33)", - "tab": "General information", - "score": 2.996969696969697 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json b/data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json deleted file mode 100644 index fbb4b5bb6..000000000 --- a/data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_text-babbage-001/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "text-babbage-001", - "id": "openai/text-babbage-001", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229, - "details": { - "tab": "Accuracy", - "Mean win rate - 
Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.27686841173581844 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.22569775422945612 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.2438772758572536 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7775548245614035 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5333126239886427 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5020704604037938 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6459690893901421 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229, - "details": { - "description": "min=0.11, mean=0.229, max=0.325, sum=3.431 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.16, mean=0.311, max=0.472, sum=4.659 (15)", - "tab": "Calibration", - "score": 0.31056724427484883 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.1, mean=0.186, max=0.228, sum=2.79 (15)", - "tab": "Robustness", - "score": 0.18602339181286548 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.09, mean=0.205, max=0.272, sum=3.077 (15)", - "tab": "Fairness", - "score": 0.20512280701754387 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.131, mean=0.133, max=0.135, sum=1.99 (15)", - "tab": "Efficiency", - "score": 0.13263352809758774 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451, - "details": { - "description": "min=0.414, mean=0.451, 
max=0.477, sum=1.353 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.318, mean=0.344, max=0.371, sum=1.031 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.34372183455656985 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.339, mean=0.384, max=0.412, sum=1.151 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.38366666666666666 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.388, mean=0.41, max=0.43, sum=1.23 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.41 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.136, mean=0.142, max=0.15, sum=0.426 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.14212787000868074 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al.
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1.004, max=1.008, sum=3.012 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.004 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429, - "details": { - "description": "min=0.412, mean=0.429, max=0.463, sum=1.288 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.158, mean=0.186, max=0.215, sum=0.557 (3)", - "tab": "Calibration", - "score": 0.18581698260430923 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.101, mean=0.126, max=0.154, sum=0.377 (3)", - "tab": "Robustness", - "score": 0.12577588570182116 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.277, mean=0.299, max=0.335, sum=0.896 (3)", - "tab": "Fairness", - "score": 0.29864937428822036 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.239, mean=0.243, max=0.246, sum=0.728 (3)", - "tab": "Efficiency", - "score": 0.24279079738849765 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=12.048, mean=12.829, max=13.307, sum=38.487 (3)", - "tab": "General information", - "score": 12.829107981220657 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.382, mean=0.403, max=0.433, sum=1.209 (3)", - "tab": "Bias", - "score": 0.40286362942612947 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.089,
mean=0.132, max=0.178, sum=0.395 (3)", - "tab": "Bias", - "score": 0.13153743304740043 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.003, mean=0.009, max=0.02, sum=0.028 (3)", - "tab": "Toxicity", - "score": 0.009389671361502348 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.296, mean=0.33, max=0.355, sum=0.989 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.505, mean=0.522, max=0.555, sum=1.567 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.5224886706365456 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.346, mean=0.385, max=0.427, sum=1.155 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.38493664744185446 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.039, mean=0.04, max=0.041, sum=0.119 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.039736972833954616 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.139, mean=0.151, max=0.169, sum=0.452 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.15066474277626352 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.048, mean=0.053, max=0.057, sum=0.16 (3)\n⚠ Brown et al.
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.05326475617936846 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.209, mean=0.24, max=0.263, sum=0.72 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.23984494964196315 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.134, mean=0.136, max=0.137, sum=0.407 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.1355529375 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.2, mean=0.204, max=0.207, sum=0.612 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.20402605620659717 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1.708, mean=2.016, max=2.304, sum=6.048 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 2.016 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.676, mean=7.772, max=7.9, sum=23.317 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 7.772333333333333 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.238, mean=0.317, max=0.467, sum=0.95 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.3167919799498747 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.125, mean=0.145, max=0.167, sum=0.435 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.14484126984126985 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)\n⚠ Brown et al.
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.286, mean=0.333, max=0.364, sum=0.999 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.3331168831168831 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.35, mean=0.403, max=0.457, sum=1.208 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4025813878698122 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.221, mean=0.243, max=0.273, sum=0.728 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.2427837942788109 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.284, - "details": { - "description": "min=0.279, mean=0.284, max=0.288, sum=0.852 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.224, mean=0.24, max=0.25, sum=0.72 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.2399406998223789 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.083, mean=0.087, max=0.091, sum=0.261 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.08703476784265192 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.188, mean=0.196, max=0.202, sum=0.589 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.19638729492261867 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.305, mean=0.314, max=0.32, sum=0.941 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.3136292994791667 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=21.715, mean=22.966, max=24.001, sum=68.897 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 22.965666666666667 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.593, mean=0.617, max=0.643, sum=1.851 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6171143671143672 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.425, mean=0.435, max=0.449, sum=1.305 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.43511418044370825 - }, - "QuAC - Representation (race)": { - "description": "min=0.342, mean=0.361, max=0.388, sum=1.084 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.36134886795921545 - }, - "QuAC - Representation (gender)": { - "description": "min=0.255, mean=0.26, max=0.268, sum=0.779 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.25974518866516266 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.561, mean=0.561, max=0.561, sum=0.561 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.083, mean=0.083, max=0.083, sum=0.083 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.08291053064819098 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.468 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.468 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.405 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.405 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.125, mean=0.125, max=0.125, sum=0.125 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.12474649999999997 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.452 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.362, mean=0.362, max=0.362, sum=0.362 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.36220844968968424 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.39 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.386 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.386 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.122, mean=0.122, max=0.122, sum=0.122 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.12216468749999997 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.233, - "details": { - "description": "min=0.2, mean=0.233, max=0.274, sum=0.699 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.202, mean=0.251, max=0.279, sum=0.752 (3)", - "tab": "Calibration", - "score": 0.2505684624777335 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.156, mean=0.195, max=0.252, sum=0.586 (3)", - "tab": "Robustness", - "score": 0.19520897043832822 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.173, mean=0.207, max=0.257, sum=0.622 (3)", - "tab": "Fairness", - "score": 0.20744138634046894 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.133, mean=0.134, max=0.134, sum=0.401 (3)", - "tab": "Efficiency", - "score": 0.1335233459161568 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449, - "details": { - "description": "min=0.42, mean=0.449, max=0.493, sum=1.347 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.099, mean=0.122, max=0.16, sum=0.366 (3)", - "tab": "Robustness", - "score": 0.12212023809523809 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.315, mean=0.356, max=0.413, sum=1.069 (3)", - "tab": "Robustness", - "score": 0.35630094105473137 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.152, mean=0.174, max=0.213, sum=0.523 (3)", - "tab": "Fairness", - "score": 0.17431719576719562 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.396, mean=0.424, max=0.469, sum=1.273 (3)", - "tab": 
"Fairness", - "score": 0.4244404820446352 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.136, mean=0.136, max=0.136, sum=0.408 (3)", - "tab": "Efficiency", - "score": 0.1359015429687499 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.135, mean=0.135, max=0.136, sum=0.406 (3)", - "tab": "Efficiency", - "score": 0.1353138323643411 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.142, mean=1.212, max=1.282, sum=3.635 (3)", - "tab": "General information", - "score": 1.2116666666666667 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=0.977, mean=1.132, max=1.326, sum=3.395 (3)", - "tab": "General information", - "score": 1.1317829457364341 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - 
"tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.151, - "details": { - "description": "min=0.147, mean=0.151, max=0.155, sum=0.907 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=0.951, mean=0.968, max=0.994, sum=5.81 (6)", - "tab": "Efficiency", - "score": 0.9683207451306926 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=114.333, mean=116.858, max=120.519, sum=701.146 (6)", - "tab": "General information", - "score": 116.85765379113019 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.623, mean=0.626, max=0.63, sum=3.757 (6)", - "tab": "Bias", - "score": 0.6261965622126104 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.369, mean=0.385, max=0.401, sum=2.312 (6)", - "tab": "Bias", - "score": 0.3853218330657557 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.366, mean=0.389, max=0.408, sum=2.333 (6)", - "tab": "Bias", - "score": 0.38877532854423413 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.142, mean=0.147, max=0.152, sum=0.879 (6)", - "tab": "Bias", - "score": 0.14657801266351475 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.347, mean=0.378, max=0.402, sum=1.135 (3)", - "tab": "Summarization metrics", - "score": 0.3784199534784201 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.659, mean=4.676, max=4.708, sum=28.057 (6)", - "tab": "Summarization metrics", - "score": 4.676089387380419 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.277, mean=0.282, max=0.285, sum=0.845 (3)", - "tab": "Summarization metrics", - "score": 0.28169928727191773 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.969, mean=0.972, max=0.973, sum=5.83 (6)", - "tab": "Summarization metrics", - "score": 0.9716251936961523 - }, - "CNN/DailyMail - Density": { - "description": "min=41.642, mean=45.948, max=53.738, sum=275.691 (6)", - "tab": "Summarization metrics", - "score": 
45.94847550953912 - }, - "CNN/DailyMail - Compression": { - "description": "min=5.013, mean=5.291, max=5.576, sum=31.744 (6)", - "tab": "Summarization metrics", - "score": 5.290663826380655 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.046, - "details": { - "description": "min=0.044, mean=0.046, max=0.047, sum=0.275 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.416, mean=0.431, max=0.439, sum=2.583 (6)", - "tab": "Efficiency", - "score": 0.43057023625187685 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=38.037, mean=40.165, max=41.259, sum=240.988 (6)", - "tab": "General information", - "score": 40.16473616473616 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.42, mean=0.443, max=0.467, sum=2.66 (6)", - "tab": "Bias", - "score": 0.44339662209590786 - }, - "XSUM - Representation (race)": { - "description": "min=0.436, mean=0.521, max=0.667, sum=3.124 (6)", - "tab": "Bias", - "score": 0.5206745206745207 - }, - "XSUM - Representation (gender)": { - "description": "min=0.178, mean=0.204, max=0.222, sum=1.222 (6)", - "tab": "Bias", - "score": 0.20364463830300386 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.008 (6)", - "tab": "Toxicity", - "score": 0.001287001287001287 - }, - "XSUM - SummaC": { - "description": "min=-0.078, mean=-0.057, max=-0.044, sum=-0.17 (3)", - "tab": "Summarization metrics", - "score": -0.05681849002633572 - }, - "XSUM - QAFactEval": { - "description": "min=4.256, mean=4.33, max=4.381, sum=25.981 (6)", - "tab": "Summarization metrics", - "score": 4.330178153632894 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.277, mean=0.281, max=0.286, sum=0.844 (3)", - "tab": 
"Summarization metrics", - "score": 0.28149043918051486 - }, - "XSUM - Coverage": { - "description": "min=0.873, mean=0.885, max=0.893, sum=5.312 (6)", - "tab": "Summarization metrics", - "score": 0.8853480945766184 - }, - "XSUM - Density": { - "description": "min=7.239, mean=8.487, max=9.133, sum=50.925 (6)", - "tab": "Summarization metrics", - "score": 8.487450287350649 - }, - "XSUM - Compression": { - "description": "min=11.1, mean=11.856, max=12.376, sum=71.136 (6)", - "tab": "Summarization metrics", - "score": 11.856076449493486 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.902, mean=0.913, max=0.921, sum=2.738 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.028, mean=0.038, max=0.05, sum=0.115 (3)", - "tab": "Calibration", - "score": 0.038396495508375095 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.821, mean=0.844, max=0.868, sum=2.532 (3)", - "tab": "Robustness", - "score": 0.844 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.871, mean=0.887, max=0.901, sum=2.66 (3)", - "tab": "Fairness", - "score": 0.8866666666666667 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.151, mean=0.157, max=0.162, sum=0.472 (3)", - "tab": "Efficiency", - "score": 0.15740409657118068 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1.001, max=1.003, sum=3.003 (3)", - "tab": "General information", - "score": 1.0010000000000001 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 
matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.499, - "details": { - "description": "min=0, mean=0.499, max=1, sum=26.951 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.092, mean=0.499, max=0.911, sum=26.966 (54)", - "tab": "Calibration", - "score": 0.49936533676896183 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.499, max=1, sum=26.94 (54)", - "tab": "Robustness", - "score": 0.4988821054609162 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.499, max=1, sum=26.936 (54)", - "tab": "Fairness", - "score": 0.4988205867192775 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.13, mean=0.138, max=0.151, sum=7.438 (54)", - "tab": "Efficiency", - "score": 0.13774715150926628 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0.125, mean=0.509, max=0.925, sum=16.8 
(33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.102, mean=0.295, max=0.541, sum=9.737 (33)", - "tab": "Calibration", - "score": 0.2950696376748286 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.383, max=0.925, sum=12.625 (33)", - "tab": "Robustness", - "score": 0.38257575757575757 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.125, mean=0.475, max=0.925, sum=15.675 (33)", - "tab": "Fairness", - "score": 0.47500000000000003 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.13, mean=0.153, max=0.188, sum=5.047 (33)", - "tab": "Efficiency", - "score": 0.15293320707070707 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=0.85, mean=2.774, max=5.875, sum=91.55 (33)", - "tab": "General information", - "score": 2.7742424242424244 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.025, sum=0.025 (33)", - "tab": "Toxicity", - "score": 0.0007575757575757576 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json b/data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json deleted file mode 100644 index 4537bcc84..000000000 --- a/data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_text-curie-001/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "text-curie-001", - "id": "openai/text-curie-001", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36, - 
"details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.33452535946368817 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.336998226097225 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.377271245624972 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7827028508771929 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.49509040746991073 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4050529717196384 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6165831244778613 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.237, - "details": { - "description": "min=0.21, mean=0.237, max=0.298, sum=3.558 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.298, mean=0.462, max=0.534, sum=6.937 (15)", - "tab": "Calibration", - "score": 0.4624557415628211 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.16, mean=0.22, max=0.272, sum=3.303 (15)", - "tab": "Robustness", - "score": 0.22019883040935673 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.2, mean=0.231, max=0.281, sum=3.462 (15)", - "tab": "Fairness", - "score": 0.23079532163742691 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.129, mean=0.133, max=0.14, sum=1.998 (15)", - "tab": "Efficiency", - "score": 0.13321992694627194 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62, - 
"details": { - "description": "min=0.591, mean=0.62, max=0.638, sum=1.861 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.239, mean=0.253, max=0.279, sum=0.758 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.252648729019218 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.519, mean=0.549, max=0.566, sum=1.648 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.5493333333333332 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.543, mean=0.576, max=0.592, sum=1.727 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.5756666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.141, mean=0.143, max=0.146, sum=0.429 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.14293199392361097 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1.004, mean=1.007, max=1.012, sum=3.021 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.007 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.582, - "details": { - "description": "min=0.55, mean=0.582, max=0.63, sum=1.746 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.198, mean=0.221, max=0.233, sum=0.664 (3)", - "tab": "Calibration", - "score": 0.22125645338584943 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.299, mean=0.34, max=0.38, sum=1.02 (3)", - "tab": "Robustness", - "score": 0.33989457936851464 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.428, mean=0.463, max=0.5, sum=1.389 (3)", - "tab": "Fairness", - "score": 0.4630759323159577 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.19, mean=0.205, max=0.217, sum=0.615 (3)", - "tab": "Efficiency", - "score": 0.20493085387323948 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.645, mean=8.971, max=10.738, sum=26.913 (3)", - "tab": "General information", - "score": 8.970892018779344 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.436, mean=0.446, max=0.453, sum=1.339 (3)", - "tab": "Bias", - "score": 0.44628176056747487 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.494, mean=0.609, max=0.667, sum=1.828 (3)", - "tab": "Bias", - "score": 0.6091954022988506 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.161, 
mean=0.19, max=0.207, sum=0.569 (3)", - "tab": "Bias", - "score": 0.1896444305777106 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.015, max=0.017, sum=0.045 (3)", - "tab": "Toxicity", - "score": 0.015023474178403754 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.571, - "details": { - "description": "min=0.536, mean=0.571, max=0.599, sum=1.714 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.233, mean=0.253, max=0.264, sum=0.758 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.25269080261254767 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.215, mean=0.216, max=0.217, sum=0.648 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.21613185314031233 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.116, mean=0.121, max=0.124, sum=0.363 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.12098406641539787 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.365, mean=0.415, max=0.445, sum=1.246 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.4152585116053236 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.124, mean=0.132, max=0.139, sum=0.396 (3)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.13187631785928275 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.464, mean=0.5, max=0.519, sum=1.499 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.4995085831746681 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.152, mean=0.153, max=0.154, sum=0.459 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.15303552604166656 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.176, mean=0.185, max=0.193, sum=0.554 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.1847613116319444 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.507, mean=4.641, max=4.737, sum=13.923 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.641 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.931, mean=6.634, max=7.52, sum=19.901 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 6.633666666666667 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.542, mean=0.566, max=0.6, sum=1.697 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5657407407407408 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.119, mean=0.238, max=0.346, sum=0.715 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.23840048840048841 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.3, mean=0.433, max=0.5, sum=1.3 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.43333333333333335 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.405, mean=0.441, max=0.467, sum=1.323 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.44097026888062185 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.144, mean=0.158, max=0.179, sum=0.473 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.15754640839386602 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358, - "details": { - "description": "min=0.341, mean=0.358, max=0.383, sum=1.074 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.237, mean=0.254, max=0.272, sum=0.763 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.25427485237899866 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.166, mean=0.169, max=0.173, sum=0.506 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.16872479684813432 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.244, mean=0.255, max=0.264, sum=0.765 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.2548639356870548 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.287, mean=0.298, max=0.313, sum=0.894 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.29803956770833356 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=20.676, mean=22.198, max=24.409, sum=66.593 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 22.197666666666663 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.593, mean=0.631, max=0.667, sum=1.893 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6308641975308643 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.438, mean=0.456, max=0.473, sum=1.367 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4556780038650607 - }, - "QuAC - Representation (race)": { - "description": "min=0.244, mean=0.274, max=0.294, sum=0.822 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.27410775768984724 - }, - "QuAC - Representation (gender)": { - "description": "min=0.231, mean=0.242, max=0.26, sum=0.726 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.24189395211611728 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0013333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.676, - "details": { - "description": "min=0.676, mean=0.676, max=0.676, sum=0.676 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.153, mean=0.153, max=0.153, sum=0.153 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.15281579026404526 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.625 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.534, mean=0.534, max=0.534, sum=0.534 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.534 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.125, mean=0.125, max=0.125, sum=0.125 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.12517962499999974 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514, - "details": { - "description": "min=0.514, mean=0.514, max=0.514, sum=0.514 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.321 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.3206023655720099 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.424 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.424 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.452 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.452 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.119, mean=0.119, max=0.119, sum=0.119 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.1193705468750003 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.257, - "details": { - "description": "min=0.231, mean=0.257, max=0.301, sum=0.772 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.321, mean=0.355, max=0.375, sum=1.066 (3)", - "tab": "Calibration", - "score": 0.35539796883884156 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.206, mean=0.235, max=0.284, sum=0.705 (3)", - "tab": "Robustness", - "score": 0.23496432212028542 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.209, mean=0.239, max=0.286, sum=0.717 (3)", - "tab": "Fairness", - "score": 0.23904179408766565 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.134, mean=0.134, max=0.136, sum=0.403 (3)", - "tab": "Efficiency", - "score": 0.1343441023987004 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0.476, mean=0.507, max=0.545, sum=1.522 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.171, mean=0.198, max=0.222, sum=0.594 (3)", - "tab": "Robustness", - "score": 0.1980144179894178 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.393, mean=0.444, max=0.486, sum=1.331 (3)", - "tab": "Robustness", - "score": 0.4437543283018195 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.231, mean=0.244, max=0.26, sum=0.732 (3)", - "tab": "Fairness", - "score": 0.2441616402116399 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.448, mean=0.482, max=0.523, sum=1.445 (3)", - "tab": 
"Fairness", - "score": 0.4817143719085842 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.134, mean=0.136, max=0.138, sum=0.408 (3)", - "tab": "Efficiency", - "score": 0.13591170442708336 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.133, mean=0.135, max=0.138, sum=0.406 (3)", - "tab": "Efficiency", - "score": 0.13529218144379848 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.005, mean=1.031, max=1.08, sum=3.092 (3)", - "tab": "General information", - "score": 1.0306666666666666 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1.078, max=1.209, sum=3.233 (3)", - "tab": "General information", - "score": 1.0775193798449612 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": 
"Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.152, - "details": { - "description": "min=0.144, mean=0.152, max=0.159, sum=1.061 (7)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=0.748, mean=0.799, max=0.848, sum=5.594 (7)", - "tab": "Efficiency", - "score": 0.7991309579692929 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=3262 (7)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=0, mean=4.286, max=5, sum=30 (7)", - "tab": "General information", - "score": 4.285714285714286 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=583.586, mean=1411.872, max=1567.586, sum=9883.101 (7)", - "tab": "General information", - "score": 1411.8715511955854 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=86.798, mean=94.314, max=101.208, sum=660.2 (7)", - "tab": "General information", - "score": 94.31422440220724 - }, - "CNN/DailyMail - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.566, mean=0.61, max=0.637, sum=4.269 (7)", - "tab": "Bias", - "score": 0.609875949224765 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.366, mean=0.387, max=0.406, sum=2.706 (7)", - "tab": "Bias", - "score": 0.38654992671117155 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.282, mean=0.301, max=0.322, sum=2.106 (7)", - "tab": "Bias", - "score": 0.30088570849440416 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.1, mean=0.118, max=0.133, sum=0.827 (7)", - "tab": "Bias", - "score": 0.11810804679822585 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.156, mean=0.291, max=0.356, sum=1.165 (4)", - "tab": "Summarization metrics", - "score": 0.2913458656100147 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.214, mean=4.616, max=4.743, sum=32.315 (7)", - "tab": "Summarization metrics", - "score": 4.616429547159027 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.299, mean=0.306, max=0.314, sum=1.222 (4)", - "tab": "Summarization metrics", - "score": 0.3055441003363248 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.935, mean=0.961, max=0.97, sum=6.725 (7)", - "tab": "Summarization metrics", - "score": 0.9607616041668255 - }, - "CNN/DailyMail - Density": { - "description": "min=17.105, mean=26.1, max=29.982, sum=182.7 (7)", - "tab": "Summarization metrics", - 
"score": 26.09992906850249 - }, - "CNN/DailyMail - Compression": { - "description": "min=6.155, mean=6.829, max=7.635, sum=47.805 (7)", - "tab": "Summarization metrics", - "score": 6.829258437977153 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=0.967, mean=0.967, max=0.967, sum=0.967 (1)", - "tab": "Summarization metrics", - "score": 0.9666666666666669 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=4.587, mean=4.587, max=4.587, sum=4.587 (1)", - "tab": "Summarization metrics", - "score": 4.586666666666667 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=4.243, mean=4.243, max=4.243, sum=4.243 (1)", - "tab": "Summarization metrics", - "score": 4.243333333333334 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.076, - "details": { - "description": "min=0.056, mean=0.076, max=0.081, sum=0.533 (7)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.349, mean=0.364, max=0.408, sum=2.548 (7)", - "tab": "Efficiency", - "score": 0.36398217373942815 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3626 (7)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=0, mean=4.285, max=5, sum=29.992 (7)", - "tab": "General information", - "score": 4.284611141753999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=388.402, mean=1350.13, max=1538.921, sum=9450.911 (7)", - "tab": "General information", - "score": 1350.1301709873137 - }, - "XSUM - # output tokens": { - "description": "min=29.917, mean=32.345, max=40.357, sum=226.415 (7)", - "tab": "General information", - "score": 32.3450082735797 - }, - "XSUM - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.667 (7)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.419, mean=0.442, max=0.466, sum=3.093 (7)", - "tab": "Bias", - "score": 0.4418823146165695 - }, - "XSUM - Representation (race)": { - "description": "min=0.473, mean=0.54, max=0.584, sum=3.777 (7)", - "tab": "Bias", - "score": 0.5395129666982432 - }, - "XSUM - Representation (gender)": { - "description": "min=0.172, mean=0.194, max=0.228, sum=1.356 (7)", - "tab": "Bias", - "score": 0.1937219794503278 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (7)", - "tab": "Toxicity", - "score": 0.0016547159404302263 - }, - "XSUM - SummaC": { - "description": "min=-0.241, mean=-0.185, max=-0.057, sum=-0.741 (4)", - "tab": "Summarization metrics", - "score": -0.18531544589014434 - }, - "XSUM - QAFactEval": { - "description": "min=3.199, mean=3.459, max=3.799, sum=24.213 (7)", - "tab": "Summarization metrics", - "score": 3.458996653634986 - }, - "XSUM - BERTScore 
(F1)": { - "description": "min=0.308, mean=0.354, max=0.372, sum=1.415 (4)", - "tab": "Summarization metrics", - "score": 0.3536865086232682 - }, - "XSUM - Coverage": { - "description": "min=0.823, mean=0.839, max=0.903, sum=5.872 (7)", - "tab": "Summarization metrics", - "score": 0.838839539634714 - }, - "XSUM - Density": { - "description": "min=3.005, mean=4.008, max=8.274, sum=28.059 (7)", - "tab": "Summarization metrics", - "score": 4.008473483028278 - }, - "XSUM - Compression": { - "description": "min=11.556, mean=12.98, max=13.601, sum=90.86 (7)", - "tab": "Summarization metrics", - "score": 12.979988031884476 - }, - "XSUM - HumanEval-faithfulness": { - "description": "min=0.957, mean=0.991, max=1, sum=4.957 (5)", - "tab": "Summarization metrics", - "score": 0.9913333333333334 - }, - "XSUM - HumanEval-relevance": { - "description": "min=4, mean=4.068, max=4.34, sum=20.34 (5)", - "tab": "Summarization metrics", - "score": 4.068 - }, - "XSUM - HumanEval-coherence": { - "description": "min=4.273, mean=4.321, max=4.333, sum=21.607 (5)", - "tab": "Summarization metrics", - "score": 4.3213333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923, - "details": { - "description": "min=0.915, mean=0.923, max=0.927, sum=2.768 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.027, mean=0.031, max=0.034, sum=0.093 (3)", - "tab": "Calibration", - "score": 0.03108408690404522 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.876, mean=0.881, max=0.887, sum=2.642 (3)", - "tab": "Robustness", - "score": 0.8806666666666666 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.903, mean=0.91, max=0.916, sum=2.731 (3)", - "tab": "Fairness", - "score": 0.9103333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.142, mean=0.147, max=0.151, sum=0.442 (3)", - "tab": "Efficiency", - "score": 0.1473289437934027 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=0.998, mean=0.999, max=1, sum=2.996 (3)", - "tab": "General information", - "score": 0.9986666666666667 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 
matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537, - "details": { - "description": "min=0.04, mean=0.537, max=0.93, sum=29.013 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.049, mean=0.262, max=0.674, sum=14.15 (54)", - "tab": "Calibration", - "score": 0.26204430696260744 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.129, max=0.39, sum=6.954 (54)", - "tab": "Robustness", - "score": 0.12877898867890694 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.02, mean=0.471, max=0.874, sum=25.434 (54)", - "tab": "Fairness", - "score": 0.4710066762167616 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.129, mean=0.142, max=0.149, sum=7.645 (54)", - "tab": "Efficiency", - "score": 0.1415740791295965 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=0.905, mean=0.979, max=1, sum=52.876 (54)", - "tab": "General information", - "score": 0.9791789992573504 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.489, - "details": { - "description": "min=0, mean=0.489, max=0.85, sum=16.15 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.079, mean=0.409, max=1, sum=13.49 (33)", - "tab": "Calibration", - "score": 0.40879785924457385 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.399, max=0.775, sum=13.175 (33)", - "tab": "Robustness", - "score": 0.3992424242424243 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.458, max=0.85, sum=15.125 (33)", - "tab": "Fairness", - "score": 0.45833333333333337 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.13, mean=0.152, max=0.183, sum=5.003 (33)", - "tab": "Efficiency", - "score": 0.1516085454150884 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=0, mean=2.751, max=5.95, sum=90.775 (33)", - "tab": "General information", - "score": 2.750757575757576 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json b/data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json deleted file mode 100644 index 0e9fa4947..000000000 --- a/data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_text-davinci-002/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "text-davinci-002", - "id": "openai/text-davinci-002", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over 
columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.4743236143945364 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.9158568720860156 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.8637256699548135 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6036239035087719 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.502171676177358 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4088448588448588 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6410087719298245 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.568, - "details": { - "description": "min=0.26, mean=0.568, max=0.86, sum=8.515 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.064, mean=0.176, max=0.264, sum=2.644 (15)", - "tab": "Calibration", - "score": 0.17629729974248792 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.23, mean=0.525, max=0.83, sum=7.868 (15)", - "tab": "Robustness", - "score": 0.5245380116959065 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.24, mean=0.531, max=0.82, sum=7.964 (15)", - "tab": "Fairness", - "score": 0.5309473684210526 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.175, mean=0.196, max=0.215, sum=2.946 (15)", - "tab": "Efficiency", - "score": 0.19643028419682018 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.872, mean=0.877, max=0.883, sum=2.631 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.057, mean=0.064, max=0.068, sum=0.192 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.06391934132499137 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.834, mean=0.841, max=0.854, sum=2.523 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.8410000000000001 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.829, mean=0.837, max=0.844, sum=2.51 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.8366666666666666 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.176, mean=0.191, max=0.216, sum=0.574 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.1911954346788195 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1.009, mean=1.013, max=1.018, sum=3.039 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.013 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.711, mean=0.727, max=0.752, sum=2.182 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.214, mean=0.239, max=0.268, sum=0.718 (3)", - "tab": "Calibration", - "score": 0.2393596998509794 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.61, mean=0.638, max=0.663, sum=1.915 (3)", - "tab": "Robustness", - "score": 0.6382180079306305 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.637, mean=0.646, max=0.664, sum=1.938 (3)", - "tab": "Fairness", - "score": 0.6459531095726224 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.48, mean=0.512, max=0.539, sum=1.537 (3)", - "tab": "Efficiency", - "score": 0.5124278205692486 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.259, mean=4.532, max=4.955, sum=13.597 (3)", - "tab": "General information", - "score": 4.532394366197183 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3479.563, mean=3579.093, max=3633.659, sum=10737.279 (3)", - "tab": "General information", - "score": 3579.092957746479 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.158, mean=7.378, max=8.448, sum=22.135 (3)", - "tab": "General information", - "score": 7.378403755868544 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.363, mean=0.395, max=0.417, sum=1.184 (3)", - "tab": "Bias", - "score": 0.39479717813051146 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - 
"description": "min=0.17, mean=0.189, max=0.21, sum=0.568 (3)", - "tab": "Bias", - "score": 0.18948121770702417 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.013, max=0.017, sum=0.039 (3)", - "tab": "Toxicity", - "score": 0.013145539906103286 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "description": "min=0.71, mean=0.713, max=0.716, sum=2.139 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.315, mean=0.341, max=0.356, sum=1.022 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.34056739358291327 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.233, mean=0.242, max=0.247, sum=0.726 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.24207582378172995 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.279, mean=0.299, max=0.31, sum=0.896 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.29853007347043187 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.66, mean=0.665, max=0.67, sum=1.994 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.6645627340843298 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.311, mean=0.32, max=0.326, sum=0.96 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.3200640288704773 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.655, mean=0.659, max=0.663, sum=1.976 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.658783235208417 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.259, mean=0.264, max=0.268, sum=0.791 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.26376651302083315 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.387, mean=0.394, max=0.398, sum=1.182 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.3939576829427085 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.783, mean=3.954, max=4.116, sum=11.861 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.9536666666666664 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.874, mean=4.883, max=4.891, sum=14.65 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.883333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.02 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1315.257, mean=1520.977, max=1629.945, sum=4562.931 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1520.977 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.586, mean=6.652, max=6.739, sum=19.957 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 6.652333333333334 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.439, mean=0.448, max=0.467, sum=1.344 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.44795321637426905 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.079, mean=0.129, max=0.167, sum=0.388 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.1294903926482874 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.4, mean=0.407, max=0.42, sum=1.22 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.40666666666666673 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.474, mean=0.487, max=0.505, sum=1.46 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.48653132655730696 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.375, mean=0.401, max=0.44, sum=1.202 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.40059748427672953 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445, - "details": { - "description": "min=0.435, mean=0.445, max=0.451, sum=1.335 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.234, mean=0.274, max=0.301, sum=0.821 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.27378530130603257 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.313, mean=0.319, max=0.331, sum=0.958 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.3193910892114107 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.339, mean=0.353, max=0.363, sum=1.06 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.3532761321768228 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.887, mean=0.891, max=0.894, sum=2.674 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.8912715646701383 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=2.978, mean=3.438, max=3.878, sum=10.315 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.438333333333333 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=2819.048, mean=3249.907, max=3487.39, sum=9749.722 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3249.907333333333 - }, - "QuAC - # output tokens": { - "description": "min=20.711, mean=20.986, max=21.534, sum=62.959 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 20.98633333333333 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.567, mean=0.579, max=0.6, sum=1.738 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5793650793650794 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.443, mean=0.453, max=0.461, sum=1.358 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4526990667248227 - }, - "QuAC - Representation (race)": { - "description": "min=0.256, mean=0.27, max=0.28, sum=0.81 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.2701590708612791 - }, - "QuAC - Representation (gender)": { - "description": "min=0.245, mean=0.255, max=0.265, sum=0.764 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.2545671124587146 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.003, sum=0.007 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0023333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.286 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.2864163850455534 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.776, mean=0.776, max=0.776, sum=0.776 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.776 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.703, mean=0.703, max=0.703, sum=0.703 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.703 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.171 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.1710758125 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.594, - "details": { - "description": "min=0.594, mean=0.594, max=0.594, sum=0.594 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.238 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.23789749910476482 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=0.52 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.52 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=0.54 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.54 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.158, mean=0.158, max=0.158, sum=0.158 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.1578440234375 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "details": { - "description": "min=0.596, mean=0.61, max=0.63, sum=1.829 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.167, mean=0.199, max=0.232, sum=0.596 (3)", - "tab": "Calibration", - "score": 0.19868497875362334 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.517, mean=0.547, max=0.573, sum=1.641 (3)", - "tab": "Robustness", - "score": 0.5468909276248726 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.48, mean=0.515, max=0.547, sum=1.546 (3)", - "tab": "Fairness", - "score": 0.5152905198776758 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.186, mean=0.2, max=0.208, sum=0.601 (3)", - "tab": "Efficiency", - "score": 0.20048467762487246 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664, - "details": { - "description": "min=0.642, mean=0.664, max=0.685, sum=1.991 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.327, mean=0.344, max=0.366, sum=1.031 (3)", - "tab": "Robustness", - "score": 0.3435873015873012 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.615, mean=0.628, max=0.641, sum=1.884 (3)", - "tab": "Robustness", - "score": 0.627999061572698 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.357, mean=0.373, max=0.39, sum=1.12 (3)", - "tab": "Fairness", - "score": 0.3732579365079361 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.614, mean=0.639, max=0.663, sum=1.917 (3)", - "tab": "Fairness", 
- "score": 0.6388640932298691 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.174, mean=0.192, max=0.207, sum=0.577 (3)", - "tab": "Efficiency", - "score": 0.19244404882812502 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.173, mean=0.198, max=0.213, sum=0.594 (3)", - "tab": "Efficiency", - "score": 0.19810631661821707 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.006, mean=1.014, max=1.024, sum=3.042 (3)", - "tab": "General information", - "score": 1.014 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=0.977, mean=0.992, max=1, sum=2.977 (3)", - "tab": "General information", - "score": 0.9922480620155039 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": 
null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.153, - "details": { - "description": "min=0.148, mean=0.153, max=0.156, sum=1.074 (7)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=2.064, mean=2.236, max=2.638, sum=15.65 (7)", - "tab": "Efficiency", - "score": 2.235718461202547 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=3262 (7)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=0, mean=4.286, max=5, sum=30 (7)", - "tab": "General information", - "score": 4.285714285714286 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=583.586, mean=1411.872, max=1567.586, sum=9883.101 (7)", - "tab": "General information", - "score": 1411.8715511955854 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=64.197, mean=70.37, max=85.644, sum=492.592 (7)", - "tab": "General information", - "score": 70.37032495401594 - }, - "CNN/DailyMail - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.603, mean=0.625, max=0.667, sum=4.375 (7)", - "tab": "Bias", - "score": 0.6249837439576494 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.388, mean=0.408, max=0.42, sum=2.856 (7)", - "tab": "Bias", - "score": 0.4080224162158765 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.238, mean=0.293, max=0.347, sum=2.051 (7)", - "tab": "Bias", - "score": 0.293047968208597 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.07, mean=0.107, max=0.138, sum=0.752 (7)", - "tab": "Bias", - "score": 0.1073937839039085 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.136, mean=0.353, max=0.455, sum=1.412 (4)", - "tab": "Summarization metrics", - "score": 0.35298687802144607 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.04, mean=4.635, max=4.834, sum=32.448 (7)", - "tab": "Summarization metrics", - "score": 4.635409033816104 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.303, mean=0.321, max=0.333, sum=1.283 (4)", - "tab": "Summarization metrics", - "score": 0.3206946902747002 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.904, mean=0.946, max=0.957, sum=6.625 (7)", - "tab": "Summarization metrics", - "score": 0.9464923911138073 - }, - "CNN/DailyMail - Density": { - "description": "min=13.275, mean=15.995, max=17.016, sum=111.962 (7)", - "tab": "Summarization metrics", - "score": 
15.994591776988235 - }, - "CNN/DailyMail - Compression": { - "description": "min=7.152, mean=8.818, max=9.675, sum=61.729 (7)", - "tab": "Summarization metrics", - "score": 8.818392473408851 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=0.993, mean=0.999, max=1, sum=6.993 (7)", - "tab": "Summarization metrics", - "score": 0.9990476190476191 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=4.333, mean=4.435, max=4.6, sum=31.044 (7)", - "tab": "Summarization metrics", - "score": 4.434920634920635 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=4, mean=4.371, max=5, sum=30.598 (7)", - "tab": "Summarization metrics", - "score": 4.3711111111111105 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144, - "details": { - "description": "min=0.087, mean=0.144, max=0.161, sum=1.006 (7)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.003, mean=1.026, max=1.088, sum=7.181 (7)", - "tab": "Efficiency", - "score": 1.0257979815553757 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3626 (7)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=0, mean=4.286, max=5, sum=30 (7)", - "tab": "General information", - "score": 4.285714285714286 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=388.402, mean=1350.402, max=1539.402, sum=9452.811 (7)", - "tab": "General information", - "score": 1350.4015444015445 - }, - "XSUM - # output tokens": { - "description": "min=27.776, mean=28.674, max=31.952, sum=200.716 (7)", - "tab": "General information", - "score": 28.673745173745175 - }, - "XSUM - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.667 (7)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.441, mean=0.457, max=0.48, sum=3.202 (7)", - "tab": "Bias", - "score": 0.45745150585486727 - }, - "XSUM - Representation (race)": { - "description": "min=0.376, mean=0.481, max=0.556, sum=3.37 (7)", - "tab": "Bias", - "score": 0.48149813295367977 - }, - "XSUM - Representation (gender)": { - "description": "min=0.19, mean=0.239, max=0.257, sum=1.672 (7)", - "tab": "Bias", - "score": 0.2388259605365298 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (7)", - "tab": "Toxicity", - "score": 0.0016547159404302263 - }, - "XSUM - SummaC": { - "description": "min=-0.288, mean=-0.273, max=-0.257, sum=-1.091 (4)", - "tab": "Summarization metrics", - "score": -0.2728636190391109 - }, - "XSUM - QAFactEval": { - "description": "min=2.795, mean=3.007, max=3.207, sum=21.05 (7)", - "tab": "Summarization metrics", - "score": 3.0071326818732076 - }, - "XSUM - BERTScore (F1)": { - 
"description": "min=0.366, mean=0.43, max=0.459, sum=1.718 (4)", - "tab": "Summarization metrics", - "score": 0.4296202005928721 - }, - "XSUM - Coverage": { - "description": "min=0.789, mean=0.801, max=0.833, sum=5.604 (7)", - "tab": "Summarization metrics", - "score": 0.8005553389114972 - }, - "XSUM - Density": { - "description": "min=2.471, mean=2.872, max=4.654, sum=20.107 (7)", - "tab": "Summarization metrics", - "score": 2.8724523474356 - }, - "XSUM - Compression": { - "description": "min=13.554, mean=14.07, max=14.306, sum=98.488 (7)", - "tab": "Summarization metrics", - "score": 14.069713395015288 - }, - "XSUM - HumanEval-faithfulness": { - "description": "min=0.762, mean=0.849, max=0.963, sum=5.941 (7)", - "tab": "Summarization metrics", - "score": 0.848692365835223 - }, - "XSUM - HumanEval-relevance": { - "description": "min=4.277, mean=4.41, max=4.63, sum=30.869 (7)", - "tab": "Summarization metrics", - "score": 4.40989417989418 - }, - "XSUM - HumanEval-coherence": { - "description": "min=4.403, mean=4.685, max=4.815, sum=32.795 (7)", - "tab": "Summarization metrics", - "score": 4.684981103552532 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.948, - "details": { - "description": "min=0.945, mean=0.948, max=0.953, sum=2.843 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.029, mean=0.031, max=0.033, sum=0.092 (3)", - "tab": "Calibration", - "score": 0.03076843904734194 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.898, mean=0.925, max=0.946, sum=2.776 (3)", - "tab": "Robustness", - "score": 0.9253333333333332 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.919, mean=0.934, max=0.945, sum=2.803 (3)", - "tab": "Fairness", - "score": 0.9343333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.218, mean=0.247, max=0.279, sum=0.741 (3)", - "tab": "Efficiency", - "score": 0.24716598621961808 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1282.797, mean=1897.464, max=2572.797, sum=5692.391 (3)", - "tab": "General information", - "score": 1897.4636666666665 - }, - "IMDB - # output tokens": { - "description": "min=0.999, mean=1.0, max=1, sum=2.999 (3)", - "tab": "General information", - "score": 0.9996666666666667 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 
matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.668, - "details": { - "description": "min=0.4, mean=0.668, max=0.876, sum=36.093 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.018, mean=0.183, max=0.424, sum=9.875 (54)", - "tab": "Calibration", - "score": 0.18286487616515196 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.287, mean=0.567, max=0.838, sum=30.64 (54)", - "tab": "Robustness", - "score": 0.5673997819699065 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.082, mean=0.463, max=0.851, sum=24.991 (54)", - "tab": "Fairness", - "score": 0.46278978149694866 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.174, mean=0.186, max=0.217, sum=10.038 (54)", - "tab": "Efficiency", - "score": 0.18589157378997984 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=0.967, mean=0.997, max=1, sum=53.855 (54)", - "tab": "General information", - "score": 0.9973133394349212 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.15, mean=0.733, max=0.975, sum=24.175 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.043, mean=0.212, max=0.586, sum=6.999 (33)", - "tab": "Calibration", - "score": 0.21210473630230625 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.666, max=0.975, sum=21.975 (33)", - "tab": "Robustness", - "score": 0.665909090909091 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.125, mean=0.671, max=0.975, sum=22.15 (33)", - "tab": "Fairness", - "score": 0.6712121212121211 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.195, mean=0.276, max=0.351, sum=9.119 (33)", - "tab": "Efficiency", - "score": 0.27634172535905943 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=2.025, mean=4.752, max=5, sum=156.8 (33)", - "tab": "General information", - "score": 4.751515151515152 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=1033.465, max=3591.4, sum=34104.35 (33)", - "tab": "General information", - "score": 1033.4651515151515 - }, - "RAFT - # output tokens": { - "description": "min=0.875, mean=3.057, max=6.85, sum=100.875 (33)", - "tab": "General information", - "score": 3.0568181818181817 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json b/data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json deleted file mode 100644 index 9ca831c0f..000000000 --- a/data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_text-davinci-003/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "text-davinci-003", - "id": "openai/text-davinci-003", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on 
average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.4065137447036923 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.9095617026651509 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.9027696441489546 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4087317179294733 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4974399057732391 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5263157894736842 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.569, - "details": { - "description": "min=0.28, mean=0.569, max=0.86, sum=8.532 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.127, mean=0.317, max=0.54, sum=4.761 (15)", - "tab": "Calibration", - "score": 0.31740378740673564 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.19, mean=0.517, max=0.84, sum=7.752 (15)", - "tab": "Robustness", - "score": 0.5167953216374268 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.24, mean=0.537, max=0.83, sum=8.054 (15)", - "tab": "Fairness", - "score": 0.5369590643274853 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.881, - "details": { - "description": "min=0.879, mean=0.881, max=0.883, sum=2.644 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.097, mean=0.098, max=0.099, sum=0.295 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.09835218401604591 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.851, mean=0.858, max=0.864, sum=2.573 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.8576666666666667 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.854, mean=0.858, max=0.861, sum=2.574 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.858 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1.036, mean=1.043, max=1.058, sum=3.13 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0433333333333332 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.703, mean=0.727, max=0.747, sum=2.181 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.346, mean=0.37, max=0.389, sum=1.111 (3)", - "tab": "Calibration", - "score": 0.3702182824812234 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.673, mean=0.694, max=0.713, sum=2.082 (3)", - "tab": "Robustness", - "score": 0.6939161040603179 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.643, mean=0.664, max=0.682, sum=1.993 (3)", - "tab": "Fairness", - "score": 0.6644210581739292 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.259, mean=4.532, max=4.955, sum=13.597 (3)", - "tab": "General information", - "score": 4.532394366197183 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3479.563, mean=3579.093, max=3633.659, sum=10737.279 (3)", - "tab": "General information", - "score": 3579.092957746479 - }, - "NarrativeQA - # output tokens": { - "description": "min=8.231, mean=9.164, max=9.732, sum=27.493 (3)", - "tab": "General information", - "score": 9.16431924882629 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.424, mean=0.442, max=0.464, sum=1.327 (3)", - "tab": "Bias", - "score": 0.44232989232989234 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.169, 
mean=0.177, max=0.187, sum=0.532 (3)", - "tab": "Bias", - "score": 0.17722658310007708 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.013, max=0.014, sum=0.039 (3)", - "tab": "Toxicity", - "score": 0.013145539906103287 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.768, mean=0.77, max=0.773, sum=2.311 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.27, mean=0.286, max=0.299, sum=0.857 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.28562303267045125 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.318, mean=0.323, max=0.331, sum=0.969 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.3230345144505907 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.36, mean=0.369, max=0.376, sum=1.106 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.36865975256659933 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.729, mean=0.73, max=0.733, sum=2.191 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.7304543451569532 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.349, mean=0.356, max=0.361, sum=1.069 (3)\n⚠ Brown et al.
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.3564629891973459 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.719, mean=0.721, max=0.725, sum=2.164 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.7213345530431851 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=7.074, mean=7.964, max=8.442, sum=23.891 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 7.963666666666666 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.874, mean=4.883, max=4.891, sum=14.65 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.883333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.02 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1315.257, mean=1520.977, max=1629.945, sum=4562.931 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1520.977 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.8, mean=6.937, max=7.011, sum=20.81 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 6.9366666666666665 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.452, mean=0.484, max=0.5, sum=1.452 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4841269841269842 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.292, mean=0.347, max=0.43, sum=1.042 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.34749417249417247 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.167, mean=0.27, max=0.367, sum=0.811 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.27037037037037037 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.429, mean=0.443, max=0.454, sum=1.328 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4428170082518513 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.375, mean=0.407, max=0.423, sum=1.221 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.407051282051282 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525, - "details": { - "description": "min=0.496, mean=0.525, max=0.54, sum=1.574 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.259, mean=0.27, max=0.279, sum=0.809 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.2696184343953211 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.401, mean=0.42, max=0.432, sum=1.26 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.4199382541834728 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.427, mean=0.45, max=0.465, sum=1.351 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.45040220156517236 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=2.978, mean=3.438, max=3.878, sum=10.315 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.438333333333333 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=2819.048, mean=3249.907, max=3487.39, sum=9749.722 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3249.907333333333 - }, - "QuAC - # output tokens": { - "description": "min=25.946, mean=27.199, max=28.821, sum=81.596 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 27.198666666666668 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.556, mean=0.582, max=0.606, sum=1.745 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5816498316498318 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.427, mean=0.428, max=0.43, sum=1.285 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4283515137656795 - }, - "QuAC - Representation (race)": { - "description": "min=0.321, mean=0.369, max=0.395, sum=1.106 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.368660072841299 - }, - "QuAC - Representation (gender)": { - "description": "min=0.244, mean=0.257, max=0.27, sum=0.772 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.2573013036656095 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=0.822 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.278, mean=0.278, max=0.278, sum=0.278 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.2781634038368795 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.798 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.729, mean=0.729, max=0.729, sum=0.729 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.729 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.646, - "details": { - "description": "min=0.646, mean=0.646, max=0.646, sum=0.646 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.216, mean=0.216, max=0.216, sum=0.216 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.21592533141452896 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.572, mean=0.572, max=0.572, sum=0.572 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.572 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.578, mean=0.578, max=0.578, sum=0.578 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.578 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al.
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593, - "details": { - "description": "min=0.558, mean=0.593, max=0.615, sum=1.78 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.329, mean=0.348, max=0.373, sum=1.043 (3)", - "tab": "Calibration", - "score": 0.3477434253470754 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.479, mean=0.516, max=0.54, sum=1.549 (3)", - "tab": "Robustness", - "score": 0.5163098878695208 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.448, mean=0.491, max=0.521, sum=1.474 (3)", - "tab": "Fairness", - "score": 0.491335372069317 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { -
"description": "min=0.611, mean=0.644, max=0.662, sum=1.931 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.292, mean=0.304, max=0.319, sum=0.911 (3)", - "tab": "Robustness", - "score": 0.3037781746031745 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.578, mean=0.616, max=0.645, sum=1.848 (3)", - "tab": "Robustness", - "score": 0.6160995919712035 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.322, mean=0.335, max=0.353, sum=1.005 (3)", - "tab": "Fairness", - "score": 0.33500119047619026 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.603, mean=0.633, max=0.652, sum=1.898 (3)", - "tab": "Fairness", - "score": 0.6326849780192724 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching 
runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.156, - "details": { - "description": "min=0.151, mean=0.156, max=0.16, sum=0.935 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=60.524, mean=64.315, max=67.878, sum=385.888 (6)", - "tab": "General information", - "score": 64.31473533619457 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.643, mean=0.646, max=0.652, sum=3.879 (6)", - "tab": "Bias", - "score": 0.6464418252138059 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.404, mean=0.414, max=0.427, sum=2.482 (6)", - "tab": "Bias", - "score": 0.41359496216384023 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.245, mean=0.274, max=0.29, sum=1.641 (6)", - "tab": "Bias", - "score": 0.2735791651454302 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.074, mean=0.083, max=0.099, sum=0.498 (6)", - "tab": "Bias", - "score": 0.08299026507382476 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.33, mean=0.359, max=0.403, sum=1.077 (3)", - "tab": "Summarization metrics", - "score": 0.35893042891379157 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no 
matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.336, mean=0.342, max=0.347, sum=1.026 (3)", - "tab": "Summarization metrics", - "score": 0.3420449797279243 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.953, mean=0.956, max=0.959, sum=5.734 (6)", - "tab": "Summarization metrics", - "score": 0.9556982855176755 - }, - "CNN/DailyMail - Density": { - "description": "min=7.169, mean=7.545, max=7.928, sum=45.269 (6)", - "tab": "Summarization metrics", - "score": 7.544859402012935 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.736, mean=9.389, max=10.065, sum=56.334 (6)", - "tab": "Summarization metrics", - "score": 9.389062386727216 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.124, - "details": { - "description": "min=0.122, mean=0.124, max=0.126, sum=0.744 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.735, max=1539.402, sum=9064.409 (6)", - "tab": "General information", - "score": 1510.734877734878 - }, - "XSUM - # output tokens": { - "description": "min=34.797, mean=35.293, max=36.073, sum=211.761 (6)", - "tab": "General information", - "score": 35.293436293436294 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.0 (6)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.413, mean=0.449, max=0.482, sum=2.694 (6)", - "tab": "Bias", - "score": 0.44896203413444785 - }, - "XSUM - Representation (race)": { - "description": "min=0.518, mean=0.534, max=0.545, sum=3.202 (6)", - "tab": "Bias", - "score": 0.533635827356637 - }, - "XSUM - Representation (gender)": { - "description": "min=0.234, mean=0.238, max=0.242, sum=1.427 (6)", - "tab": "Bias", - "score": 0.23788037651548422 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, 
max=0.004, sum=0.008 (6)", - "tab": "Toxicity", - "score": 0.001287001287001287 - }, - "XSUM - SummaC": { - "description": "min=-0.313, mean=-0.301, max=-0.289, sum=-0.902 (3)", - "tab": "Summarization metrics", - "score": -0.3005772048135215 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.406, mean=0.411, max=0.414, sum=1.233 (3)", - "tab": "Summarization metrics", - "score": 0.411029433026404 - }, - "XSUM - Coverage": { - "description": "min=0.814, mean=0.822, max=0.829, sum=4.933 (6)", - "tab": "Summarization metrics", - "score": 0.8221014569634312 - }, - "XSUM - Density": { - "description": "min=2.461, mean=2.63, max=2.752, sum=15.779 (6)", - "tab": "Summarization metrics", - "score": 2.6298820148802573 - }, - "XSUM - Compression": { - "description": "min=10.736, mean=10.932, max=11.034, sum=65.59 (6)", - "tab": "Summarization metrics", - "score": 10.931690583444237 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.828, mean=0.848, max=0.881, sum=2.545 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.086, mean=0.113, max=0.132, sum=0.339 (3)", - "tab": "Calibration", - "score": 0.11283562591578779 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.749, mean=0.779, max=0.827, sum=2.338 (3)", - "tab": "Robustness", - "score": 0.7793333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.814, mean=0.833, max=0.868, sum=2.498 (3)", - "tab": "Fairness", - "score": 0.8326666666666666 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1282.797, mean=1897.464, max=2572.797, sum=5692.391 (3)", - "tab": "General information", - "score": 1897.4636666666665 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB 
- Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.52, mean=0.684, max=0.863, sum=36.959 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.12, mean=0.292, max=0.449, sum=15.772 (54)", - "tab": "Calibration", - "score": 0.29207184855040197 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.366, mean=0.594, max=0.838, sum=32.08 (54)", - "tab": "Robustness", - "score": 0.5940672674614373 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.253, mean=0.559, max=0.863, sum=30.179 (54)", - "tab": "Fairness", - "score": 0.5588650073949972 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1.0, max=1.007, sum=54.007 (54)", - "tab": "General information", - "score": 1.0001279344975371 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - 
"source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.075, mean=0.759, max=0.95, sum=25.05 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.037, mean=0.203, max=0.736, sum=6.696 (33)", - "tab": "Calibration", - "score": 0.2029109351449743 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.05, mean=0.714, max=0.95, sum=23.55 (33)", - "tab": "Robustness", - "score": 0.7136363636363635 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.05, mean=0.705, max=0.95, sum=23.275 (33)", - "tab": "Fairness", - "score": 0.7053030303030302 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=2.025, mean=4.752, max=5, sum=156.8 (33)", - "tab": "General information", - "score": 4.751515151515152 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=1033.465, max=3591.4, sum=34104.35 (33)", - "tab": "General information", - "score": 1033.4651515151515 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=3.137, max=6.7, sum=103.525 (33)", - "tab": "General information", - "score": 3.1371212121212113 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json b/data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json deleted file mode 100644 index cf2a4b297..000000000 --- a/data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/stanford_Alpaca-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Alpaca 7B", - "id": "stanford/Alpaca-7B", - "developer": "stanford", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.381, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.3335337650323774 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.37923076923076926 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.3719114219114219 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4865162612605669 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6546037296037296 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.385, - "details": { - "description": "min=0.263, mean=0.385, max=0.6, sum=1.923 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.151, mean=0.234, max=0.32, sum=1.171 (5)", - "tab": "Calibration", - "score": 0.23428857555005617 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.18, mean=0.324, max=0.52, sum=1.621 (5)", - "tab": "Robustness", - "score": 0.32410526315789473 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.219, mean=0.346, max=0.53, sum=1.729 (5)", - "tab": "Fairness", - "score": 0.34585964912280703 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=0.778 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.343 (1)", - "tab": "Calibration", - "score": 0.3432802705941571 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.643, mean=0.643, max=0.643, sum=0.643 (1)", - "tab": "Robustness", - "score": 0.643 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.729, mean=0.729, max=0.729, sum=0.729 (1)", - "tab": "Fairness", - "score": 0.729 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=4.883, mean=4.883, max=4.883, sum=4.883 (1)", - "tab": "General information", - "score": 4.883 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396, - "details": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.396 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.046, mean=0.046, max=0.046, sum=0.046 (1)", - "tab": "Calibration", - "score": 0.045878175333070315 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.246, mean=0.246, max=0.246, sum=0.246 (1)", - "tab": "Robustness", - "score": 0.24590950452109447 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.299 (1)", - "tab": "Fairness", - "score": 
0.2987402817318288 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "General information", - "score": 1.4366197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1541.115, mean=1541.115, max=1541.115, sum=1541.115 (1)", - "tab": "General information", - "score": 1541.1154929577465 - }, - "NarrativeQA - # output tokens": { - "description": "min=26.006, mean=26.006, max=26.006, sum=26.006 (1)", - "tab": "General information", - "score": 26.005633802816902 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.41 (1)", - "tab": "Bias", - "score": 0.41025641025641024 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.196, mean=0.196, max=0.196, sum=0.196 (1)", - "tab": "Bias", - "score": 0.19627507163323785 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.006, mean=0.006, max=0.006, sum=0.006 (1)", - "tab": "Toxicity", - "score": 0.005633802816901409 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.592, - "details": { - "description": "min=0.592, mean=0.592, max=0.592, sum=0.592 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.134, mean=0.134, max=0.134, sum=0.134 (1)", - "tab": "Calibration", - "score": 0.13434354583448904 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.238 (1)", - "tab": "Calibration", - "score": 0.23769723451909555 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.203, mean=0.203, max=0.203, sum=0.203 (1)", - "tab": "Robustness", - "score": 0.20255716308011695 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.491 (1)", - "tab": "Robustness", - "score": 0.4912677371744195 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.21, mean=0.21, max=0.21, sum=0.21 (1)", - "tab": "Fairness", - "score": 0.20966482260352876 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": 
{ - "description": "min=0.53, mean=0.53, max=0.53, sum=0.53 (1)", - "tab": "Fairness", - "score": 0.5302078541276196 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=84.53, mean=84.53, max=84.53, sum=84.53 (1)", - "tab": "General information", - "score": 84.53 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.178, mean=1407.178, max=1407.178, sum=1407.178 (1)", - "tab": "General information", - "score": 1407.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=122.525, mean=122.525, max=122.525, sum=122.525 (1)", - "tab": "General information", - "score": 122.525 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.456 (1)", - "tab": "Bias", - "score": 0.45588235294117646 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)", - "tab": "Bias", - "score": 0.4117647058823529 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.054, mean=0.054, max=0.054, sum=0.054 (1)", - "tab": "Bias", - "score": 0.053571428571428575 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.419, mean=0.419, max=0.419, sum=0.419 (1)", - "tab": "Bias", - "score": 
0.4185185185185185 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.454, mean=0.454, max=0.454, sum=0.454 (1)", - "tab": "Bias", - "score": 0.4540682414698163 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.315 (1)", - "tab": "Bias", - "score": 0.31481481481481477 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.27 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.04, mean=0.04, max=0.04, sum=0.04 (1)", - "tab": "Calibration", - "score": 0.04026034301598206 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.16, mean=0.16, max=0.16, sum=0.16 (1)", - "tab": "Robustness", - "score": 0.1604861950978603 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.204 (1)", - "tab": "Fairness", - "score": 0.20395081036123316 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "General information", - "score": 0.507 - }, - "QuAC - truncated": { - "description": "min=0.06, mean=0.06, max=0.06, sum=0.06 (1)", - "tab": "General information", - "score": 0.06 - }, - "QuAC - # prompt tokens": { - "description": "min=1498.657, mean=1498.657, max=1498.657, sum=1498.657 (1)", - "tab": "General information", - "score": 1498.657 - }, - "QuAC - # output tokens": { - "description": "min=77.323, mean=77.323, max=77.323, sum=77.323 (1)", - "tab": "General information", - "score": 77.323 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.636, mean=0.636, max=0.636, sum=0.636 (1)", - "tab": "Bias", - "score": 0.6363636363636365 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.435 (1)", - "tab": "Bias", - "score": 0.4349771051252814 - }, - "QuAC - Representation (race)": { - "description": "min=0.236, mean=0.236, max=0.236, sum=0.236 (1)", - "tab": "Bias", - "score": 0.23589743589743586 - }, - "QuAC - Representation (gender)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 (1)", - "tab": "Bias", - "score": 0.2813953488372093 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, 
- "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - 
"score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.243, - "details": { - "description": "min=0.243, mean=0.243, max=0.243, sum=0.243 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.375 (1)", - "tab": "Calibration", - "score": 0.3750196178145884 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.199, mean=0.199, max=0.199, sum=0.199 (1)", - "tab": "Robustness", - "score": 0.19877675840978593 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.202, mean=0.202, max=0.202, sum=0.202 (1)", - "tab": "Fairness", - "score": 0.2018348623853211 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - 
# eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": 
"General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.738, - "details": { - "description": "min=0.738, mean=0.738, max=0.738, sum=0.738 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 (1)", - "tab": "Calibration", - "score": 0.28073357253102127 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.561, mean=0.561, max=0.561, sum=0.561 (1)", - "tab": "Robustness", - "score": 0.561 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.699, mean=0.699, max=0.699, sum=0.699 (1)", - "tab": "Fairness", - "score": 0.699 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.781, mean=2.781, max=2.781, sum=2.781 (1)", - "tab": "General information", - "score": 2.781 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": 
"min=1751.213, mean=1751.213, max=1751.213, sum=1751.213 (1)", - "tab": "General information", - "score": 1751.213 - }, - "IMDB - # output tokens": { - "description": "min=4.966, mean=4.966, max=4.966, sum=4.966 (1)", - "tab": "General information", - "score": 4.966 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.158, mean=0.566, max=0.939, sum=10.184 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.162, mean=0.352, max=0.606, sum=6.328 (18)", - "tab": "Calibration", - "score": 0.3515610942498128 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.133, mean=0.482, max=0.844, sum=8.674 (18)", - "tab": "Robustness", - "score": 0.4818807145268457 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.122, mean=0.483, max=0.818, sum=8.691 (18)", - "tab": "Fairness", - "score": 0.4828512879651531 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=2.746, mean=4.216, max=4.89, sum=75.887 (18)", - "tab": "General information", - "score": 4.2159316386124255 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - 
"description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486, - "details": { - "description": "min=0, mean=0.486, max=0.9, sum=5.35 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.004, mean=0.33, max=0.711, sum=3.626 (11)", - "tab": "Calibration", - "score": 0.3296795633615674 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.42, max=0.875, sum=4.625 (11)", - "tab": "Robustness", - "score": 0.42045454545454536 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.459, max=0.9, sum=5.05 (11)", - "tab": "Fairness", - "score": 0.45909090909090916 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.45, mean=4.552, max=5, sum=50.075 (11)", - "tab": "General information", - "score": 4.552272727272727 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=954.111, max=1882.1, sum=10495.225 (11)", - "tab": "General information", - "score": 954.1113636363635 - }, - "RAFT - # output tokens": { - "description": "min=3.7, mean=19.468, max=30, sum=214.15 (11)", - "tab": "General information", - "score": 19.468181818181815 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json b/data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json deleted file mode 100644 index 97f13c6d9..000000000 --- a/data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/tiiuae_Falcon-40B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - 
"source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon 40B", - "id": "tiiuae/Falcon-40B", - "developer": "tiiuae", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.729, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.7051048951048952 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6857342657342658 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.48586479674272687 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4706876456876457 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0.32, mean=0.509, max=0.79, sum=2.545 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.26, mean=0.457, max=0.76, sum=2.283 (5)", - "tab": "Robustness", - "score": 0.4566315789473684 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.272, mean=0.48, max=0.78, sum=2.402 (5)", - "tab": "Fairness", - "score": 0.4803859649122807 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)", - "tab": "General information", - "score": 500.12014035087725 - }, - "MMLU - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.819, - "details": { - "description": "min=0.819, mean=0.819, max=0.819, sum=0.819 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.763, mean=0.763, max=0.763, sum=0.763 (1)", - "tab": "Robustness", - "score": 0.763 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Fairness", - "score": 0.783 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1284.629, mean=1284.629, max=1284.629, sum=1284.629 (1)", - "tab": "General information", - "score": 1284.629 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.673, mean=0.673, max=0.673, sum=0.673 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - 
F1 (Robustness)": { - "description": "min=0.557, mean=0.557, max=0.557, sum=0.557 (1)", - "tab": "Robustness", - "score": 0.5574684493620005 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.559, mean=0.559, max=0.559, sum=0.559 (1)", - "tab": "Fairness", - "score": 0.5589601433703856 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.025, mean=2.025, max=2.025, sum=2.025 (1)", - "tab": "General information", - "score": 2.0253521126760563 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1694.082, mean=1694.082, max=1694.082, sum=1694.082 (1)", - "tab": "General information", - "score": 1694.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.398 (1)", - "tab": "Bias", - "score": 0.39814814814814814 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.191, mean=0.191, max=0.191, sum=0.191 (1)", - "tab": "Bias", - "score": 0.19148936170212763 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "Toxicity", - "score": 0.01971830985915493 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=0.675 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.329 (1)", - "tab": "Robustness", - "score": 0.32850713007659726 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.593, mean=0.593, max=0.593, sum=0.593 (1)", - "tab": "Robustness", - "score": 0.5930765119599164 - }, - "NaturalQuestions 
(closed-book) - F1 (Fairness)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.338 (1)", - "tab": "Fairness", - "score": 0.33840782877152153 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)", - "tab": "Fairness", - "score": 0.6251513417645462 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=124.246, mean=124.246, max=124.246, sum=124.246 (1)", - "tab": "General information", - "score": 124.246 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.599, mean=4.599, max=4.599, sum=4.599 (1)", - "tab": "General information", - "score": 4.599 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1587.334, mean=1587.334, max=1587.334, sum=1587.334 (1)", - "tab": "General information", - "score": 1587.334 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.995, mean=0.995, max=0.995, sum=0.995 (1)", - "tab": "General information", - "score": 0.995 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.256 (1)", - "tab": "Bias", - "score": 0.2556237218813906 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.107, mean=0.107, max=0.107, sum=0.107 (1)", - "tab": "Bias", - "score": 0.10714285714285715 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - 
"NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.443 (1)", - "tab": "Bias", - "score": 0.4428571428571429 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.382 (1)", - "tab": "Bias", - "score": 0.38245614035087716 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.132 (1)", - "tab": "Bias", - "score": 0.13157894736842105 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307, - "details": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.307 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.162, mean=0.162, max=0.162, sum=0.162 (1)", - "tab": "Robustness", - "score": 0.16237264946195393 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.256 (1)", - "tab": "Fairness", - "score": 0.25646510454177246 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "General information", - "score": 0.862 - }, - "QuAC - truncated": { - "description": "min=0.031, mean=0.031, max=0.031, sum=0.031 (1)", - "tab": "General information", - "score": 0.031 - }, - "QuAC - # prompt tokens": { - "description": "min=1667.28, mean=1667.28, max=1667.28, sum=1667.28 (1)", - "tab": "General information", - "score": 1667.28 - }, - "QuAC - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.468 (1)", - "tab": "Bias", - "score": 0.4681547619047619 - }, - "QuAC - Representation (race)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.423 (1)", - "tab": "Bias", - "score": 0.42342342342342343 - }, - "QuAC - Representation (gender)": { - "description": "min=0.141, mean=0.141, max=0.141, sum=0.141 (1)", - "tab": "Bias", - "score": 0.141304347826087 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, 
sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.353, - "details": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.303 (1)", - "tab": "Robustness", - "score": 0.30275229357798167 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.292 (1)", - "tab": "Fairness", - "score": 0.29204892966360857 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=507.503, mean=507.503, max=507.503, sum=507.503 (1)", - "tab": "General information", - "score": 507.50305810397555 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": 
"Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # 
eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.959, - "details": { - "description": "min=0.959, mean=0.959, max=0.959, sum=0.959 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.935, mean=0.935, max=0.935, sum=0.935 (1)", - "tab": "Robustness", - "score": 0.935 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=0.954 (1)", - "tab": "Fairness", - "score": 0.954 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.871, mean=2.871, max=2.871, sum=2.871 (1)", - "tab": "General information", - "score": 2.871 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt 
tokens": { - "description": "min=1666.079, mean=1666.079, max=1666.079, sum=1666.079 (1)", - "tab": "General information", - "score": 1666.079 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.552, - "details": { - "description": "min=0.098, mean=0.552, max=0.969, sum=9.936 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.037, mean=0.412, max=0.827, sum=7.414 (18)", - "tab": "Robustness", - "score": 0.4118677862671613 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.098, mean=0.292, max=0.594, sum=5.248 (18)", - "tab": "Fairness", - "score": 0.29157916197633543 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=367.585, mean=782.759, max=1312.924, sum=14089.663 (18)", - "tab": "General information", - "score": 782.7590374602355 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - 
"score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.2, mean=0.661, max=0.975, sum=7.275 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.586, max=0.975, sum=6.45 (11)", - "tab": "Robustness", - "score": 0.5863636363636363 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.15, mean=0.611, max=0.975, sum=6.725 (11)", - "tab": "Fairness", - "score": 0.6113636363636364 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.6, max=5, sum=50.6 (11)", - "tab": "General information", - "score": 4.6000000000000005 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=289.025, mean=877.464, max=1772.5, sum=9652.1 (11)", - "tab": "General information", - "score": 877.4636363636364 - }, - "RAFT - # output tokens": { - "description": "min=0.7, mean=0.973, max=1, sum=10.7 (11)", - "tab": "General information", - "score": 0.9727272727272727 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json b/data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json deleted file mode 100644 index 80c0ac18a..000000000 --- a/data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/tiiuae_Falcon-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - 
"source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon 7B", - "id": "tiiuae/Falcon-7B", - "developer": "tiiuae", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.4253379953379953 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.4469230769230769 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.35594420480554084 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5821678321678322 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.286, - "details": { - "description": "min=0.17, mean=0.286, max=0.39, sum=1.432 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.13, mean=0.236, max=0.37, sum=1.181 (5)", - "tab": "Robustness", - "score": 0.23610526315789473 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.15, mean=0.261, max=0.33, sum=1.303 (5)", - "tab": "Fairness", - "score": 0.26063157894736844 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)", - "tab": "General information", - "score": 500.12014035087725 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - 
"score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753, - "details": { - "description": "min=0.753, mean=0.753, max=0.753, sum=0.753 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=0.65 (1)", - "tab": "Robustness", - "score": 0.65 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=0.702 (1)", - "tab": "Fairness", - "score": 0.702 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1284.629, mean=1284.629, max=1284.629, sum=1284.629 (1)", - "tab": "General information", - "score": 1284.629 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621, - "details": { - "description": "min=0.621, mean=0.621, max=0.621, sum=0.621 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.436 (1)", - 
"tab": "Robustness", - "score": 0.4358401092976052 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=0.52 (1)", - "tab": "Fairness", - "score": 0.5199130399003071 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.025, mean=2.025, max=2.025, sum=2.025 (1)", - "tab": "General information", - "score": 2.0253521126760563 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1694.082, mean=1694.082, max=1694.082, sum=1694.082 (1)", - "tab": "General information", - "score": 1694.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.444 (1)", - "tab": "Bias", - "score": 0.4444444444444444 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.205 (1)", - "tab": "Bias", - "score": 0.2046979865771812 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.017, max=0.017, sum=0.017 (1)", - "tab": "Toxicity", - "score": 0.016901408450704224 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=0.579 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.185, mean=0.185, max=0.185, sum=0.185 (1)", - "tab": "Robustness", - "score": 0.18513134554094532 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.489 (1)", - "tab": "Robustness", - "score": 0.4889733445855735 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.233 (1)", - "tab": "Fairness", - "score": 0.2334955595363806 - 
}, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.537, mean=0.537, max=0.537, sum=0.537 (1)", - "tab": "Fairness", - "score": 0.536571121609654 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=124.246, mean=124.246, max=124.246, sum=124.246 (1)", - "tab": "General information", - "score": 124.246 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.599, mean=4.599, max=4.599, sum=4.599 (1)", - "tab": "General information", - "score": 4.599 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1587.334, mean=1587.334, max=1587.334, sum=1587.334 (1)", - "tab": "General information", - "score": 1587.334 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=0.994 (1)", - "tab": "General information", - "score": 0.994 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.389 (1)", - "tab": "Bias", - "score": 0.38888888888888884 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.476 (1)", - "tab": "Bias", - "score": 0.47619047619047616 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.14, mean=0.14, max=0.14, sum=0.14 (1)", - "tab": "Bias", - "score": 0.14 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.3333333333333333 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.553, 
mean=0.553, max=0.553, sum=0.553 (1)", - "tab": "Bias", - "score": 0.5528942115768464 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.275 (1)", - "tab": "Bias", - "score": 0.2745098039215687 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.332, - "details": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.164 (1)", - "tab": "Robustness", - "score": 0.16389145934637706 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.262, mean=0.262, max=0.262, sum=0.262 (1)", - "tab": "Fairness", - "score": 0.2622208848575014 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "General information", - "score": 0.862 - }, - "QuAC - truncated": { - "description": "min=0.031, mean=0.031, max=0.031, sum=0.031 (1)", - "tab": "General information", - "score": 0.031 - }, - "QuAC - # prompt tokens": { - "description": "min=1667.28, mean=1667.28, max=1667.28, sum=1667.28 (1)", - "tab": "General information", - "score": 1667.28 - }, - "QuAC - # output tokens": { - "description": "min=0.995, mean=0.995, max=0.995, sum=0.995 (1)", - "tab": "General information", - "score": 0.995 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)", - "tab": "Bias", - "score": 0.45680272108843534 - }, - "QuAC - Representation (race)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.402 (1)", - "tab": "Bias", - "score": 0.4022988505747127 - }, - "QuAC - Representation (gender)": { - "description": "min=0.247, mean=0.247, max=0.247, sum=0.247 (1)", - "tab": "Bias", - "score": 0.24695863746958635 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": 
"HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - 
"source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.234, - "details": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.234 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.205 (1)", - "tab": "Robustness", - "score": 0.20489296636085627 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.213, mean=0.213, max=0.213, sum=0.213 (1)", - "tab": "Fairness", - "score": 0.21253822629969418 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=507.503, mean=507.503, max=507.503, sum=507.503 (1)", - "tab": "General information", - "score": 507.50305810397555 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", 
- "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": 
null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=0.836 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.692, mean=0.692, max=0.692, sum=0.692 (1)", - "tab": "Robustness", - "score": 0.692 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.794, mean=0.794, max=0.794, sum=0.794 (1)", - "tab": "Fairness", - "score": 0.794 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.871, mean=2.871, max=2.871, sum=2.871 (1)", - "tab": "General information", - "score": 2.871 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1666.079, mean=1666.079, max=1666.079, sum=1666.079 (1)", - "tab": "General information", - "score": 1666.079 - }, - "IMDB - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514, - "details": { - "description": "min=0, mean=0.514, max=0.999, sum=9.257 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.485, max=0.999, sum=8.731 (18)", - "tab": "Robustness", - "score": 0.4850751828621894 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.494, max=0.999, sum=8.898 (18)", - "tab": "Fairness", - "score": 0.49430637095445207 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=367.585, mean=782.759, max=1312.924, sum=14089.663 (18)", - "tab": "General information", - "score": 782.7590374602355 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no 
matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602, - "details": { - "description": "min=0.15, mean=0.602, max=0.975, sum=6.625 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.516, max=0.975, sum=5.675 (11)", - "tab": "Robustness", - "score": 0.5159090909090908 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.15, mean=0.555, max=0.975, sum=6.1 (11)", - "tab": "Fairness", - "score": 0.5545454545454546 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.6, max=5, sum=50.6 (11)", - "tab": "General information", - "score": 4.6000000000000005 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=289.025, mean=877.464, max=1772.5, sum=9652.1 (11)", - "tab": "General information", - "score": 877.4636363636364 - }, - "RAFT - # output tokens": { - "description": "min=0.725, mean=0.975, max=1, sum=10.725 (11)", - "tab": "General information", - "score": 0.975 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json b/data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json deleted file mode 100644 index 4b7c6b681..000000000 --- a/data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-40B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon-Instruct 40B", - "id": "tiiuae/Falcon-Instruct-40B", - "developer": 
"tiiuae", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.7631002331002331 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7087645687645687 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4307003912490803 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.44994172494172496 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497, - "details": { - "description": "min=0.263, mean=0.497, max=0.82, sum=2.483 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.228, mean=0.446, max=0.78, sum=2.228 (5)", - "tab": "Robustness", - "score": 0.44561403508771924 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.219, mean=0.466, max=0.8, sum=2.329 (5)", - "tab": "Fairness", - "score": 0.4658596491228071 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)", - "tab": "General information", - "score": 500.12014035087725 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=0.829 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.781, mean=0.781, max=0.781, sum=0.781 (1)", - "tab": "Robustness", - "score": 0.781 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)", - "tab": "Fairness", - "score": 0.799 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1284.629, mean=1284.629, max=1284.629, sum=1284.629 (1)", - "tab": "General information", - "score": 1284.629 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.508, mean=0.508, max=0.508, sum=0.508 (1)", - "tab": "Robustness", - "score": 0.5082425698893845 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.543, mean=0.543, max=0.543, 
sum=0.543 (1)", - "tab": "Fairness", - "score": 0.543279669317833 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.025, mean=2.025, max=2.025, sum=2.025 (1)", - "tab": "General information", - "score": 2.0253521126760563 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1694.082, mean=1694.082, max=1694.082, sum=1694.082 (1)", - "tab": "General information", - "score": 1694.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)", - "tab": "Bias", - "score": 0.33194444444444443 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Bias", - "score": 0.4666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.175, mean=0.175, max=0.175, sum=0.175 (1)", - "tab": "Bias", - "score": 0.17464114832535887 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.011, max=0.011, sum=0.011 (1)", - "tab": "Toxicity", - "score": 0.011267605633802818 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.666, - "details": { - "description": "min=0.666, mean=0.666, max=0.666, sum=0.666 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.335 (1)", - "tab": "Robustness", - "score": 0.33514492181201283 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.591, mean=0.591, max=0.591, sum=0.591 (1)", - "tab": "Robustness", - "score": 0.5912781280483248 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.331 (1)", - "tab": "Fairness", - "score": 0.33094416222152356 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": 
"min=0.607, mean=0.607, max=0.607, sum=0.607 (1)", - "tab": "Fairness", - "score": 0.6067807528449897 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=124.246, mean=124.246, max=124.246, sum=124.246 (1)", - "tab": "General information", - "score": 124.246 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.599, mean=4.599, max=4.599, sum=4.599 (1)", - "tab": "General information", - "score": 4.599 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1587.334, mean=1587.334, max=1587.334, sum=1587.334 (1)", - "tab": "General information", - "score": 1587.334 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.995, mean=0.995, max=0.995, sum=0.995 (1)", - "tab": "General information", - "score": 0.995 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.415 (1)", - "tab": "Bias", - "score": 0.41463414634146334 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.155, mean=0.155, max=0.155, sum=0.155 (1)", - "tab": "Bias", - "score": 0.15517241379310343 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.42 (1)", - "tab": "Bias", - "score": 0.42000000000000004 - }, - "NaturalQuestions (open-book) - 
Representation (race)": { - "description": "min=0.552, mean=0.552, max=0.552, sum=0.552 (1)", - "tab": "Bias", - "score": 0.5516224188790559 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.195, mean=0.195, max=0.195, sum=0.195 (1)", - "tab": "Bias", - "score": 0.19491525423728814 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.371, - "details": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.371 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.212, mean=0.212, max=0.212, sum=0.212 (1)", - "tab": "Robustness", - "score": 0.21167117057056115 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.308 (1)", - "tab": "Fairness", - "score": 0.3078257563786361 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "General information", - "score": 0.862 - }, - "QuAC - truncated": { - "description": "min=0.031, mean=0.031, max=0.031, sum=0.031 (1)", - "tab": "General information", - "score": 0.031 - }, - "QuAC - # prompt tokens": { - "description": "min=1667.28, mean=1667.28, max=1667.28, sum=1667.28 (1)", - "tab": "General information", - "score": 1667.28 - }, - "QuAC - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.418 (1)", - "tab": "Bias", - "score": 0.4182641806722689 - }, - "QuAC - Representation (race)": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.476 (1)", - "tab": "Bias", - "score": 0.4756554307116105 - }, - "QuAC - Representation (gender)": { - "description": "min=0.214, mean=0.214, max=0.214, sum=0.214 (1)", - "tab": "Bias", - "score": 0.2142857142857143 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - 
"additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.384, - "details": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.384 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.338 (1)", - "tab": "Robustness", - "score": 0.3379204892966361 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.312 (1)", - "tab": "Fairness", - "score": 0.3119266055045872 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=507.503, mean=507.503, max=507.503, sum=507.503 (1)", - "tab": "General information", - "score": 507.50305810397555 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # 
train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.959, - "details": { - "description": "min=0.959, mean=0.959, max=0.959, sum=0.959 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)", - "tab": "Robustness", - "score": 0.938 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.957, mean=0.957, max=0.957, sum=0.957 (1)", - "tab": "Fairness", - "score": 0.957 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.871, mean=2.871, max=2.871, sum=2.871 (1)", - "tab": "General information", - "score": 2.871 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1666.079, mean=1666.079, max=1666.079, sum=1666.079 (1)", - "tab": "General 
information", - "score": 1666.079 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603, - "details": { - "description": "min=0.203, mean=0.603, max=0.918, sum=10.849 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.135, mean=0.523, max=0.864, sum=9.414 (18)", - "tab": "Robustness", - "score": 0.5230033316869794 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.16, mean=0.462, max=0.762, sum=8.312 (18)", - "tab": "Fairness", - "score": 0.4617550507789773 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=367.585, mean=782.759, max=1312.924, sum=14089.663 (18)", - "tab": "General information", - "score": 782.7590374602355 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - 
"score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586, - "details": { - "description": "min=0.175, mean=0.586, max=0.925, sum=6.45 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.523, max=0.875, sum=5.75 (11)", - "tab": "Robustness", - "score": 0.5227272727272726 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.175, mean=0.561, max=0.875, sum=6.175 (11)", - "tab": "Fairness", - "score": 0.5613636363636363 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.6, max=5, sum=50.6 (11)", - "tab": "General information", - "score": 4.6000000000000005 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=289.025, mean=877.464, max=1772.5, sum=9652.1 (11)", - "tab": "General information", - "score": 877.4636363636364 - }, - "RAFT - # output tokens": { - "description": "min=0.825, mean=0.984, max=1, sum=10.825 (11)", - "tab": "General information", - "score": 0.984090909090909 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json b/data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json deleted file mode 100644 index cd7efa818..000000000 --- a/data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - 
}, - "model_info": { - "name": "Falcon-Instruct 7B", - "id": "tiiuae/Falcon-Instruct-7B", - "developer": "tiiuae", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.244, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.3032867132867133 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.2968298368298368 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.514714004225644 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.29545454545454547 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275, - "details": { - "description": "min=0.21, mean=0.275, max=0.34, sum=1.374 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.2, mean=0.25, max=0.32, sum=1.248 (5)", - "tab": "Robustness", - "score": 0.24961403508771932 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.2, mean=0.261, max=0.32, sum=1.307 (5)", - "tab": "Fairness", - "score": 0.2613684210526316 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)", - "tab": "General information", - "score": 500.12014035087725 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, 
max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=0.72 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.593, mean=0.593, max=0.593, sum=0.593 (1)", - "tab": "Robustness", - "score": 0.593 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.637, mean=0.637, max=0.637, sum=0.637 (1)", - "tab": "Fairness", - "score": 0.637 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1284.629, mean=1284.629, max=1284.629, sum=1284.629 (1)", - "tab": "General information", - "score": 1284.629 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.476, - "details": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.476 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.258, mean=0.258, max=0.258, sum=0.258 (1)", - "tab": "Robustness", - "score": 0.2582769089885097 - }, - "NarrativeQA - 
F1 (Fairness)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.354 (1)", - "tab": "Fairness", - "score": 0.3536054591455644 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.025, mean=2.025, max=2.025, sum=2.025 (1)", - "tab": "General information", - "score": 2.0253521126760563 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1694.082, mean=1694.082, max=1694.082, sum=1694.082 (1)", - "tab": "General information", - "score": 1694.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.444 (1)", - "tab": "Bias", - "score": 0.4444444444444444 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.187, mean=0.187, max=0.187, sum=0.187 (1)", - "tab": "Bias", - "score": 0.1870229007633588 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.017, max=0.017, sum=0.017 (1)", - "tab": "Toxicity", - "score": 0.016901408450704224 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449, - "details": { - "description": "min=0.449, mean=0.449, max=0.449, sum=0.449 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.132 (1)", - "tab": "Robustness", - "score": 0.1322266230747346 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.327 (1)", - "tab": "Robustness", - "score": 0.32667933185026377 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.148, mean=0.148, max=0.148, sum=0.148 (1)", - "tab": "Fairness", - "score": 0.14824932914209746 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.383 (1)", - 
"tab": "Fairness", - "score": 0.38333017617065734 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=124.246, mean=124.246, max=124.246, sum=124.246 (1)", - "tab": "General information", - "score": 124.246 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.599, mean=4.599, max=4.599, sum=4.599 (1)", - "tab": "General information", - "score": 4.599 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1587.334, mean=1587.334, max=1587.334, sum=1587.334 (1)", - "tab": "General information", - "score": 1587.334 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.984, mean=0.984, max=0.984, sum=0.984 (1)", - "tab": "General information", - "score": 0.984 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)", - "tab": "Bias", - "score": 0.2716049382716049 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.071, mean=0.071, max=0.071, sum=0.071 (1)", - "tab": "Bias", - "score": 0.07142857142857142 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.426 (1)", - "tab": "Bias", - "score": 
0.4257907542579076 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.068, mean=0.068, max=0.068, sum=0.068 (1)", - "tab": "Bias", - "score": 0.0684931506849315 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.311, - "details": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.311 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.179, mean=0.179, max=0.179, sum=0.179 (1)", - "tab": "Robustness", - "score": 0.1789889679486199 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.219, mean=0.219, max=0.219, sum=0.219 (1)", - "tab": "Fairness", - "score": 0.21915649953692506 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "General information", - "score": 0.862 - }, - "QuAC - truncated": { - "description": "min=0.031, mean=0.031, max=0.031, sum=0.031 (1)", - "tab": "General information", - "score": 0.031 - }, - "QuAC - # prompt tokens": { - "description": "min=1667.28, mean=1667.28, max=1667.28, sum=1667.28 (1)", - "tab": "General information", - "score": 1667.28 - }, - "QuAC - # output tokens": { - "description": "min=0.997, mean=0.997, max=0.997, sum=0.997 (1)", - "tab": "General information", - "score": 0.997 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)", - "tab": "Bias", - "score": 0.625 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.456 (1)", - "tab": "Bias", - "score": 0.4561372269705603 - }, - "QuAC - Representation (race)": { - "description": "min=0.262, mean=0.262, max=0.262, sum=0.262 (1)", - "tab": "Bias", - "score": 0.26241134751773054 - }, - "QuAC - Representation (gender)": { - "description": "min=0.251, mean=0.251, max=0.251, sum=0.251 (1)", - "tab": "Bias", - "score": 0.25052854122621565 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.213, - "details": { - "description": "min=0.213, mean=0.213, max=0.213, sum=0.213 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.17, mean=0.17, max=0.17, sum=0.17 (1)", - "tab": "Robustness", - "score": 0.16972477064220184 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.183, mean=0.183, max=0.183, sum=0.183 (1)", - "tab": "Fairness", - "score": 0.1834862385321101 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=507.503, mean=507.503, max=507.503, sum=507.503 (1)", - "tab": "General information", - "score": 507.50305810397555 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": 
{ - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No 
matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=0.852 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.759, mean=0.759, max=0.759, sum=0.759 (1)", - "tab": "Robustness", - "score": 0.759 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Fairness", - "score": 0.811 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.871, mean=2.871, max=2.871, sum=2.871 (1)", - "tab": "General information", - "score": 2.871 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1666.079, mean=1666.079, max=1666.079, sum=1666.079 (1)", - "tab": "General information", - "score": 1666.079 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General 
information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511, - "details": { - "description": "min=0, mean=0.511, max=1, sum=9.199 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.487, max=0.999, sum=8.769 (18)", - "tab": "Robustness", - "score": 0.4871679045873981 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.502, max=1, sum=9.031 (18)", - "tab": "Fairness", - "score": 0.5017354752179064 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=367.585, mean=782.759, max=1312.924, sum=14089.663 (18)", - "tab": "General information", - "score": 782.7590374602355 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - 
}, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523, - "details": { - "description": "min=0.15, mean=0.523, max=0.975, sum=5.75 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.15, mean=0.445, max=0.975, sum=4.9 (11)", - "tab": "Robustness", - "score": 0.4454545454545454 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.15, mean=0.5, max=0.975, sum=5.5 (11)", - "tab": "Fairness", - "score": 0.5 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.6, max=5, sum=50.6 (11)", - "tab": "General information", - "score": 4.6000000000000005 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=289.025, mean=877.464, max=1772.5, sum=9652.1 (11)", - "tab": "General information", - "score": 877.4636363636364 - }, - "RAFT - # output tokens": { - "description": "min=0.95, mean=0.995, max=1, sum=10.95 (11)", - "tab": "General information", - "score": 0.9954545454545454 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json b/data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json deleted file mode 100644 index f25c83f2e..000000000 --- a/data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-Base 7B", - "id": "together/RedPajama-INCITE-Base-7B", - "developer": "together", - "inference_platform": 
"unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.40883441258094355 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.3311188811188811 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.3233799533799534 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.41358382155085455 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.1998834498834499 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302, - "details": { - "description": "min=0.228, mean=0.302, max=0.38, sum=1.508 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.08, mean=0.098, max=0.13, sum=0.49 (5)", - "tab": "Calibration", - "score": 0.09791468112621773 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.2, mean=0.25, max=0.33, sum=1.251 (5)", - "tab": "Robustness", - "score": 0.2501052631578947 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.219, mean=0.276, max=0.34, sum=1.379 (5)", - "tab": "Fairness", - "score": 0.275859649122807 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": 
{ - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "description": "min=0.713, mean=0.713, max=0.713, sum=0.713 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.127, mean=0.127, max=0.127, sum=0.127 (1)", - "tab": "Calibration", - "score": 0.1268200294718189 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.569, mean=0.569, max=0.569, sum=0.569 (1)", - "tab": "Robustness", - "score": 0.569 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=0.65 (1)", - "tab": "Fairness", - "score": 0.65 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.617, - "details": { - "description": "min=0.617, mean=0.617, max=0.617, sum=0.617 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)", - "tab": "Calibration", - "score": 0.27605359630786236 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.424 (1)", - "tab": "Robustness", - "score": 0.4240469400392869 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.524, mean=0.524, 
max=0.524, sum=0.524 (1)", - "tab": "Fairness", - "score": 0.5239003837979788 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)", - "tab": "Bias", - "score": 0.4375 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.171 (1)", - "tab": "Bias", - "score": 0.17123287671232879 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.014, max=0.014, sum=0.014 (1)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586, - "details": { - "description": "min=0.586, mean=0.586, max=0.586, sum=0.586 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.127, mean=0.127, max=0.127, sum=0.127 (1)", - "tab": "Calibration", - "score": 0.12699960693149975 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.396 (1)", - "tab": "Calibration", - "score": 0.39598996118757757 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.167 (1)", - "tab": "Robustness", - "score": 0.1665503977180178 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.472 (1)", - "tab": "Robustness", - "score": 0.47226706838923 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.193, mean=0.193, max=0.193, sum=0.193 (1)", - "tab": "Fairness", - "score": 0.19300226376410895 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.514, 
mean=0.514, max=0.514, sum=0.514 (1)", - "tab": "Fairness", - "score": 0.5136843159783826 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.498, mean=0.498, max=0.498, sum=0.498 (1)", - "tab": "Bias", - "score": 0.49783549783549785 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.289 (1)", - "tab": "Bias", - "score": 0.2894736842105263 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.408 (1)", - "tab": "Bias", - "score": 0.4081597222222222 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)", - "tab": "Bias", - 
"score": 0.4124293785310734 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.256 (1)", - "tab": "Bias", - "score": 0.25630252100840334 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336, - "details": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.336 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.131, mean=0.131, max=0.131, sum=0.131 (1)", - "tab": "Calibration", - "score": 0.13131742636553145 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.186, mean=0.186, max=0.186, sum=0.186 (1)", - "tab": "Robustness", - "score": 0.18577129287689287 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.238 (1)", - "tab": "Fairness", - "score": 0.23848247289290064 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.382 (1)", - "tab": "Bias", - "score": 0.38163008049881736 - }, - "QuAC - Representation (race)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.38 (1)", - "tab": "Bias", - "score": 0.3802816901408451 - }, - "QuAC - Representation (gender)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.249 (1)", - "tab": "Bias", - "score": 0.24864864864864863 - }, - "QuAC - Toxic fraction": { - "description": "min=0.005, mean=0.005, max=0.005, sum=0.005 (1)", - "tab": "Toxicity", - "score": 0.005 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", 
- "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - 
"dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.205, - "details": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.205 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.063, mean=0.063, max=0.063, sum=0.063 (1)", - "tab": "Calibration", - "score": 0.06284277332135296 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.173, mean=0.173, max=0.173, sum=0.173 (1)", - "tab": "Robustness", - "score": 0.172782874617737 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.17, mean=0.17, max=0.17, sum=0.17 (1)", - "tab": "Fairness", - "score": 0.16972477064220184 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", 
- "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": 
null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.206, mean=0.206, max=0.206, sum=0.206 (1)", - "tab": "Calibration", - "score": 0.20649886073889429 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.56, mean=0.56, max=0.56, sum=0.56 (1)", - "tab": "Robustness", - "score": 0.56 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.694, mean=0.694, max=0.694, sum=0.694 (1)", - "tab": "Fairness", - "score": 0.694 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 - }, - "IMDB - # output tokens": { - 
"description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547, - "details": { - "description": "min=0.064, mean=0.547, max=0.954, sum=9.838 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.109, mean=0.305, max=0.471, sum=5.486 (18)", - "tab": "Calibration", - "score": 0.3047575712176879 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.054, mean=0.401, max=0.835, sum=7.221 (18)", - "tab": "Robustness", - "score": 0.4011569280490217 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.06, mean=0.431, max=0.811, sum=7.756 (18)", - "tab": "Fairness", - "score": 0.43087088541137863 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - 
"description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.648, - "details": { - "description": "min=0.3, mean=0.648, max=0.925, sum=7.125 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.3, mean=0.648, max=0.925, sum=7.123 (11)", - "tab": "Calibration", - "score": 0.6475429539256364 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.489, max=0.925, sum=5.375 (11)", - "tab": "Robustness", - "score": 0.48863636363636365 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.275, mean=0.595, max=0.925, sum=6.55 (11)", - "tab": "Fairness", - "score": 0.5954545454545455 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=330 (11)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json b/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json deleted file mode 100644 index d4d85552c..000000000 --- a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-v1-3B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "RedPajama-INCITE-Base-v1 3B", - "id": "together/RedPajama-INCITE-Base-v1-3B", - "developer": "together", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.311, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.4387141535615171 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.293006993006993 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.26995337995338 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4599624127215427 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.7068181818181818 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.263, - "details": { - "description": "min=0.24, mean=0.263, max=0.3, sum=1.314 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.082, mean=0.115, max=0.149, sum=0.575 (5)", - "tab": "Calibration", - "score": 0.11506526711032969 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.184, mean=0.217, max=0.29, sum=1.084 (5)", - "tab": "Robustness", - "score": 0.2168421052631579 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.2, mean=0.232, max=0.29, sum=1.161 (5)", - "tab": "Fairness", - "score": 0.23210526315789473 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - 
"MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.685, - "details": { - "description": "min=0.685, mean=0.685, max=0.685, sum=0.685 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.187, mean=0.187, max=0.187, sum=0.187 (1)", - "tab": "Calibration", - "score": 0.1865846445420437 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=0.585 (1)", - "tab": "Robustness", - "score": 0.585 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)", - "tab": "Fairness", - "score": 0.624 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555, - "details": { - "description": "min=0.555, mean=0.555, max=0.555, sum=0.555 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.234 (1)", - "tab": "Calibration", - "score": 0.2338003327407993 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.346, mean=0.346, 
max=0.346, sum=0.346 (1)", - "tab": "Robustness", - "score": 0.3460535146763825 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.42 (1)", - "tab": "Fairness", - "score": 0.42019517663794076 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.463 (1)", - "tab": "Bias", - "score": 0.4629629629629629 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.167 (1)", - "tab": "Bias", - "score": 0.16666666666666666 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.008, max=0.008, sum=0.008 (1)", - "tab": "Toxicity", - "score": 0.008450704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=0.52 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.116 (1)", - "tab": "Calibration", - "score": 0.1159999973291356 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.345 (1)", - "tab": "Calibration", - "score": 0.34498406074093657 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.134, mean=0.134, max=0.134, sum=0.134 (1)", - "tab": "Robustness", - "score": 0.1341635313992508 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.396 (1)", - "tab": "Robustness", - "score": 0.3964044537010397 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.145, mean=0.145, 
max=0.145, sum=0.145 (1)", - "tab": "Fairness", - "score": 0.14546689822682907 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.452 (1)", - "tab": "Fairness", - "score": 0.4521647378074364 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=299.738, mean=299.738, max=299.738, sum=299.738 (1)", - "tab": "General information", - "score": 299.738 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.46 (1)", - "tab": "Bias", - "score": 0.4597701149425287 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.182, mean=0.182, max=0.182, sum=0.182 (1)", - "tab": "Bias", - "score": 0.18181818181818182 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.464, mean=0.464, max=0.464, 
sum=0.464 (1)", - "tab": "Bias", - "score": 0.4642857142857143 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.453 (1)", - "tab": "Bias", - "score": 0.45299145299145294 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.242 (1)", - "tab": "Bias", - "score": 0.24223602484472045 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.309, - "details": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.309 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.078, mean=0.078, max=0.078, sum=0.078 (1)", - "tab": "Calibration", - "score": 0.07775925403447285 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.177, mean=0.177, max=0.177, sum=0.177 (1)", - "tab": "Robustness", - "score": 0.17735561911839576 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.238 (1)", - "tab": "Fairness", - "score": 0.23753496056157644 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.575, mean=0.575, max=0.575, sum=0.575 (1)", - "tab": "Bias", - "score": 0.575 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.389 (1)", - "tab": "Bias", - "score": 0.38936550778656037 - }, - "QuAC - Representation (race)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)", - "tab": "Bias", - "score": 0.3003300330033003 - }, - "QuAC - Representation (gender)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.269 (1)", - "tab": "Bias", - "score": 0.268640350877193 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - 
} - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.277, - "details": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.277 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.048, mean=0.048, max=0.048, sum=0.048 (1)", - "tab": "Calibration", - "score": 0.04833037892853392 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.226, mean=0.226, max=0.226, sum=0.226 (1)", - "tab": "Robustness", - "score": 0.22629969418960244 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.248 (1)", - "tab": "Fairness", - "score": 0.24770642201834864 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS 
MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching 
runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - 
"tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=0.907 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.248 (1)", - "tab": "Calibration", - "score": 0.24822902119068743 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.843, mean=0.843, max=0.843, sum=0.843 (1)", - "tab": "Robustness", - "score": 0.843 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.89, mean=0.89, max=0.89, sum=0.89 (1)", - "tab": "Fairness", - "score": 0.89 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - 
"description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549, - "details": { - "description": "min=0.013, mean=0.549, max=0.996, sum=9.877 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.105, mean=0.303, max=0.532, sum=5.455 (18)", - "tab": "Calibration", - "score": 0.3030711579633833 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.336, max=0.996, sum=6.045 (18)", - "tab": "Robustness", - "score": 0.3358431190860201 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.393, max=0.996, sum=7.082 (18)", - "tab": "Fairness", - "score": 0.39345093425226885 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - 
"score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "description": "min=0.225, mean=0.502, max=0.975, sum=5.525 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.225, mean=0.502, max=0.975, sum=5.524 (11)", - "tab": "Calibration", - "score": 0.5021656428017803 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.427, max=0.975, sum=4.7 (11)", - "tab": "Robustness", - "score": 0.4272727272727273 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.175, mean=0.475, max=0.975, sum=5.225 (11)", - "tab": "Fairness", - "score": 0.47500000000000003 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=330 (11)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json deleted file mode 100644 index 9d60f7506..000000000 --- a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - 
"source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-Instruct 7B", - "id": "together/RedPajama-INCITE-Instruct-7B", - "developer": "together", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.38751156336725257 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.4953146853146853 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.46615384615384614 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.33794748465968927 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.29364801864801865 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363, - "details": { - "description": "min=0.246, mean=0.363, max=0.52, sum=1.816 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.092, mean=0.143, max=0.182, sum=0.715 (5)", - "tab": "Calibration", - "score": 0.14292977551638825 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.175, mean=0.291, max=0.46, sum=1.455 (5)", - "tab": "Robustness", - "score": 0.2910877192982456 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.167, mean=0.305, max=0.48, sum=1.527 (5)", - "tab": "Fairness", - "score": 0.30533333333333335 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": 
"General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.705, - "details": { - "description": "min=0.705, mean=0.705, max=0.705, sum=0.705 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.035, mean=0.035, max=0.035, sum=0.035 (1)", - "tab": "Calibration", - "score": 0.034644312737608846 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.599, mean=0.599, max=0.599, sum=0.599 (1)", - "tab": "Robustness", - "score": 0.599 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)", - "tab": "Fairness", - "score": 0.616 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638, - "details": { - "description": "min=0.638, mean=0.638, max=0.638, sum=0.638 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - 
"description": "min=0.247, mean=0.247, max=0.247, sum=0.247 (1)", - "tab": "Calibration", - "score": 0.24703559378209236 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.482 (1)", - "tab": "Robustness", - "score": 0.4816661888359549 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=0.506 (1)", - "tab": "Fairness", - "score": 0.5062845788047843 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.193, mean=0.193, max=0.193, sum=0.193 (1)", - "tab": "Bias", - "score": 0.19318181818181815 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.025, mean=0.025, max=0.025, sum=0.025 (1)", - "tab": "Toxicity", - "score": 0.02535211267605634 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.659, mean=0.659, max=0.659, sum=0.659 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.142, mean=0.142, max=0.142, sum=0.142 (1)", - "tab": "Calibration", - "score": 0.14200000000000002 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.466 (1)", - "tab": "Calibration", - "score": 0.4659999973351183 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.137 (1)", - "tab": "Robustness", - "score": 0.13717330495393032 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.547, mean=0.547, max=0.547, sum=0.547 (1)", - "tab": "Robustness", 
- "score": 0.5468327185577326 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.164 (1)", - "tab": "Fairness", - "score": 0.16419040044922398 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.592, mean=0.592, max=0.592, sum=0.592 (1)", - "tab": "Fairness", - "score": 0.5920301139461878 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.406 (1)", - "tab": "Bias", - "score": 0.4061624649859944 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Bias", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 
0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.524, mean=0.524, max=0.524, sum=0.524 (1)", - "tab": "Bias", - "score": 0.5238095238095237 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 (1)", - "tab": "Bias", - "score": 0.28125 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.26 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.074, mean=0.074, max=0.074, sum=0.074 (1)", - "tab": "Calibration", - "score": 0.07389119661461117 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.164 (1)", - "tab": "Robustness", - "score": 0.16438450644529176 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.181, mean=0.181, max=0.181, sum=0.181 (1)", - "tab": "Fairness", - "score": 0.18079535886869938 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)", - "tab": "Bias", - "score": 0.6296296296296297 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.445 (1)", - "tab": "Bias", - "score": 0.4446840232318048 - }, - "QuAC - Representation (race)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.33333333333333337 - }, - "QuAC - Representation (gender)": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.242 (1)", - "tab": "Bias", - "score": 0.24226804123711343 - }, - 
"QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "Toxicity", - "score": 0.003 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - 
"tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.243, - "details": { - "description": "min=0.243, mean=0.243, max=0.243, sum=0.243 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.232, mean=0.232, max=0.232, sum=0.232 (1)", - "tab": "Calibration", - "score": 0.23215642305686054 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.197, mean=0.197, max=0.197, sum=0.197 (1)", - "tab": "Robustness", - "score": 0.19724770642201836 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.183, mean=0.183, max=0.183, sum=0.183 (1)", - "tab": "Fairness", - "score": 0.1834862385321101 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS 
MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - 
"description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No 
matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.927, mean=0.927, max=0.927, sum=0.927 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.159, mean=0.159, max=0.159, sum=0.159 (1)", - "tab": "Calibration", - "score": 0.15862422483580252 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.82, mean=0.82, max=0.82, sum=0.82 (1)", - "tab": "Robustness", - "score": 0.82 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.907, mean=0.907, max=0.907, sum=0.907 (1)", - "tab": "Fairness", - "score": 0.907 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664, - "details": { - "description": "min=0.487, mean=0.664, max=0.77, sum=11.961 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.035, mean=0.102, max=0.234, sum=1.831 (18)", - "tab": "Calibration", - "score": 0.10174488153691034 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.277, mean=0.527, max=0.77, sum=9.491 (18)", - "tab": "Robustness", - "score": 0.5272697486345442 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.25, mean=0.54, max=0.743, sum=9.724 (18)", - "tab": "Fairness", - "score": 0.5401968527212513 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": 
"(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.695, - "details": { - "description": "min=0.175, mean=0.695, max=0.925, sum=7.65 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.175, mean=0.695, max=0.925, sum=7.647 (11)", - "tab": "Calibration", - "score": 0.69518288885631 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.175, mean=0.605, max=0.9, sum=6.65 (11)", - "tab": "Robustness", - "score": 0.6045454545454546 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.175, mean=0.67, max=0.875, sum=7.375 (11)", - "tab": "Fairness", - "score": 0.6704545454545454 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=330 (11)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json deleted file mode 100644 index 57ffafd39..000000000 --- a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-v1-3B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-Instruct-v1 3B", - "id": "together/RedPajama-INCITE-Instruct-v1-3B", - "developer": "together", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.37183163737280295 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.3874825174825175 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.3690909090909091 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.18974591969523494 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6051282051282051 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.257, - "details": { - "description": "min=0.22, mean=0.257, max=0.29, sum=1.287 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.09, mean=0.124, max=0.157, sum=0.619 (5)", - "tab": "Calibration", - "score": 0.1238999810101579 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.18, mean=0.218, max=0.23, sum=1.089 (5)", - "tab": "Robustness", - "score": 0.21785964912280703 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.18, mean=0.222, max=0.27, sum=1.111 (5)", - "tab": "Fairness", - "score": 0.22210526315789475 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General 
information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "description": "min=0.677, mean=0.677, max=0.677, sum=0.677 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.141, mean=0.141, max=0.141, sum=0.141 (1)", - "tab": "Calibration", - "score": 0.14082220350962116 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.629, mean=0.629, max=0.629, sum=0.629 (1)", - "tab": "Robustness", - "score": 0.629 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.648, mean=0.648, max=0.648, sum=0.648 (1)", - "tab": "Fairness", - "score": 0.648 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638, - 
"details": { - "description": "min=0.638, mean=0.638, max=0.638, sum=0.638 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.254 (1)", - "tab": "Calibration", - "score": 0.25351615672342864 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.403 (1)", - "tab": "Robustness", - "score": 0.4034697604028265 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=0.506 (1)", - "tab": "Fairness", - "score": 0.5060331991298288 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.184, mean=0.184, max=0.184, sum=0.184 (1)", - "tab": "Bias", - "score": 0.18354430379746836 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.014, max=0.014, sum=0.014 (1)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "description": "min=0.637, mean=0.637, max=0.637, sum=0.637 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.12, mean=0.12, max=0.12, sum=0.12 (1)", - "tab": "Calibration", - "score": 0.12000000000000001 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.454, mean=0.454, max=0.454, sum=0.454 (1)", - "tab": "Calibration", - "score": 0.4539999913132661 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.132 (1)", - "tab": 
"Robustness", - "score": 0.13162030419976034 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.536, mean=0.536, max=0.536, sum=0.536 (1)", - "tab": "Robustness", - "score": 0.5356772534642628 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.143, mean=0.143, max=0.143, sum=0.143 (1)", - "tab": "Fairness", - "score": 0.1431948167839223 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.571, mean=0.571, max=0.571, sum=0.571 (1)", - "tab": "Fairness", - "score": 0.57068667733919 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Bias", - "score": 0.4666666666666666 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.278, mean=0.278, max=0.278, sum=0.278 (1)", - "tab": "Bias", - "score": 
0.2777777777777778 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=0.566 (1)", - "tab": "Bias", - "score": 0.5660749506903353 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.324 (1)", - "tab": "Bias", - "score": 0.32352941176470584 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.259, - "details": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.259 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.1, mean=0.1, max=0.1, sum=0.1 (1)", - "tab": "Calibration", - "score": 0.09989902749544036 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.137 (1)", - "tab": "Robustness", - "score": 0.1368222933188553 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.183, mean=0.183, max=0.183, sum=0.183 (1)", - "tab": "Fairness", - "score": 0.18270531445590665 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.439 (1)", - "tab": "Bias", - "score": 0.4393162393162393 - }, - "QuAC - Representation (race)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.34 (1)", - "tab": "Bias", - "score": 
0.33993399339933994 - }, - "QuAC - Representation (gender)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)", - "tab": "Bias", - "score": 0.28532608695652173 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.208, - "details": { - "description": "min=0.208, mean=0.208, max=0.208, sum=0.208 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.097, mean=0.097, max=0.097, sum=0.097 (1)", - "tab": "Calibration", - "score": 0.09733177984986514 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.173, mean=0.173, max=0.173, sum=0.173 (1)", - "tab": "Robustness", - "score": 0.172782874617737 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.179, mean=0.179, max=0.179, sum=0.179 (1)", - "tab": "Fairness", - "score": 0.17889908256880735 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - 
"score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - 
}, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=0.894 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.04, mean=0.04, max=0.04, sum=0.04 (1)", - "tab": "Calibration", - "score": 0.04045821313550608 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.852, mean=0.852, max=0.852, sum=0.852 (1)", - "tab": "Robustness", - "score": 0.852 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.876, mean=0.876, max=0.876, sum=0.876 (1)", - "tab": "Fairness", - "score": 0.876 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # 
train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549, - "details": { - "description": "min=0.028, mean=0.549, max=0.997, sum=9.891 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.09, mean=0.383, max=0.8, sum=6.9 (18)", - "tab": "Calibration", - "score": 0.3833406193329736 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.506, max=0.993, sum=9.105 (18)", - "tab": "Robustness", - "score": 0.5058374710841333 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.499, max=0.985, sum=8.983 (18)", - "tab": "Fairness", - "score": 0.4990473523687277 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - 
Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.2, mean=0.661, max=0.975, sum=7.275 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.2, mean=0.661, max=0.975, sum=7.274 (11)", - "tab": "Calibration", - "score": 0.6612967467806994 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.075, mean=0.548, max=0.95, sum=6.025 (11)", - "tab": "Robustness", - "score": 0.5477272727272727 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.175, mean=0.632, max=0.975, sum=6.95 (11)", - "tab": "Fairness", - "score": 0.631818181818182 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=330 (11)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json b/data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json deleted file mode 100644 index fe1ab40e2..000000000 --- 
a/data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/writer_InstructPalmyra-30B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "InstructPalmyra 30B", - "id": "writer/InstructPalmyra-30B", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.568, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5224242424242425 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.5379254079254079 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.47136458620459815 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5811383061383062 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403, - "details": { - "description": "min=0.23, mean=0.403, max=0.7, sum=6.041 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.14, mean=0.348, max=0.65, sum=5.223 (15)", - "tab": "Robustness", - "score": 0.34819883040935673 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.19, mean=0.371, max=0.66, sum=5.572 (15)", - "tab": "Fairness", - "score": 0.3714502923976608 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { 
- "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.698, mean=0.751, max=0.798, sum=2.254 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.564, mean=0.656, max=0.719, sum=1.967 (3)", - "tab": "Robustness", - "score": 0.6556666666666667 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.636, mean=0.7, max=0.762, sum=2.099 (3)", - "tab": "Fairness", - "score": 0.6996666666666668 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.496, - "details": { - "description": "min=0.253, mean=0.496, max=0.636, sum=1.489 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.091, mean=0.317, max=0.444, sum=0.952 (3)", - "tab": "Robustness", - "score": 0.3173185298582432 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.18, mean=0.405, max=0.538, sum=1.214 (3)", - "tab": "Fairness", - "score": 0.40467419690737483 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.646, max=2.085, sum=4.938 (3)", - "tab": "General information", - "score": 1.6460093896713615 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1600.366, mean=1651.848, max=1705.003, sum=4955.544 (3)", - "tab": "General information", - "score": 1651.8478873239437 - }, - "NarrativeQA - # output tokens": { - "description": "min=1.93, mean=5.347, max=7.079, sum=16.042 (3)", - "tab": "General information", - "score": 5.347417840375587 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.381, mean=0.445, max=0.5, sum=1.335 (3)", - "tab": "Bias", - "score": 0.44516594516594515 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.444, max=0.667, sum=1.333 (3)", - "tab": "Bias", - "score": 0.4444444444444445 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.164, mean=0.196, max=0.241, sum=0.588 (3)", - "tab": "Bias", - "score": 0.1960646593836042 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.003, mean=0.012, max=0.017, sum=0.037 (3)", - "tab": "Toxicity", - "score": 0.01220657276995305 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.678, mean=0.682, max=0.688, sum=2.046 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - 
"description": "min=0.257, mean=0.267, max=0.272, sum=0.8 (3)", - "tab": "Robustness", - "score": 0.2667976861519438 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.52, mean=0.567, max=0.61, sum=1.701 (3)", - "tab": "Robustness", - "score": 0.5669828313348768 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.272, mean=0.276, max=0.282, sum=0.829 (3)", - "tab": "Fairness", - "score": 0.276181640672073 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.621, mean=0.63, max=0.639, sum=1.891 (3)", - "tab": "Fairness", - "score": 0.6303513019528806 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.935, mean=4.247, max=4.675, sum=12.74 (3)", - "tab": "General information", - "score": 4.246666666666667 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.723, sum=14.072 (3)", - "tab": "General information", - "score": 4.690666666666666 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.056, mean=1419.328, max=1523.222, sum=4257.983 (3)", - "tab": "General information", - "score": 1419.3276666666668 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.778, mean=7.657, max=8.266, sum=22.97 (3)", - "tab": "General information", - "score": 7.656666666666666 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.498, mean=0.525, max=0.55, sum=1.576 (3)", - "tab": "Bias", - "score": 
0.5252747252747252 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.088, mean=0.134, max=0.206, sum=0.401 (3)", - "tab": "Bias", - "score": 0.13375350140056022 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.3, mean=0.392, max=0.443, sum=1.176 (3)", - "tab": "Bias", - "score": 0.39206349206349206 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.45, mean=0.49, max=0.533, sum=1.47 (3)", - "tab": "Bias", - "score": 0.4899991188650981 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.327, mean=0.384, max=0.422, sum=1.152 (3)", - "tab": "Bias", - "score": 0.3838592033738646 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.433, - "details": { - "description": "min=0.423, mean=0.433, max=0.447, sum=1.3 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.23, mean=0.248, max=0.258, sum=0.743 (3)", - "tab": "Robustness", - "score": 0.24761534139298128 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.328, mean=0.337, max=0.353, sum=1.011 (3)", - "tab": "Fairness", - "score": 0.3370729442565461 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.084, sum=2.831 (3)", - "tab": "General information", - "score": 0.9436666666666667 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1624.371, mean=1644.436, max=1670.589, sum=4933.308 (3)", - "tab": "General information", - "score": 1644.436 - }, - "QuAC - # output tokens": { - "description": "min=18.652, mean=22.969, max=26.445, sum=68.907 (3)", - "tab": "General information", - "score": 22.969000000000005 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.571, mean=0.582, max=0.59, sum=1.745 (3)", - "tab": "Bias", - "score": 0.5815018315018315 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.413, mean=0.431, max=0.463, sum=1.292 (3)", - "tab": "Bias", - "score": 
0.43052581120508293 - }, - "QuAC - Representation (race)": { - "description": "min=0.317, mean=0.337, max=0.368, sum=1.012 (3)", - "tab": "Bias", - "score": 0.33749135321526574 - }, - "QuAC - Representation (gender)": { - "description": "min=0.224, mean=0.236, max=0.243, sum=0.707 (3)", - "tab": "Bias", - "score": 0.2355073330063574 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.185, - "details": { - "description": "min=0.18, mean=0.185, max=0.19, sum=0.555 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.148, mean=0.151, max=0.154, sum=0.454 (3)", - "tab": "Robustness", - "score": 0.1513761467889908 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.147, mean=0.152, max=0.157, sum=0.456 (3)", - "tab": "Fairness", - "score": 0.15188583078491336 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=0.998, mean=0.999, max=1, sum=2.997 (3)", - "tab": "General information", - "score": 0.998980632008155 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - 
RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.152, - "details": { - "description": "min=0.142, mean=0.152, max=0.165, sum=0.455 (3)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=1398 (3)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=4649.758 (3)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=74.511, mean=83.965, max=95.704, sum=251.895 (3)", - "tab": "General information", - "score": 83.96494992846924 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.619, mean=0.638, max=0.651, sum=1.914 (3)", - "tab": "Bias", - "score": 0.638095238095238 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.344, mean=0.371, max=0.398, sum=1.112 (3)", - "tab": "Bias", - "score": 0.3705770935558364 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.199, mean=0.258, max=0.288, sum=0.773 (3)", - "tab": "Bias", - "score": 0.2575629817009127 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.108, mean=0.117, max=0.129, sum=0.351 (3)", - "tab": "Bias", - "score": 0.11691353772442492 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.006, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002145922746781116 - }, - "CNN/DailyMail - SummaC": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.961, mean=0.972, max=0.979, sum=2.915 (3)", - "tab": "Summarization metrics", - "score": 0.9716859203819838 - }, - "CNN/DailyMail - Density": { - "description": "min=22.383, mean=28.97, max=38.633, sum=86.91 (3)", - "tab": "Summarization metrics", - "score": 28.97014469233496 - }, - "CNN/DailyMail - Compression": { - "description": "min=6.723, mean=7.901, max=9.103, sum=23.703 (3)", - "tab": "Summarization metrics", - "score": 7.901010404629208 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "1 matching runs, but no matching 
metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.104, - "details": { - "description": "min=0.1, mean=0.104, max=0.106, sum=0.312 (3)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=1554 (3)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=14.996 (3)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=4531.255 (3)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=26.207, mean=26.632, max=27.241, sum=79.896 (3)", - "tab": "General information", - "score": 26.631917631917634 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.436, mean=0.459, max=0.489, sum=1.376 (3)", - "tab": "Bias", - "score": 0.45852730200556285 - }, - "XSUM - Representation (race)": { - "description": "min=0.532, mean=0.59, max=0.667, sum=1.771 (3)", - "tab": "Bias", - "score": 0.5901750807411186 - }, - "XSUM - Representation (gender)": { - "description": "min=0.17, mean=0.187, max=0.207, sum=0.562 (3)", - "tab": "Bias", - "score": 0.18720575071822934 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006435006435006435 - }, - "XSUM - SummaC": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "min=0.836, mean=0.844, max=0.853, sum=2.531 (3)", - "tab": "Summarization metrics", - "score": 0.8437121246995759 - }, - "XSUM - Density": { - "description": "min=3.292, mean=3.441, max=3.518, sum=10.323 (3)", - "tab": "Summarization metrics", - "score": 3.4410181202034944 - }, - "XSUM - Compression": { - "description": "min=15.467, mean=15.707, max=15.837, sum=47.122 (3)", - "tab": "Summarization metrics", - "score": 15.707173220790708 - }, 
- "XSUM - HumanEval-faithfulness": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.936, mean=0.94, max=0.946, sum=2.821 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.898, mean=0.906, max=0.916, sum=2.718 (3)", - "tab": "Robustness", - "score": 0.906 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.925, mean=0.931, max=0.94, sum=2.793 (3)", - "tab": "Fairness", - "score": 0.931 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=0.995, mean=0.997, max=0.999, sum=2.992 (3)", - "tab": "General information", - "score": 0.9973333333333333 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555, - "details": { - "description": "min=0, mean=0.555, max=0.877, sum=29.976 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.443, max=0.774, sum=23.937 (54)", - "tab": "Robustness", - "score": 0.4432801514699601 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.449, max=0.871, sum=24.239 (54)", - "tab": "Fairness", - "score": 0.44887663628250224 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=0, mean=0.905, max=1, sum=48.891 (54)", - "tab": "General information", - "score": 0.9053814074087929 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.275, mean=0.652, max=0.95, sum=21.5 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.05, mean=0.518, max=0.95, sum=17.1 (33)", - "tab": "Robustness", - "score": 0.5181818181818182 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.25, mean=0.618, max=0.925, sum=20.4 (33)", - "tab": "Fairness", - "score": 0.6181818181818182 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", 
- "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=2.967, max=6.15, sum=97.925 (33)", - "tab": "General information", - "score": 2.9674242424242423 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json b/data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json deleted file mode 100644 index 61a019ad2..000000000 --- a/data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/yandex_YaLM-100B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "YaLM 100B", - "id": "yandex/YaLM-100B", - "developer": "yandex", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.075, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.40175763182238666 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.20536130536130537 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.16727272727272727 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.2658333333333333 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 
0.37929404953000706 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.24189051689051688 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.04536340852130326 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.243, - "details": { - "description": "min=0.2, mean=0.243, max=0.28, sum=3.651 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.619, mean=0.708, max=0.769, sum=10.615 (15)", - "tab": "Calibration", - "score": 0.7076962372990694 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.2, mean=0.243, max=0.28, sum=3.651 (15)", - "tab": "Robustness", - "score": 0.2433684210526316 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.2, mean=0.243, max=0.28, sum=3.651 (15)", - "tab": "Fairness", - "score": 0.2433684210526316 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.09, mean=0.143, max=0.217, sum=2.144 (15)", - "tab": "Efficiency", - "score": 0.14296402070471761 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=354.96, mean=453.383, max=580.833, sum=6800.74 (15)", - "tab": "General information", - "score": 453.38266666666664 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.631, mean=0.634, max=0.64, sum=1.902 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.114, mean=0.147, max=0.167, sum=0.442 (3)", - "tab": "Calibration", - "score": 0.14717484078898194 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.437, mean=0.566, max=0.631, sum=1.698 (3)", - "tab": "Robustness", - "score": 0.566 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.486, mean=0.583, max=0.631, sum=1.748 (3)", - "tab": "Fairness", - "score": 0.5826666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.546, mean=0.828, max=1.136, sum=2.485 (3)", - "tab": "Efficiency", - 
"score": 0.8282727491158176 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=649.339, mean=899.006, max=1233.339, sum=2697.017 (3)", - "tab": "General information", - "score": 899.0056666666666 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.252, - "details": { - "description": "min=0.213, mean=0.252, max=0.297, sum=0.756 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.029, mean=0.06, max=0.101, sum=0.179 (3)", - "tab": "Calibration", - "score": 0.05960283323299867 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.078, mean=0.088, max=0.096, sum=0.264 (3)", - "tab": "Robustness", - "score": 0.08788676556219112 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.131, mean=0.146, max=0.169, sum=0.437 (3)", - "tab": "Fairness", - "score": 0.14573784149261218 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=2.158, mean=2.314, max=2.397, sum=6.943 (3)", - "tab": "Efficiency", - "score": 2.314193915889056 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.028, mean=1.604, max=2.008, sum=4.811 (3)", - "tab": "General information", - "score": 1.603755868544601 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1603.569, mean=1644.878, max=1690.352, sum=4934.634 (3)", - "tab": "General information", - "score": 1644.8779342723003 - }, - "NarrativeQA - # output tokens": { - "description": "min=94.115, mean=96.018, max=98.566, sum=288.054 (3)", - "tab": "General information", - "score": 96.01784037558686 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 
(3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.434, mean=0.449, max=0.478, sum=1.347 (3)", - "tab": "Bias", - "score": 0.449065994913171 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.429, mean=0.568, max=0.667, sum=1.703 (3)", - "tab": "Bias", - "score": 0.5676937441643325 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.127, mean=0.177, max=0.216, sum=0.53 (3)", - "tab": "Bias", - "score": 0.17681914997964296 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.017, max=0.02, sum=0.051 (3)", - "tab": "Toxicity", - "score": 0.016901408450704227 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.227, - "details": { - "description": "min=0.197, mean=0.227, max=0.258, sum=0.68 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.019, mean=0.02, max=0.02, sum=0.059 (3)", - "tab": "Calibration", - "score": 0.019790335675494927 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.069, mean=0.086, max=0.12, sum=0.259 (3)", - "tab": "Calibration", - "score": 0.08637064333353452 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.045, mean=0.047, max=0.05, sum=0.14 (3)", - "tab": "Robustness", - "score": 0.04678550801735826 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.111, mean=0.125, max=0.146, sum=0.375 (3)", - "tab": "Robustness", - "score": 0.12496123369617401 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.051, mean=0.052, max=0.053, sum=0.155 (3)", - "tab": "Fairness", - "score": 0.0516362934670568 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.15, mean=0.177, max=0.207, sum=0.53 (3)", - "tab": "Fairness", - "score": 0.1768275232054711 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=2.669, mean=2.722, max=2.827, sum=8.167 (3)", - "tab": "Efficiency", - "score": 2.7221932611479644 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=4.373, mean=4.463, max=4.531, sum=13.389 (3)", - "tab": "Efficiency", - "score": 4.463013303365339 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": 
"min=108.201, mean=111.534, max=117.201, sum=334.603 (3)", - "tab": "General information", - "score": 111.53433333333332 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=298.545, mean=299.515, max=300, sum=898.545 (3)", - "tab": "General information", - "score": 299.51500000000004 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.669, mean=4.702, max=4.738, sum=14.107 (3)", - "tab": "General information", - "score": 4.702333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.114 (3)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1218.159, mean=1409.24, max=1510.891, sum=4227.721 (3)", - "tab": "General information", - "score": 1409.2403333333332 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=289.149, mean=291.572, max=293.886, sum=874.715 (3)", - "tab": "General information", - "score": 291.57166666666666 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.433, mean=0.478, max=0.5, sum=1.433 (3)", - "tab": "Bias", - "score": 0.4776758409785933 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.324, mean=0.327, max=0.33, sum=0.982 (3)", - "tab": "Bias", - "score": 0.3274145329078469 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.014, mean=0.168, max=0.277, sum=0.504 (3)", - "tab": "Bias", - "score": 0.16816448651008897 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.204, mean=0.385, max=0.523, sum=1.154 (3)", - "tab": "Bias", - "score": 0.38473904949347787 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.102, mean=0.175, max=0.25, sum=0.526 (3)", - "tab": "Bias", - "score": 0.17544176986611967 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.007, mean=0.008, max=0.009, sum=0.024 (3)", - "tab": "Toxicity", - "score": 0.008 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.009 (3)", - "tab": "Toxicity", - "score": 0.0030000000000000005 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.162, - "details": { - "description": "min=0.156, mean=0.162, max=0.172, sum=0.485 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.012, mean=0.029, max=0.039, sum=0.087 (3)", - "tab": "Calibration", - "score": 0.028959032200530792 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.077, mean=0.08, max=0.082, sum=0.239 (3)", - "tab": "Robustness", - "score": 0.0795025876916194 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.092, mean=0.1, max=0.108, sum=0.301 (3)", - "tab": "Fairness", - "score": 0.10047785618783804 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=2.259, mean=2.278, max=2.297, sum=6.834 (3)", - "tab": "Efficiency", - "score": 2.278147567048529 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.841, mean=0.951, max=1.111, sum=2.853 (3)", - "tab": "General information", - "score": 0.951 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1630.348, mean=1646.729, max=1667.958, sum=4940.188 (3)", - "tab": "General information", - "score": 1646.7293333333334 - }, - "QuAC - # output tokens": { - "description": "min=99.146, mean=99.146, max=99.146, sum=297.438 (3)", - "tab": "General information", - "score": 99.146 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.44, mean=0.454, max=0.465, sum=1.363 (3)", - "tab": "Bias", - "score": 0.4543925551127126 - }, - "QuAC - Representation (race)": { - "description": "min=0.312, mean=0.465, max=0.582, sum=1.396 (3)", - "tab": "Bias", - "score": 0.4653480174056855 - }, - "QuAC - Representation (gender)": { - "description": "min=0.335, mean=0.343, max=0.358, sum=1.029 (3)", - "tab": "Bias", - "score": 0.3431307584494557 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.003, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM 
(Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.202, - "details": { - "description": "min=0.197, mean=0.202, max=0.203, sum=0.807 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.621, mean=0.679, max=0.751, sum=2.716 (4)", - "tab": "Calibration", - "score": 0.6789622806094777 - }, - "TruthfulQA - EM 
(Robustness)": { - "description": "min=0.197, mean=0.202, max=0.203, sum=0.807 (4)", - "tab": "Robustness", - "score": 0.2018348623853211 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.197, mean=0.202, max=0.203, sum=0.807 (4)", - "tab": "Fairness", - "score": 0.2018348623853211 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.058, mean=0.092, max=0.136, sum=0.37 (4)", - "tab": "Efficiency", - "score": 0.09243018414244196 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=85.664, mean=405.414, max=531.664, sum=1621.654 (4)", - "tab": "General information", - "score": 405.41360856269114 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.017, - "details": { - "description": "min=0.009, mean=0.017, max=0.022, sum=0.103 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=2.334, mean=2.346, max=2.352, sum=14.074 (6)", - "tab": "Efficiency", - "score": 2.3457143735281405 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1536.099, mean=1544.765, max=1562.099, sum=9268.592 (6)", - "tab": "General information", - "score": 1544.7653791130188 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=90.71, mean=102.407, max=108.32, sum=614.442 (6)", - "tab": "General information", - "score": 102.40701001430614 - }, - 
"CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.406, mean=0.42, max=0.438, sum=2.518 (6)", - "tab": "Bias", - "score": 0.4196869049681346 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.429, mean=0.588, max=0.667, sum=3.525 (6)", - "tab": "Bias", - "score": 0.5875706214689266 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.171, mean=0.206, max=0.237, sum=1.238 (6)", - "tab": "Bias", - "score": 0.20635612913269732 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.35, mean=-0.322, max=-0.296, sum=-0.965 (3)", - "tab": "Summarization metrics", - "score": -0.3217409663792838 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=-0.154, mean=-0.145, max=-0.127, sum=-0.435 (3)", - "tab": "Summarization metrics", - "score": -0.14496527560996572 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.406, mean=0.541, max=0.615, sum=3.249 (6)", - "tab": "Summarization metrics", - "score": 0.5414806522156069 - }, - "CNN/DailyMail - Density": { - "description": "min=0.681, mean=1.09, max=1.303, sum=6.541 (6)", - "tab": "Summarization metrics", - "score": 1.0902141864760964 - }, - "CNN/DailyMail - Compression": { - "description": "min=6.289, mean=6.936, max=8.148, sum=41.615 (6)", - "tab": "Summarization metrics", - "score": 6.935882429972025 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.021, - "details": { - "description": "min=0.019, mean=0.021, max=0.022, sum=0.124 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.653, mean=1.671, max=1.681, sum=10.028 (6)", - "tab": "Efficiency", - "score": 1.6713877910966286 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1452.164, mean=1507.497, max=1536.164, sum=9044.985 (6)", - "tab": "General information", - "score": 1507.497425997426 - }, - "XSUM - # output tokens": { - "description": "min=46.541, mean=49.401, max=51.544, sum=296.405 (6)", - "tab": "General information", - "score": 49.4009009009009 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.434, mean=0.442, max=0.456, sum=2.652 (6)", - "tab": "Bias", - "score": 0.4419820754826329 - }, - "XSUM - Representation (race)": { - "description": "min=0.333, mean=0.501, max=0.595, sum=3.009 (6)", - "tab": "Bias", - "score": 0.5014430014430014 - }, - "XSUM - Representation (gender)": { - "description": "min=0.209, mean=0.248, max=0.286, sum=1.485 (6)", - "tab": "Bias", - "score": 0.24754799603959324 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.352, mean=-0.347, max=-0.344, sum=-1.04 (3)", - "tab": "Summarization metrics", - "score": -0.3466731809697447 - }, - "XSUM - QAFactEval": { - "description": "min=0.856, mean=1.176, max=1.555, sum=7.058 (6)", - "tab": "Summarization metrics", - "score": 1.1763058409064706 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.007, mean=0.031, max=0.057, sum=0.093 (3)", - "tab": "Summarization metrics", - "score": 0.031129963643441894 - }, - "XSUM - Coverage": { - "description": "min=0.557, mean=0.567, max=0.574, sum=3.405 (6)", - "tab": "Summarization metrics", - "score": 0.5674251187038739 - }, - "XSUM - Density": { - "description": "min=1.005, mean=1.041, max=1.081, sum=6.248 (6)", - "tab": "Summarization metrics", - "score": 1.0413571284332044 - }, - "XSUM - Compression": { - "description": "min=9.397, mean=9.951, max=10.96, sum=59.706 (6)", - "tab": "Summarization metrics", - "score": 9.951019350255967 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.776, mean=0.836, max=0.876, sum=2.509 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.369, mean=0.418, max=0.496, sum=1.255 (3)", - "tab": "Calibration", - "score": 0.41834259640752514 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.578, mean=0.719, max=0.79, 
sum=2.158 (3)", - "tab": "Robustness", - "score": 0.7193333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.709, mean=0.8, max=0.853, sum=2.4 (3)", - "tab": "Fairness", - "score": 0.7999999999999999 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=1.076, mean=1.137, max=1.23, sum=3.41 (3)", - "tab": "Efficiency", - "score": 1.1365543731623833 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.845, mean=4.929, max=4.982, sum=14.788 (3)", - "tab": "General information", - "score": 4.929333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1161.789, mean=1402.276, max=1747.837, sum=4206.828 (3)", - "tab": "General information", - "score": 1402.2759999999998 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0, mean=0.49, max=1, sum=26.448 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.108, mean=0.437, max=0.784, sum=23.581 (54)", - "tab": "Calibration", - "score": 0.43669079652569004 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.463, max=1, sum=25.008 (54)", - "tab": "Robustness", - "score": 0.4631081891632545 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.456, max=0.998, sum=24.603 (54)", - "tab": "Fairness", - "score": 0.4556089334763174 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.291, mean=0.41, max=0.737, sum=22.139 (54)", - "tab": "Efficiency", - "score": 0.4099806397254133 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=365, mean=729.671, max=1285.924, sum=39402.252 (54)", - "tab": "General information", - "score": 729.6713289334527 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395, - "details": { - "description": "min=0, mean=0.395, max=0.975, sum=13.05 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.011, mean=0.278, max=0.881, sum=9.176 (33)", - "tab": "Calibration", - "score": 0.2780574023642052 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.211, max=0.65, sum=6.975 (33)", - "tab": "Robustness", - "score": 0.21136363636363636 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.342, max=0.975, sum=11.3 (33)", - "tab": "Fairness", - "score": 0.3424242424242424 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.132, mean=0.89, max=1.838, sum=29.385 (33)", - "tab": "Efficiency", - "score": 0.8904544346562409 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.562, max=5, sum=150.55 (33)", - "tab": "General information", - "score": 4.5621212121212125 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=255.875, mean=784.961, max=1758.075, sum=25903.725 (33)", - "tab": "General information", - "score": 784.9613636363637 - }, - "RAFT - # output tokens": { - "description": "min=5, mean=13.615, max=30, sum=449.3 (33)", - "tab": "General information", - "score": 13.615151515151515 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation 
(gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json b/data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json deleted file mode 100644 index 04bdfa490..000000000 --- a/data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/zhipu-ai_GLM-130B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GLM 130B", - "id": "zhipu-ai/GLM-130B", - "developer": "zhipu-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6523126734505088 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.6465501165501165 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.5133566433566433 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.1511111111111111 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.45074793034678545 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.3347137430470764 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.4714285714285714 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344, - "details": { - "description": "min=0.23, mean=0.344, max=0.47, sum=5.16 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.075, mean=0.128, max=0.196, sum=1.914 (15)", - "tab": "Calibration", - "score": 0.12760096192658882 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.17, mean=0.32, max=0.44, sum=4.806 (15)", - "tab": "Robustness", - "score": 0.3203859649122807 - 
}, - "MMLU - EM (Fairness)": { - "description": "min=0.22, mean=0.315, max=0.43, sum=4.723 (15)", - "tab": "Fairness", - "score": 0.3148771929824561 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.194, mean=0.335, max=0.546, sum=5.029 (15)", - "tab": "Efficiency", - "score": 0.33523606010994367 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=354.52, mean=460.637, max=611.877, sum=6909.562 (15)", - "tab": "General information", - "score": 460.63743859649117 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.784, - "details": { - "description": "min=0.729, mean=0.784, max=0.819, sum=2.351 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.111, mean=0.171, max=0.205, sum=0.513 (3)", - "tab": "Calibration", - "score": 0.1710477879835662 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.68, mean=0.728, max=0.758, sum=2.183 (3)", - "tab": "Robustness", - "score": 0.7276666666666668 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.625, mean=0.69, max=0.722, sum=2.069 (3)", - "tab": "Fairness", - "score": 0.6896666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.942, mean=1.191, max=1.332, sum=3.574 (3)", - "tab": "Efficiency", - "score": 1.1913305165274586 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=679.091, mean=931.424, max=1276.091, sum=2794.273 (3)", - "tab": "General information", - "score": 931.4243333333333 - }, - "BoolQ - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - 
Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.655, mean=0.706, max=0.736, sum=2.118 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.027, mean=0.037, max=0.058, sum=0.112 (3)", - "tab": "Calibration", - "score": 0.03732324115716399 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.531, mean=0.629, max=0.682, sum=1.888 (3)", - "tab": "Robustness", - "score": 0.6293880948208791 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.55, mean=0.615, max=0.656, sum=1.846 (3)", - "tab": "Fairness", - "score": 0.6154230898629193 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=1.78, mean=2.315, max=3.197, sum=6.946 (3)", - "tab": "Efficiency", - "score": 2.3151894005635367 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.101, mean=1.675, max=2.11, sum=5.025 (3)", - "tab": "General information", - "score": 1.6751173708920186 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1597.372, mean=1658.811, max=1711.876, sum=4976.434 (3)", - "tab": "General information", - "score": 1658.8112676056337 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.008, mean=9.939, max=17.439, sum=29.817 (3)", - "tab": "General information", - "score": 9.938967136150234 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.365, mean=0.372, max=0.375, sum=1.115 (3)", - "tab": "Bias", - "score": 0.3717948717948718 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.168, mean=0.19, max=0.215, sum=0.569 (3)", - "tab": "Bias", - "score": 0.1896318370894642 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.012, max=0.014, sum=0.037 (3)", - "tab": "Toxicity", - "score": 0.012206572769953052 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - 
"metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642, - "details": { - "description": "min=0.639, mean=0.642, max=0.649, sum=1.927 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.02, mean=0.022, max=0.023, sum=0.065 (3)", - "tab": "Calibration", - "score": 0.021760896948719733 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.071, mean=0.076, max=0.082, sum=0.228 (3)", - "tab": "Calibration", - "score": 0.07592608066404687 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.11, mean=0.117, max=0.122, sum=0.35 (3)", - "tab": "Robustness", - "score": 0.11665134142344884 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.592, mean=0.6, max=0.608, sum=1.8 (3)", - "tab": "Robustness", - "score": 0.5998399895408899 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.112, mean=0.12, max=0.124, sum=0.361 (3)", - "tab": "Fairness", - "score": 0.12026039507733897 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.592, mean=0.597, max=0.603, sum=1.79 (3)", - "tab": "Fairness", - "score": 0.5967933879081116 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.822, mean=0.953, max=1.045, sum=2.859 (3)", - "tab": "Efficiency", - "score": 0.9528701016867446 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=2.251, mean=2.369, max=2.58, sum=7.108 (3)", - "tab": "Efficiency", - "score": 2.3693331199589207 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=121.658, mean=122.991, max=125.658, sum=368.974 (3)", - "tab": "General information", - "score": 122.99133333333333 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.22, mean=6.707, max=7.262, sum=20.12 (3)", - "tab": "General information", - "score": 6.706666666666667 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.505, mean=4.631, max=4.705, sum=13.892 (3)", - "tab": "General information", - "score": 4.630666666666667 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.042, mean=0.047, max=0.056, sum=0.14 (3)", - "tab": "General information", - "score": 0.04666666666666667 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1340.319, mean=1502.677, max=1625.084, sum=4508.03 (3)", - "tab": "General information", - "score": 1502.676666666667 - }, - "NaturalQuestions 
(open-book) - # output tokens": { - "description": "min=19.342, mean=21.064, max=23.914, sum=63.193 (3)", - "tab": "General information", - "score": 21.064333333333334 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.121, mean=0.269, max=0.393, sum=0.807 (3)", - "tab": "Bias", - "score": 0.2689924681892553 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.038, mean=0.059, max=0.083, sum=0.177 (3)", - "tab": "Bias", - "score": 0.05911680911680913 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.571, mean=0.585, max=0.598, sum=1.754 (3)", - "tab": "Bias", - "score": 0.584615044473471 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.068, mean=0.073, max=0.079, sum=0.22 (3)", - "tab": "Bias", - "score": 0.07328275644065117 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.002, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.272, - "details": { - "description": "min=0.23, mean=0.272, max=0.297, sum=0.815 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.012, mean=0.027, max=0.043, sum=0.082 (3)", - "tab": "Calibration", - "score": 0.02731272826999052 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.178, mean=0.193, max=0.202, sum=0.579 (3)", - "tab": "Robustness", - "score": 0.19293634470384977 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.173, mean=0.205, max=0.225, sum=0.616 (3)", - "tab": "Fairness", - "score": 0.20535008777735797 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=4.186, mean=4.219, max=4.235, sum=12.656 (3)", - "tab": "Efficiency", - "score": 4.218568385192325 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.823, mean=0.874, max=0.929, sum=2.622 (3)", - "tab": "General information", - "score": 0.874 - }, - "QuAC - truncated": { - 
"description": "min=0.094, mean=0.134, max=0.177, sum=0.401 (3)", - "tab": "General information", - "score": 0.13366666666666668 - }, - "QuAC - # prompt tokens": { - "description": "min=1621.422, mean=1651.972, max=1668.212, sum=4955.915 (3)", - "tab": "General information", - "score": 1651.9716666666666 - }, - "QuAC - # output tokens": { - "description": "min=65.116, mean=73.565, max=88.524, sum=220.696 (3)", - "tab": "General information", - "score": 73.56533333333333 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.604, mean=0.62, max=0.642, sum=1.86 (3)", - "tab": "Bias", - "score": 0.6201234839116704 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.411, mean=0.431, max=0.451, sum=1.294 (3)", - "tab": "Bias", - "score": 0.43137624831417937 - }, - "QuAC - Representation (race)": { - "description": "min=0.372, mean=0.408, max=0.45, sum=1.224 (3)", - "tab": "Bias", - "score": 0.40815960651383004 - }, - "QuAC - Representation (gender)": { - "description": "min=0.259, mean=0.268, max=0.282, sum=0.803 (3)", - "tab": "Bias", - "score": 0.2675064821442643 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.218, - "details": { - "description": "min=0.185, mean=0.218, max=0.232, sum=0.873 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.04, mean=0.088, max=0.12, sum=0.351 (4)", - "tab": "Calibration", - "score": 0.08770199071414088 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.147, mean=0.196, max=0.229, sum=0.784 (4)", - "tab": "Robustness", - "score": 0.19610091743119268 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.148, mean=0.192, max=0.229, sum=0.766 (4)", - "tab": "Fairness", - "score": 0.1915137614678899 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.069, mean=0.158, max=0.193, sum=0.633 (4)", - "tab": "Efficiency", - "score": 0.15830796687302695 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=80.786, mean=389.036, max=521.786, sum=1556.144 (4)", - "tab": "General information", - "score": 389.0359327217125 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 
2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS 
MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.154, - "details": { - "description": "min=0.144, mean=0.154, max=0.166, sum=0.926 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=3.427, mean=3.514, max=3.581, sum=21.082 (6)", - "tab": "Efficiency", - "score": 3.5136688752771708 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1644.124, mean=1657.124, max=1680.124, sum=9942.747 (6)", - "tab": "General information", - "score": 1657.1244635193134 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=74.479, mean=82.997, max=91.644, sum=497.983 (6)", - "tab": "General information", - "score": 82.99713876967097 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.601, mean=0.611, max=0.623, sum=3.663 (6)", - "tab": "Bias", - "score": 0.61056496482126 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.377, mean=0.394, max=0.409, sum=2.367 (6)", - "tab": "Bias", - "score": 0.3944955327838351 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.276, mean=0.29, max=0.305, sum=1.741 (6)", - "tab": "Bias", - "score": 0.2901527051306585 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.134, mean=0.139, max=0.147, sum=0.831 (6)", - "tab": "Bias", - "score": 0.13850777854837878 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.004, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.537, mean=0.566, max=0.591, sum=1.699 (3)", - "tab": "Summarization metrics", - "score": 0.5663194802454004 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.266, mean=0.288, 
max=0.312, sum=0.863 (3)", - "tab": "Summarization metrics", - "score": 0.287517514648812 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.96, mean=0.972, max=0.987, sum=5.835 (6)", - "tab": "Summarization metrics", - "score": 0.9724896258431271 - }, - "CNN/DailyMail - Density": { - "description": "min=24.014, mean=30.259, max=37.594, sum=181.554 (6)", - "tab": "Summarization metrics", - "score": 30.259024131398863 - }, - "CNN/DailyMail - Compression": { - "description": "min=7.643, mean=8.687, max=9.754, sum=52.123 (6)", - "tab": "Summarization metrics", - "score": 8.68711944818053 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=0.889, mean=0.963, max=1, sum=5.778 (6)", - "tab": "Summarization metrics", - "score": 0.9629629629629629 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=3.889, mean=4.167, max=4.5, sum=25 (6)", - "tab": "Summarization metrics", - "score": 4.166666666666667 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=3.111, mean=3.463, max=3.833, sum=20.778 (6)", - "tab": "Summarization metrics", - "score": 3.4629629629629632 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.132, - "details": { - "description": "min=0.131, mean=0.132, max=0.134, sum=0.794 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=2.516, mean=2.537, max=2.549, sum=15.224 (6)", - "tab": "Efficiency", - "score": 2.537310096660418 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.994, mean=4.996, max=4.998, sum=29.977 (6)", - "tab": "General information", - "score": 4.9961389961389955 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1516.483, mean=1567.312, max=1610.471, sum=9403.873 (6)", - "tab": "General information", - "score": 1567.3120978120978 - }, - "XSUM - # output tokens": { - "description": "min=25.458, mean=25.737, max=26.021, sum=154.421 (6)", - "tab": "General information", - "score": 25.73680823680824 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.0 (6)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.399, mean=0.447, max=0.477, sum=2.684 (6)", - "tab": "Bias", - "score": 0.4473352072310406 - }, - "XSUM - Representation (race)": { - "description": "min=0.519, mean=0.545, max=0.579, sum=3.269 (6)", - "tab": "Bias", - "score": 0.5447683118463776 - }, - "XSUM - Representation (gender)": { - "description": "min=0.202, mean=0.207, max=0.211, sum=1.243 (6)", - "tab": "Bias", - "score": 0.2071945417372382 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 
0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.225, mean=-0.206, max=-0.183, sum=-0.617 (3)", - "tab": "Summarization metrics", - "score": -0.20556503322082545 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.427, mean=0.427, max=0.428, sum=1.282 (3)", - "tab": "Summarization metrics", - "score": 0.42745522151316395 - }, - "XSUM - Coverage": { - "description": "min=0.813, mean=0.817, max=0.82, sum=4.905 (6)", - "tab": "Summarization metrics", - "score": 0.8174518357071618 - }, - "XSUM - Density": { - "description": "min=3.819, mean=4.041, max=4.367, sum=24.243 (6)", - "tab": "Summarization metrics", - "score": 4.040514978645572 - }, - "XSUM - Compression": { - "description": "min=16.122, mean=16.25, max=16.375, sum=97.5 (6)", - "tab": "Summarization metrics", - "score": 16.25000448561988 - }, - "XSUM - HumanEval-faithfulness": { - "description": "min=0.583, mean=0.763, max=0.905, sum=4.576 (6)", - "tab": "Summarization metrics", - "score": 0.7626984126984127 - }, - "XSUM - HumanEval-relevance": { - "description": "min=3.333, mean=3.843, max=4.1, sum=23.057 (6)", - "tab": "Summarization metrics", - "score": 3.842857142857143 - }, - "XSUM - HumanEval-coherence": { - "description": "min=3.417, mean=4.25, max=4.667, sum=25.5 (6)", - "tab": "Summarization metrics", - "score": 4.249999999999999 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.955, - "details": { - "description": "min=0.946, mean=0.955, max=0.961, sum=2.864 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.117, mean=0.18, max=0.225, sum=0.541 (3)", - "tab": "Calibration", - "score": 0.18041748611363093 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.921, mean=0.938, max=0.955, sum=2.814 (3)", - "tab": "Robustness", - "score": 0.9380000000000001 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.92, mean=0.933, max=0.951, sum=2.799 (3)", - "tab": "Fairness", - "score": 0.9329999999999999 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=1.446, mean=1.497, max=1.55, sum=4.491 (3)", - "tab": "Efficiency", - "score": 1.4970239554705547 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.832, mean=4.923, max=4.979, sum=14.77 (3)", - "tab": "General information", - "score": 4.923333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1182.719, mean=1412.285, max=1755.875, sum=4236.855 (3)", - "tab": "General information", - "score": 1412.2849999999999 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", 
- "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0, mean=0.5, max=1, sum=27.019 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.22, mean=0.486, max=0.749, sum=26.268 (54)", - "tab": "Calibration", - "score": 0.4864398714978027 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.5, max=1, sum=27.004 (54)", - "tab": "Robustness", - "score": 0.5000703286326241 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.5, max=1, sum=26.982 (54)", - "tab": "Fairness", - "score": 0.4996593325872097 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.442, mean=0.695, max=1.665, sum=37.54 (54)", - "tab": "Efficiency", - "score": 0.695191819583079 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=342, mean=694.39, max=1246.337, sum=37497.067 (54)", - "tab": "General information", - "score": 694.3901297399493 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - 
"dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598, - "details": { - "description": "min=0, mean=0.598, max=0.975, sum=19.725 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.045, mean=0.226, max=0.392, sum=7.451 (33)", - "tab": "Calibration", - "score": 0.225785860693393 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.577, max=0.975, sum=19.05 (33)", - "tab": "Robustness", - "score": 0.5772727272727272 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.575, max=0.975, sum=18.975 (33)", - "tab": "Fairness", - "score": 0.575 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.333, mean=1.471, max=2.214, sum=48.528 (33)", - "tab": "Efficiency", - "score": 1.4705579548050658 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.563, max=5, sum=150.575 (33)", - "tab": "General information", - "score": 4.5628787878787875 - }, - "RAFT - truncated": { - "description": "min=0, mean=0.07, max=1, sum=2.3 (33)", - "tab": "General information", - "score": 0.06969696969696969 - }, - "RAFT - # prompt tokens": { - "description": "min=244.45, mean=803.318, max=1757.15, sum=26509.5 (33)", - "tab": "General information", - "score": 803.3181818181819 - }, - "RAFT - # output tokens": { - "description": "min=2.6, mean=4.886, max=11.6, sum=161.25 (33)", - "tab": "General information", - "score": 4.886363636363637 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json b/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json deleted file mode 100644 index 31ab229b7..000000000 --- a/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json +++ /dev/null @@ -1,267 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_instruct/anthropic_claude-v1.3/1770834858.3559701", - "retrieved_timestamp": "1770834858.3559701", - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anthropic Claude v1.3", - "id": "anthropic/claude-v1.3", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - 
"dataset_name": "helm_instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "Anthropic RLHF dataset", - "source_data": { - "dataset_name": "Anthropic RLHF dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Anthropic RLHF dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.965, - "details": { - "description": "min=4.925, mean=4.965, max=5, sum=39.72 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Best ChatGPT Prompts", - "source_data": { - "dataset_name": "Best ChatGPT Prompts", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Best ChatGPT Prompts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.995, - "details": { - "description": "min=4.985, mean=4.995, max=5, sum=19.98 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Koala test dataset", - "source_data": { - "dataset_name": "Koala test dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Koala test dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.981, - "details": { - "description": "min=4.965, mean=4.981, max=5, sum=19.925 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Open Assistant", - "source_data": { - "dataset_name": "Open Assistant", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Open Assistant", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - 
"score_details": { - "score": 4.975, - "details": { - "description": "min=4.935, mean=4.975, max=5, sum=19.9 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Self Instruct", - "source_data": { - "dataset_name": "Self Instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Self Instruct", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.992, - "details": { - "description": "min=4.98, mean=4.992, max=5, sum=19.97 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Vicuna", - "source_data": { - "dataset_name": "Vicuna", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Vicuna", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.989, - "details": { - "description": "min=4.956, mean=4.989, max=5, sum=19.956 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json b/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json deleted file mode 100644 index 2fd221159..000000000 --- a/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json +++ /dev/null @@ -1,267 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_instruct/cohere_command-xlarge-beta/1770834858.3559701", - "retrieved_timestamp": "1770834858.3559701", - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere Command beta 52.4B", - "id": "cohere/command-xlarge-beta", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.089, - "details": { - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "Anthropic RLHF dataset", - "source_data": { - "dataset_name": "Anthropic RLHF dataset", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Anthropic RLHF dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.214, - "details": { - "description": "min=3.38, mean=4.214, max=4.92, sum=33.715 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Best ChatGPT Prompts", - "source_data": { - "dataset_name": "Best ChatGPT Prompts", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Best ChatGPT Prompts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.988, - "details": { - "description": "min=4.98, mean=4.988, max=5, sum=19.95 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Koala test dataset", - "source_data": { - "dataset_name": "Koala test dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Koala test dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.969, - "details": { - "description": "min=4.936, mean=4.969, max=5, sum=19.874 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Open Assistant", - "source_data": { - "dataset_name": "Open Assistant", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Open Assistant", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.967, - "details": { - "description": "min=4.955, mean=4.967, max=5, sum=19.87 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Self Instruct", - "source_data": { - "dataset_name": "Self Instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Self Instruct", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.971, - "details": { - "description": "min=4.955, mean=4.971, max=5, sum=19.885 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Vicuna", - "source_data": { - "dataset_name": "Vicuna", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Vicuna", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.995, - "details": { - "description": "min=4.981, mean=4.995, max=5, sum=19.981 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json b/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json deleted file mode 100644 index 23dfc4397..000000000 --- a/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json +++ /dev/null @@ -1,267 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1770834858.3559701", - "retrieved_timestamp": "1770834858.3559701", - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-3.5 Turbo 0613", - "id": "openai/gpt-3.5-turbo-0613", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689, - "details": { - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "Anthropic RLHF dataset", - "source_data": { - "dataset_name": "Anthropic RLHF dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Anthropic RLHF dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.964, - "details": { - "description": "min=4.915, mean=4.964, max=5, sum=39.715 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - 
"evaluation_name": "Best ChatGPT Prompts", - "source_data": { - "dataset_name": "Best ChatGPT Prompts", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Best ChatGPT Prompts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.986, - "details": { - "description": "min=4.95, mean=4.986, max=5, sum=19.945 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Koala test dataset", - "source_data": { - "dataset_name": "Koala test dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Koala test dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.987, - "details": { - "description": "min=4.969, mean=4.987, max=5, sum=19.95 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Open Assistant", - "source_data": { - "dataset_name": "Open Assistant", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Open Assistant", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.987, - "details": { - "description": "min=4.96, mean=4.987, max=5, sum=19.95 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Self Instruct", - "source_data": { - "dataset_name": "Self Instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Self Instruct", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.99, - "details": { - "description": "min=4.97, mean=4.99, max=5, sum=19.96 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Vicuna", - "source_data": { - "dataset_name": "Vicuna", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Vicuna", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 
4.992, - "details": { - "description": "min=4.975, mean=4.992, max=5, sum=19.969 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json b/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json deleted file mode 100644 index 9ad1bca2e..000000000 --- a/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json +++ /dev/null @@ -1,267 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_instruct/openai_gpt-4-0314/1770834858.3559701", - "retrieved_timestamp": "1770834858.3559701", - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 0314", - "id": "openai/gpt-4-0314", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "Anthropic RLHF dataset", - "source_data": { - "dataset_name": "Anthropic RLHF dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Anthropic RLHF dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.934, - "details": { - "description": "min=4.83, mean=4.934, max=5, sum=39.47 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Best ChatGPT Prompts", - "source_data": { - "dataset_name": "Best ChatGPT Prompts", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Best ChatGPT Prompts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.973, - "details": { - "description": "min=4.915, mean=4.973, max=5, sum=19.894 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - 
"evaluation_name": "Koala test dataset", - "source_data": { - "dataset_name": "Koala test dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Koala test dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.966, - "details": { - "description": "min=4.913, mean=4.966, max=5, sum=19.863 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Open Assistant", - "source_data": { - "dataset_name": "Open Assistant", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Open Assistant", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.986, - "details": { - "description": "min=4.97, mean=4.986, max=5, sum=19.945 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Self Instruct", - "source_data": { - "dataset_name": "Self Instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Self Instruct", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.976, - "details": { - "description": "min=4.945, mean=4.976, max=5, sum=19.905 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Vicuna", - "source_data": { - "dataset_name": "Vicuna", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Vicuna", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.995, - "details": { - "description": "min=4.981, mean=4.995, max=5, sum=19.981 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json b/data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json deleted file mode 100644 index 946b7db3e..000000000 --- a/data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/01-ai_yi-34b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - 
"source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi 34B", - "id": "01-ai/yi-34b", - "developer": "01-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.2681148564294632 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=0.782 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.368, mean=2.368, max=2.368, sum=2.368 (1)", - "tab": "Efficiency", - "score": 2.368284817816506 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.868, mean=4.868, max=4.868, sum=4.868 (1)", - "tab": "General information", - "score": 4.867605633802817 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3611.445, mean=3611.445, max=3611.445, sum=3611.445 (1)", - "tab": "General information", - "score": 3611.445070422535 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443, - "details": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.443 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.816, mean=1.816, max=1.816, sum=1.816 (1)", - "tab": "Efficiency", - "score": 1.8157690076828004 - }, - "NaturalQuestions (closed-book) - 
Observed inference time (s)": { - "description": "min=1.458, mean=1.458, max=1.458, sum=1.458 (1)", - "tab": "Efficiency", - "score": 1.4578230485916137 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.838, mean=4.838, max=4.838, sum=4.838 (1)", - "tab": "General information", - "score": 4.838 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2171.698, mean=2171.698, max=2171.698, sum=2171.698 (1)", - "tab": "General information", - "score": 2171.698 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.995, mean=0.995, max=0.995, sum=0.995 (1)", - "tab": "General information", - "score": 0.995 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=131.695, mean=131.695, max=131.695, sum=131.695 (1)", - "tab": "General information", - "score": 131.695 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=0.92 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.823, mean=0.823, max=0.823, sum=0.823 (1)", - "tab": "Efficiency", - "score": 0.8229070715904235 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=260.002, mean=260.002, max=260.002, sum=260.002 (1)", - "tab": "General information", - "score": 260.002 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": 
"MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.4, mean=0.65, max=0.91, sum=3.248 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.511, mean=0.697, max=0.925, sum=3.486 (5)", - "tab": "Efficiency", - "score": 0.6972272023485417 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=383.67, mean=502.654, max=667.789, sum=2513.269 (5)", - "tab": "General information", - "score": 502.65389473684206 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375, - "details": { - "description": "min=0.167, mean=0.375, max=0.563, sum=2.623 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.651, mean=3.809, max=4.649, sum=26.664 (7)", - "tab": "Efficiency", - "score": 3.809198633421 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=976.696, mean=1468.935, max=2582.038, sum=10282.547 (7)", - "tab": "General information", - "score": 1468.9352369693863 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.648, - "details": { - "description": "min=0.648, mean=0.648, max=0.648, sum=0.648 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.887, mean=4.887, max=4.887, sum=4.887 (1)", - "tab": "Efficiency", - "score": 4.886563032150269 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1170.814, mean=1170.814, max=1170.814, sum=1170.814 (1)", - "tab": "General information", - "score": 1170.814 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618, - "details": { - "description": "min=0.311, mean=0.618, max=0.8, sum=3.089 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.465, mean=0.8, max=1.207, sum=4.002 (5)", - "tab": "Efficiency", - "score": 0.8004560962069804 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2, mean=4.2, max=5, sum=21 (5)", - "tab": "General information", - "score": 4.2 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=211.779, mean=951.524, max=3359.547, sum=4757.621 (5)", - "tab": "General information", - "score": 951.5242922438443 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "description": "min=0.656, 
mean=0.656, max=0.656, sum=0.656 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.064, mean=1.064, max=1.064, sum=1.064 (1)", - "tab": "Efficiency", - "score": 1.064007310696672 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1122.392, mean=1122.392, max=1122.392, sum=1122.392 (1)", - "tab": "General information", - "score": 1122.3916500994035 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172, - "details": { - "description": "min=0.1, mean=0.172, max=0.218, sum=0.858 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.071, mean=1.404, max=2.506, sum=7.021 (5)", - "tab": "Efficiency", - "score": 1.4042062711970469 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=139.298, mean=187.092, max=317.56, sum=935.461 (5)", - "tab": "General information", - "score": 187.09213851506345 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json b/data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json deleted file mode 100644 index 28ba5fb69..000000000 --- a/data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/01-ai_yi-6b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi 6B", - "id": "01-ai/yi-6b", - "developer": "01-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6630461922596754 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=0.702 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.404, mean=1.404, max=1.404, sum=1.404 (1)", - "tab": "Efficiency", - "score": 1.4038719868995775 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.868, mean=4.868, max=4.868, sum=4.868 (1)", - "tab": "General information", - "score": 4.867605633802817 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3611.445, mean=3611.445, max=3611.445, sum=3611.445 (1)", - "tab": "General information", - "score": 3611.445070422535 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31, - "details": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.31 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.911, mean=0.911, max=0.911, sum=0.911 (1)", - "tab": "Efficiency", - "score": 0.9108293209075927 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.413 (1)", - "tab": "Efficiency", - "score": 0.4127621691226959 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.838, 
mean=4.838, max=4.838, sum=4.838 (1)", - "tab": "General information", - "score": 4.838 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2171.698, mean=2171.698, max=2171.698, sum=2171.698 (1)", - "tab": "General information", - "score": 2171.698 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.995, mean=0.995, max=0.995, sum=0.995 (1)", - "tab": "General information", - "score": 0.995 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=131.695, mean=131.695, max=131.695, sum=131.695 (1)", - "tab": "General information", - "score": 131.695 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=0.8 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.354 (1)", - "tab": "Efficiency", - "score": 0.3535394024848938 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=260.002, mean=260.002, max=260.002, sum=260.002 (1)", - "tab": "General information", - "score": 260.002 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.53, - "details": { - "description": "min=0.3, mean=0.53, max=0.87, sum=2.651 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.323, mean=0.339, max=0.368, sum=1.696 (5)", - "tab": "Efficiency", - "score": 0.3391338364283244 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=383.67, mean=502.654, max=667.789, sum=2513.269 (5)", - "tab": "General information", - "score": 502.65389473684206 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.126, - "details": { - "description": "min=0.058, mean=0.126, max=0.2, sum=0.881 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.167, mean=1.837, max=2.263, sum=12.86 (7)", - "tab": "Efficiency", - "score": 1.8371926514375443 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=976.696, mean=1468.935, max=2582.038, sum=10282.547 (7)", - "tab": "General information", - "score": 1468.9352369693863 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375, - "details": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.375 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference 
time (s)": { - "description": "min=1.878, mean=1.878, max=1.878, sum=1.878 (1)", - "tab": "Efficiency", - "score": 1.8781680135726928 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1170.814, mean=1170.814, max=1170.814, sum=1170.814 (1)", - "tab": "General information", - "score": 1170.814 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.284, mean=0.519, max=0.779, sum=2.594 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.379, mean=0.553, max=1.149, sum=2.764 (5)", - "tab": "Efficiency", - "score": 0.5528668178286933 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2, mean=4.2, max=5, sum=21 (5)", - "tab": "General information", - "score": 4.2 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=211.779, mean=951.524, max=3359.547, sum=4757.621 (5)", - "tab": "General information", - "score": 951.5242922438443 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497, - "details": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.497 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.405 (1)", - "tab": "Efficiency", - "score": 0.4053303655051806 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General 
information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1122.392, mean=1122.392, max=1122.392, sum=1122.392 (1)", - "tab": "General information", - "score": 1122.3916500994035 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.117, - "details": { - "description": "min=0.055, mean=0.117, max=0.182, sum=0.584 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.602, mean=0.626, max=0.666, sum=3.129 (5)", - "tab": "Efficiency", - "score": 0.6257070175426044 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=139.298, mean=187.092, max=317.56, sum=935.461 (5)", - "tab": "General information", - "score": 187.09213851506345 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json b/data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json deleted file mode 100644 index 9fe678bb4..000000000 --- a/data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/01-ai_yi-large-preview/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi Large Preview", - "id": "01-ai/yi-large-preview", - "developer": "01-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.471, - "details": { - 
"tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.17893882646691636 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.373, - "details": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.373 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.672, mean=2.672, max=2.672, sum=2.672 (1)", - "tab": "Efficiency", - "score": 2.6724000897206053 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3724.042, mean=3724.042, max=3724.042, sum=3724.042 (1)", - "tab": "General information", - "score": 3724.042253521127 - }, - "NarrativeQA - # output tokens": { - "description": "min=21.513, mean=21.513, max=21.513, sum=21.513 (1)", - "tab": "General information", - "score": 21.512676056338027 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428, - "details": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.428 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=2.506, mean=2.506, max=2.506, sum=2.506 (1)", - "tab": "Efficiency", - "score": 2.506305232524872 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.036, mean=1.036, max=1.036, sum=1.036 (1)", - "tab": "Efficiency", - "score": 1.0360134015083313 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.983, mean=4.983, max=4.983, sum=4.983 (1)", - "tab": "General information", - "score": 4.983 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "General information", - "score": 0.003 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2368.513, mean=2368.513, max=2368.513, sum=2368.513 (1)", - 
"tab": "General information", - "score": 2368.513 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=23.703, mean=23.703, max=23.703, sum=23.703 (1)", - "tab": "General information", - "score": 23.703 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=160.695, mean=160.695, max=160.695, sum=160.695 (1)", - "tab": "General information", - "score": 160.695 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.629, mean=4.629, max=4.629, sum=4.629 (1)", - "tab": "General information", - "score": 4.629 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.946, - "details": { - "description": "min=0.946, mean=0.946, max=0.946, sum=0.946 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", - "tab": "Efficiency", - "score": 0.77673295545578 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=273.002, mean=273.002, max=273.002, sum=273.002 (1)", - "tab": "General information", - "score": 273.002 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.712, - "details": { - "description": "min=0.52, mean=0.712, max=0.86, sum=3.558 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.679, mean=0.713, max=0.752, sum=3.567 (5)", - "tab": "Efficiency", - "score": 0.7133434140138459 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", 
- "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=396.67, mean=515.654, max=680.789, sum=2578.269 (5)", - "tab": "General information", - "score": 515.6538947368421 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.712, - "details": { - "description": "min=0.553, mean=0.712, max=0.874, sum=4.982 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=8.67, mean=11.511, max=13.559, sum=80.577 (7)", - "tab": "Efficiency", - "score": 11.510960669458308 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=976.696, mean=1468.935, max=2582.038, sum=10282.547 (7)", - "tab": "General information", - "score": 1468.9352369693863 - }, - "MATH - # output tokens": { - "description": "min=189.756, mean=254.005, max=296.346, sum=1778.034 (7)", - "tab": "General information", - "score": 254.00484808722263 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=0.69 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=13.45, mean=13.45, max=13.45, sum=13.45 (1)", - "tab": "Efficiency", - "score": 13.45040065407753 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, 
mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1170.814, mean=1170.814, max=1170.814, sum=1170.814 (1)", - "tab": "General information", - "score": 1170.814 - }, - "GSM8K - # output tokens": { - "description": "min=288.079, mean=288.079, max=288.079, sum=288.079 (1)", - "tab": "General information", - "score": 288.079 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.145, mean=0.519, max=0.884, sum=2.594 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.855, mean=1.472, max=3.502, sum=7.358 (5)", - "tab": "Efficiency", - "score": 1.471592522464795 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=228.779, mean=1656.095, max=6814.4, sum=8280.475 (5)", - "tab": "General information", - "score": 1656.0949044887425 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=3.339, max=6.263, sum=16.697 (5)", - "tab": "General information", - "score": 3.339402150569105 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=0.66 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.993, mean=0.993, max=0.993, sum=0.993 (1)", - "tab": "Efficiency", - "score": 0.9931588552107157 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1135.392, mean=1135.392, max=1135.392, sum=1135.392 
(1)", - "tab": "General information", - "score": 1135.3916500994035 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176, - "details": { - "description": "min=0.126, mean=0.176, max=0.218, sum=0.88 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.838, mean=2.095, max=2.409, sum=10.477 (5)", - "tab": "Efficiency", - "score": 2.095412739007152 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=157.298, mean=205.092, max=335.56, sum=1025.461 (5)", - "tab": "General information", - "score": 205.09213851506343 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.773, mean=29.058, max=36.698, sum=145.291 (5)", - "tab": "General information", - "score": 29.058130065759293 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json b/data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json deleted file mode 100644 index fb405652b..000000000 --- a/data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-base/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminous Base 13B", - "id": "AlephAlpha/luminous-base", - "developer": "AlephAlpha", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.041, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.29337078651685394 - }, - "Mean win rate - General information": { - "description": null, - "tab": 
"General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.633, - "details": { - "description": "min=0.633, mean=0.633, max=0.633, sum=0.633 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.05, mean=1.05, max=1.05, sum=1.05 (1)", - "tab": "Efficiency", - "score": 1.05044368958809 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.037, mean=2.037, max=2.037, sum=2.037 (1)", - "tab": "General information", - "score": 2.036619718309859 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1694.642, mean=1694.642, max=1694.642, sum=1694.642 (1)", - "tab": "General information", - "score": 1694.6422535211268 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.521, mean=5.521, max=5.521, sum=5.521 (1)", - "tab": "General information", - "score": 5.52112676056338 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.197, - "details": { - "description": "min=0.197, mean=0.197, max=0.197, sum=0.197 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.329, mean=1.329, max=1.329, sum=1.329 (1)", - "tab": "Efficiency", - "score": 1.328731627702713 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.802, mean=0.802, max=0.802, sum=0.802 (1)", - "tab": "Efficiency", - "score": 0.8020290625095368 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.717, mean=4.717, max=4.717, sum=4.717 (1)", - "tab": "General information", - "score": 4.717 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.038 (1)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1488.14, mean=1488.14, max=1488.14, sum=1488.14 (1)", - "tab": "General information", - "score": 1488.14 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=10.866, mean=10.866, max=10.866, sum=10.866 (1)", - "tab": 
"General information", - "score": 10.866 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.087, mean=116.087, max=116.087, sum=116.087 (1)", - "tab": "General information", - "score": 116.087 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.908, mean=5.908, max=5.908, sum=5.908 (1)", - "tab": "General information", - "score": 5.908 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.286, - "details": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.286 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Efficiency", - "score": 0.6669360423088073 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.652, mean=254.652, max=254.652, sum=254.652 (1)", - "tab": "General information", - "score": 254.652 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.243, - "details": { - "description": "min=0.22, mean=0.243, max=0.29, sum=1.217 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.619, mean=0.632, max=0.648, sum=3.162 (5)", - "tab": "Efficiency", - "score": 0.6324507230122884 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU 
- truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=360.75, mean=471.075, max=618.447, sum=2355.377 (5)", - "tab": "General information", - "score": 471.0754736842106 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.026, - "details": { - "description": "min=0, mean=0.026, max=0.067, sum=0.184 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=5.282, mean=9.204, max=20.088, sum=64.425 (7)", - "tab": "Efficiency", - "score": 9.203530075671766 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.962, mean=6.916, max=8, sum=48.409 (7)", - "tab": "General information", - "score": 6.915558126084441 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=928.719, mean=1184.139, max=1546.442, sum=8288.975 (7)", - "tab": "General information", - "score": 1184.139339428874 - }, - "MATH - # output tokens": { - "description": "min=114.077, mean=139.637, max=180.663, sum=977.456 (7)", - "tab": "General information", - "score": 139.6365272403828 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.028, - "details": { - "description": "min=0.028, mean=0.028, max=0.028, sum=0.028 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=16.427, mean=16.427, max=16.427, sum=16.427 (1)", - "tab": "Efficiency", - "score": 16.42652773284912 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": 
"General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=943.121, mean=943.121, max=943.121, sum=943.121 (1)", - "tab": "General information", - "score": 943.121 - }, - "GSM8K - # output tokens": { - "description": "min=400, mean=400, max=400, sum=400 (1)", - "tab": "General information", - "score": 400.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.332, - "details": { - "description": "min=0.165, mean=0.332, max=0.601, sum=1.659 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.636, mean=0.753, max=1.073, sum=3.767 (5)", - "tab": "Efficiency", - "score": 0.7533007583490331 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.335, mean=3.867, max=5, sum=19.335 (5)", - "tab": "General information", - "score": 3.866938775510204 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.133, max=0.665, sum=0.665 (5)", - "tab": "General information", - "score": 0.1330612244897959 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.726, mean=566.59, max=1514.545, sum=2832.948 (5)", - "tab": "General information", - "score": 566.5895794484264 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.639, max=4.027, sum=8.196 (5)", - "tab": "General information", - "score": 1.6391061224489796 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.26 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)", - "tab": "Efficiency", - "score": 0.7258754989972882 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1005.229, mean=1005.229, max=1005.229, sum=1005.229 (1)", - "tab": "General information", - "score": 1005.2286282306163 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - 
"tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.066, - "details": { - "description": "min=0.0, mean=0.066, max=0.171, sum=0.331 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=4.671, mean=4.693, max=4.731, sum=23.465 (5)", - "tab": "Efficiency", - "score": 4.692985351748752 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=99.111, mean=157.232, max=255.504, sum=786.158 (5)", - "tab": "General information", - "score": 157.2315362631901 - }, - "WMT 2014 - # output tokens": { - "description": "min=99.869, mean=99.974, max=100, sum=499.869 (5)", - "tab": "General information", - "score": 99.97375745526838 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json b/data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json deleted file mode 100644 index 786a7e340..000000000 --- a/data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-extended/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminous Extended 30B", - "id": "AlephAlpha/luminous-extended", - "developer": "AlephAlpha", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.078, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.2278027465667915 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.467, mean=1.467, max=1.467, sum=1.467 (1)", - "tab": "Efficiency", - "score": 1.4667296523779212 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.037, mean=2.037, max=2.037, sum=2.037 (1)", - "tab": "General information", - "score": 2.036619718309859 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1694.642, mean=1694.642, max=1694.642, sum=1694.642 (1)", - "tab": "General information", - "score": 1694.6422535211268 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.335, mean=6.335, max=6.335, sum=6.335 (1)", - "tab": "General information", - "score": 6.335211267605634 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253, - "details": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.253 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.778, mean=1.778, max=1.778, sum=1.778 (1)", - "tab": "Efficiency", - "score": 1.777582576751709 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.98, mean=0.98, max=0.98, sum=0.98 (1)", - "tab": "Efficiency", - "score": 0.9799906523227692 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.717, mean=4.717, max=4.717, sum=4.717 (1)", - "tab": "General information", - "score": 4.717 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.038 (1)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1488.14, mean=1488.14, max=1488.14, sum=1488.14 (1)", - "tab": "General information", - "score": 1488.14 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=11.063, mean=11.063, max=11.063, sum=11.063 (1)", - "tab": "General information", - "score": 11.063 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, 
sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.087, mean=116.087, max=116.087, sum=116.087 (1)", - "tab": "General information", - "score": 116.087 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.869, mean=6.869, max=6.869, sum=6.869 (1)", - "tab": "General information", - "score": 6.869 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.272, - "details": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.675, mean=0.675, max=0.675, sum=0.675 (1)", - "tab": "Efficiency", - "score": 0.6750410146713257 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.652, mean=254.652, max=254.652, sum=254.652 (1)", - "tab": "General information", - "score": 254.652 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.248, - "details": { - "description": "min=0.2, mean=0.248, max=0.31, sum=1.242 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.69, mean=0.718, max=0.754, sum=3.592 (5)", - "tab": "Efficiency", - "score": 0.7183412402554562 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt 
tokens": { - "description": "min=360.75, mean=471.075, max=618.447, sum=2355.377 (5)", - "tab": "General information", - "score": 471.0754736842106 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04, - "details": { - "description": "min=0, mean=0.04, max=0.088, sum=0.278 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=5.96, mean=9.364, max=12.108, sum=65.551 (7)", - "tab": "Efficiency", - "score": 9.364456500699777 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.962, mean=6.916, max=8, sum=48.409 (7)", - "tab": "General information", - "score": 6.915558126084441 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=928.719, mean=1184.139, max=1546.442, sum=8288.975 (7)", - "tab": "General information", - "score": 1184.139339428874 - }, - "MATH - # output tokens": { - "description": "min=92.684, mean=142.866, max=180.2, sum=1000.065 (7)", - "tab": "General information", - "score": 142.86643564287382 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.075, - "details": { - "description": "min=0.075, mean=0.075, max=0.075, sum=0.075 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=22.685, mean=22.685, max=22.685, sum=22.685 (1)", - "tab": "Efficiency", - "score": 22.685439155817033 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=943.121, mean=943.121, max=943.121, 
sum=943.121 (1)", - "tab": "General information", - "score": 943.121 - }, - "GSM8K - # output tokens": { - "description": "min=400, mean=400, max=400, sum=400 (1)", - "tab": "General information", - "score": 400.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421, - "details": { - "description": "min=0.204, mean=0.421, max=0.632, sum=2.107 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.7, mean=0.858, max=1.261, sum=4.291 (5)", - "tab": "Efficiency", - "score": 0.8581969152200717 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.335, mean=3.867, max=5, sum=19.335 (5)", - "tab": "General information", - "score": 3.866938775510204 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.133, max=0.665, sum=0.665 (5)", - "tab": "General information", - "score": 0.1330612244897959 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.726, mean=566.59, max=1514.545, sum=2832.948 (5)", - "tab": "General information", - "score": 566.5895794484264 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.548, max=3.196, sum=7.739 (5)", - "tab": "General information", - "score": 1.5478898257711229 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276, - "details": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.895, mean=0.895, max=0.895, sum=0.895 (1)", - "tab": "Efficiency", - "score": 0.8947408758622277 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1005.229, mean=1005.229, max=1005.229, sum=1005.229 (1)", - "tab": "General information", - "score": 1005.2286282306163 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.083, - "details": { - "description": "min=0.0, mean=0.083, max=0.194, sum=0.415 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=5.231, mean=5.336, max=5.406, sum=26.68 (5)", - "tab": "Efficiency", - "score": 5.33597646673717 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=99.111, mean=157.232, max=255.504, sum=786.158 (5)", - "tab": "General information", - "score": 157.2315362631901 - }, - "WMT 2014 - # output tokens": { - "description": "min=100, mean=100, max=100, sum=500 (5)", - "tab": "General information", - "score": 100.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json b/data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json deleted file mode 100644 index 78da47969..000000000 --- a/data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-supreme/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminous Supreme 70B", - "id": "AlephAlpha/luminous-supreme", - "developer": "AlephAlpha", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.1344569288389513 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.951, mean=2.951, max=2.951, sum=2.951 (1)", - "tab": "Efficiency", - "score": 2.9511526873413945 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.037, mean=2.037, max=2.037, sum=2.037 (1)", - "tab": "General information", - "score": 2.036619718309859 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1694.642, mean=1694.642, max=1694.642, sum=1694.642 (1)", - "tab": "General information", - "score": 1694.6422535211268 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.685, mean=5.685, max=5.685, sum=5.685 (1)", - "tab": "General information", - "score": 5.6845070422535215 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299, - "details": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.299 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=2.657, mean=2.657, max=2.657, sum=2.657 (1)", - "tab": "Efficiency", - "score": 2.656584274530411 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.272, mean=1.272, max=1.272, sum=1.272 (1)", - "tab": "Efficiency", - "score": 1.2722365505695343 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.717, mean=4.717, max=4.717, sum=4.717 (1)", - "tab": "General information", - "score": 4.717 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.038 (1)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1488.14, mean=1488.14, max=1488.14, sum=1488.14 (1)", - "tab": "General information", - "score": 1488.14 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.864, mean=6.864, max=6.864, sum=6.864 (1)", - "tab": "General information", - "score": 6.864 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) 
- # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.087, mean=116.087, max=116.087, sum=116.087 (1)", - "tab": "General information", - "score": 116.087 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.666, mean=4.666, max=4.666, sum=4.666 (1)", - "tab": "General information", - "score": 4.666 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.284, - "details": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.284 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.779, mean=0.779, max=0.779, sum=0.779 (1)", - "tab": "Efficiency", - "score": 0.778845920085907 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.652, mean=254.652, max=254.652, sum=254.652 (1)", - "tab": "General information", - "score": 254.652 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316, - "details": { - "description": "min=0.18, mean=0.316, max=0.5, sum=1.582 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.825, mean=0.907, max=1.009, sum=4.537 (5)", - "tab": "Efficiency", - "score": 0.9073754794472141 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=360.75, mean=471.075, max=618.447, sum=2355.377 (5)", - "tab": "General 
information", - "score": 471.0754736842106 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.078, - "details": { - "description": "min=0.038, mean=0.078, max=0.158, sum=0.548 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=13.143, mean=16.874, max=20.77, sum=118.115 (7)", - "tab": "Efficiency", - "score": 16.873623512856078 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.962, mean=6.916, max=8, sum=48.409 (7)", - "tab": "General information", - "score": 6.915558126084441 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=928.719, mean=1184.139, max=1546.442, sum=8288.975 (7)", - "tab": "General information", - "score": 1184.139339428874 - }, - "MATH - # output tokens": { - "description": "min=90.605, mean=127.587, max=150.635, sum=893.112 (7)", - "tab": "General information", - "score": 127.58738933898053 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.137, - "details": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.137 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=48.242, mean=48.242, max=48.242, sum=48.242 (1)", - "tab": "Efficiency", - "score": 48.241569149971006 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=943.121, mean=943.121, max=943.121, sum=943.121 (1)", - "tab": "General information", - "score": 943.121 - }, - "GSM8K - # output tokens": { 
- "description": "min=400, mean=400, max=400, sum=400 (1)", - "tab": "General information", - "score": 400.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.221, mean=0.452, max=0.768, sum=2.26 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.84, mean=1.156, max=2.035, sum=5.781 (5)", - "tab": "Efficiency", - "score": 1.1561943690304337 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.335, mean=3.867, max=5, sum=19.335 (5)", - "tab": "General information", - "score": 3.866938775510204 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.133, max=0.665, sum=0.665 (5)", - "tab": "General information", - "score": 0.1330612244897959 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.726, mean=566.59, max=1514.545, sum=2832.948 (5)", - "tab": "General information", - "score": 566.5895794484264 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.266, max=1.769, sum=6.329 (5)", - "tab": "General information", - "score": 1.2657996218650946 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276, - "details": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.326, mean=1.326, max=1.326, sum=1.326 (1)", - "tab": "Efficiency", - "score": 1.325726029887114 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1005.229, mean=1005.229, max=1005.229, sum=1005.229 (1)", - "tab": "General information", - "score": 1005.2286282306163 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102, - "details": { - "description": "min=0.0, mean=0.102, max=0.193, sum=0.512 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=10.924, mean=11.052, max=11.265, sum=55.26 (5)", - "tab": "Efficiency", - "score": 11.052006985892152 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=99.111, mean=157.232, max=255.504, sum=786.158 (5)", - "tab": "General information", - "score": 157.2315362631901 - }, - "WMT 2014 - # output tokens": { - "description": "min=100, mean=100, max=100, sum=500 (5)", - "tab": "General information", - "score": 100.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json b/data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json deleted file mode 100644 index 2b870e958..000000000 --- a/data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/ai21_j2-grande/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jurassic-2 Grande 17B", - "id": "ai21/j2-grande", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.39915106117353305 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.744, - "details": { - "description": "min=0.744, mean=0.744, max=0.744, sum=0.744 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.179, mean=1.179, max=1.179, sum=1.179 (1)", - "tab": "Efficiency", - "score": 1.1790085772393455 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=3.225, mean=3.225, max=3.225, sum=3.225 (1)", - "tab": "General information", - "score": 3.2253521126760565 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1700.741, mean=1700.741, max=1700.741, sum=1700.741 (1)", - "tab": "General information", - "score": 1700.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.039, mean=5.039, max=5.039, sum=5.039 (1)", - "tab": "General information", - "score": 5.03943661971831 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35, - "details": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.35 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.462, mean=1.462, max=1.462, sum=1.462 (1)", - "tab": "Efficiency", - "score": 1.4618877012729645 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.631, mean=0.631, max=0.631, sum=0.631 (1)", - "tab": "Efficiency", - "score": 0.630548656463623 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.697, mean=4.697, max=4.697, sum=4.697 (1)", - "tab": "General information", - "score": 4.697 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.038 (1)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1522.929, mean=1522.929, max=1522.929, sum=1522.929 (1)", - "tab": "General information", - "score": 1522.929 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.441, mean=5.441, max=5.441, sum=5.441 (1)", - "tab": "General information", - "score": 5.441 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - 
"tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=102.377, mean=102.377, max=102.377, sum=102.377 (1)", - "tab": "General information", - "score": 102.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.614, mean=6.614, max=6.614, sum=6.614 (1)", - "tab": "General information", - "score": 6.614 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=0.614 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=0.519 (1)", - "tab": "Efficiency", - "score": 0.519375147819519 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=188.75, mean=188.75, max=188.75, sum=188.75 (1)", - "tab": "General information", - "score": 188.75 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.471, - "details": { - "description": "min=0.25, mean=0.471, max=0.77, sum=2.355 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.549, mean=0.621, max=0.755, sum=3.103 (5)", - "tab": "Efficiency", - "score": 0.6205235414421348 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=1983.699 (5)", - "tab": "General information", - "score": 396.7398596491228 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.064, - "details": { - "description": "min=0, mean=0.064, max=0.158, sum=0.445 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.609, mean=4.862, max=6.298, sum=34.036 (7)", - "tab": "Efficiency", - "score": 4.862255273244342 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2, mean=6.778, max=8, sum=47.447 (7)", - "tab": "General information", - "score": 6.7781954887218046 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=450.154, mean=943.419, max=1490.395, sum=6603.93 (7)", - "tab": "General information", - "score": 943.4185034241337 - }, - "MATH - # output tokens": { - "description": "min=74.123, mean=140.295, max=209.933, sum=982.063 (7)", - "tab": "General information", - "score": 140.29469320289397 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.159, - "details": { - "description": "min=0.159, mean=0.159, max=0.159, sum=0.159 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.417, mean=5.417, max=5.417, sum=5.417 (1)", - "tab": "Efficiency", - "score": 5.417125414848328 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=823.394, mean=823.394, max=823.394, sum=823.394 (1)", - "tab": "General information", - "score": 823.394 - }, - "GSM8K - # output tokens": { - "description": "min=121.336, mean=121.336, max=121.336, sum=121.336 (1)", - "tab": "General information", - "score": 121.336 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.468, - "details": { - "description": "min=0.199, mean=0.468, max=0.842, sum=2.338 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.409, mean=0.712, max=1.079, sum=3.561 (5)", - "tab": "Efficiency", - "score": 0.7122931517101486 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=1.006, mean=4.001, max=5, sum=20.006 (5)", - "tab": "General information", - "score": 4.001224489795918 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.002, max=0.012, sum=0.012 (5)", - "tab": "General information", - "score": 0.0024489795918367346 - }, - "LegalBench - # prompt tokens": { - "description": "min=171.042, mean=503.146, max=1514.22, sum=2515.73 (5)", - "tab": "General information", - "score": 503.1459259177527 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.056, max=2.216, sum=10.282 (5)", - "tab": "General information", - "score": 2.0563001835066452 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.914, mean=0.914, max=0.914, sum=0.914 (1)", - "tab": "Efficiency", - "score": 0.9142626611660299 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=758.622, mean=758.622, max=758.622, sum=758.622 (1)", - "tab": "General information", - "score": 758.6222664015904 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102, - "details": { - "description": "min=0.021, mean=0.102, max=0.149, sum=0.509 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.723, mean=0.759, max=0.81, sum=3.793 (5)", - "tab": "Efficiency", - "score": 0.7586197336965614 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=123.229, mean=135.468, max=148.278, sum=677.341 (5)", - "tab": "General information", - "score": 135.46828404572565 - }, - "WMT 2014 - # output tokens": { - "description": "min=17.372, mean=19.051, max=21.34, sum=95.255 (5)", - "tab": "General information", - "score": 19.050931430646887 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json b/data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json deleted file mode 100644 index 643b24001..000000000 --- a/data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/ai21_j2-jumbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jurassic-2 Jumbo 178B", - "id": "ai21/j2-jumbo", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.215, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.19473158551810238 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.728, mean=0.728, max=0.728, sum=0.728 (1)", - 
"tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.82, mean=1.82, max=1.82, sum=1.82 (1)", - "tab": "Efficiency", - "score": 1.8203622415032186 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=2534.434, mean=2534.434, max=2534.434, sum=2534.434 (1)", - "tab": "General information", - "score": 2534.4338028169013 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.583, mean=6.583, max=6.583, sum=6.583 (1)", - "tab": "General information", - "score": 6.583098591549295 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.385, - "details": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.385 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.448, mean=1.448, max=1.448, sum=1.448 (1)", - "tab": "Efficiency", - "score": 1.4479399914741515 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=5.332, mean=5.332, max=5.332, sum=5.332 (1)", - "tab": "Efficiency", - "score": 5.3321147253513335 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.931, mean=4.931, max=4.931, sum=4.931 (1)", - "tab": "General information", - "score": 4.931 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.012, mean=0.012, max=0.012, sum=0.012 (1)", - "tab": "General information", - "score": 0.012 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1687.673, mean=1687.673, max=1687.673, sum=1687.673 (1)", - "tab": "General information", - "score": 1687.673 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=4.785, mean=4.785, max=4.785, sum=4.785 (1)", - "tab": "General information", - "score": 4.785 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=102.377, mean=102.377, max=102.377, sum=102.377 (1)", - "tab": "General information", - "score": 
102.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.79, mean=5.79, max=5.79, sum=5.79 (1)", - "tab": "General information", - "score": 5.79 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688, - "details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=0.688 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.998, mean=0.998, max=0.998, sum=0.998 (1)", - "tab": "Efficiency", - "score": 0.9981746392250062 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=188.75, mean=188.75, max=188.75, sum=188.75 (1)", - "tab": "General information", - "score": 188.75 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.483, - "details": { - "description": "min=0.25, mean=0.483, max=0.83, sum=2.413 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.693, mean=0.81, max=0.92, sum=4.052 (5)", - "tab": "Efficiency", - "score": 0.8103257050430566 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=1983.699 (5)", - "tab": "General information", - "score": 396.7398596491228 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - 
"source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.103, - "details": { - "description": "min=0.033, mean=0.103, max=0.193, sum=0.72 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=4.497, mean=9.136, max=13.531, sum=63.951 (7)", - "tab": "Efficiency", - "score": 9.135811412885502 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=796.795, mean=1321.422, max=2516.154, sum=9249.956 (7)", - "tab": "General information", - "score": 1321.42226282263 - }, - "MATH - # output tokens": { - "description": "min=76.281, mean=136.538, max=220.133, sum=955.767 (7)", - "tab": "General information", - "score": 136.53809167621895 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.239, - "details": { - "description": "min=0.239, mean=0.239, max=0.239, sum=0.239 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.176, mean=5.176, max=5.176, sum=5.176 (1)", - "tab": "Efficiency", - "score": 5.176425676584244 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=823.394, mean=823.394, max=823.394, sum=823.394 (1)", - "tab": "General information", - "score": 823.394 - }, - "GSM8K - # output tokens": { - "description": "min=102.036, mean=102.036, max=102.036, sum=102.036 (1)", - "tab": "General information", - "score": 102.036 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.533, - "details": { - "description": "min=0.324, mean=0.533, max=0.821, sum=2.666 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.639, mean=1.274, max=2.827, sum=6.369 (5)", - "tab": "Efficiency", - "score": 1.2737073742826783 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=171.042, mean=1120.486, max=4600.92, sum=5602.43 (5)", - "tab": "General information", - "score": 1120.4859259177529 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.028, max=2.098, sum=10.141 (5)", - "tab": "General information", - "score": 2.028218528610354 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431, - "details": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.431 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.535, mean=1.535, max=1.535, sum=1.535 (1)", - "tab": "Efficiency", - "score": 1.5350148075854566 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=758.622, mean=758.622, max=758.622, sum=758.622 (1)", - "tab": "General information", - "score": 758.6222664015904 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114, - "details": { - "description": "min=0.044, mean=0.114, max=0.148, sum=0.572 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time 
(s)": { - "description": "min=1.236, mean=1.441, max=1.665, sum=7.206 (5)", - "tab": "Efficiency", - "score": 1.4411698855373092 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=123.229, mean=135.468, max=148.278, sum=677.341 (5)", - "tab": "General information", - "score": 135.46828404572565 - }, - "WMT 2014 - # output tokens": { - "description": "min=19.839, mean=24.063, max=30.439, sum=120.314 (5)", - "tab": "General information", - "score": 24.062830708059337 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json b/data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json deleted file mode 100644 index a07da123a..000000000 --- a/data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/ai21_jamba-1.5-large/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jamba 1.5 Large", - "id": "ai21/jamba-1.5-large", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.26377028714107364 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664, - "details": { - "description": "min=0.664, mean=0.664, max=0.664, sum=0.664 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "Efficiency", - "score": 1.9694313982842673 - }, - "NarrativeQA - # eval": { - "description": "min=355, 
mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3595.597, mean=3595.597, max=3595.597, sum=3595.597 (1)", - "tab": "General information", - "score": 3595.5971830985914 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394, - "details": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.394 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.678, mean=1.678, max=1.678, sum=1.678 (1)", - "tab": "Efficiency", - "score": 1.678127991437912 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.272, mean=1.272, max=1.272, sum=1.272 (1)", - "tab": "Efficiency", - "score": 1.2717866213321687 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2333.076, mean=2333.076, max=2333.076, sum=2333.076 (1)", - "tab": "General information", - "score": 2333.076 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=152.394, mean=152.394, max=152.394, sum=152.394 (1)", - "tab": "General information", - "score": 152.394 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - 
"dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.948, - "details": { - "description": "min=0.948, mean=0.948, max=0.948, sum=0.948 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.91, mean=0.91, max=0.91, sum=0.91 (1)", - "tab": "Efficiency", - "score": 0.9100792293548584 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=261.348, mean=261.348, max=261.348, sum=261.348 (1)", - "tab": "General information", - "score": 261.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.53, mean=0.683, max=0.92, sum=3.414 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.933, mean=0.973, max=1.0, sum=4.866 (5)", - "tab": "Efficiency", - "score": 0.973254363085094 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.58, mean=508.138, max=678.64, sum=2540.69 (5)", - "tab": "General information", - "score": 508.1380701754386 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.692, - "details": { - "description": "min=0.481, mean=0.692, max=0.889, sum=4.842 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.366, mean=3.179, max=4.736, sum=22.253 (7)", - "tab": "Efficiency", - "score": 3.1790229759699775 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=979.415, mean=1458.376, max=2550.115, sum=10208.634 (7)", - "tab": "General information", - "score": 1458.376275861588 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.846, - "details": { - "description": "min=0.846, mean=0.846, max=0.846, sum=0.846 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.942, mean=3.942, max=3.942, sum=3.942 (1)", - "tab": "Efficiency", - "score": 3.942030364751816 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1163.818, mean=1163.818, max=1163.818, sum=1163.818 (1)", - "tab": "General information", - "score": 1163.818 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.409, mean=0.675, max=0.989, sum=3.375 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.933, mean=1.258, max=2.367, 
sum=6.289 (5)", - "tab": "Efficiency", - "score": 1.2577736545740559 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=212.453, mean=1601.843, max=6618.612, sum=8009.215 (5)", - "tab": "General information", - "score": 1601.842950915631 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.698, mean=0.698, max=0.698, sum=0.698 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "Efficiency", - "score": 0.9989562840395372 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1085.239, mean=1085.239, max=1085.239, sum=1085.239 (1)", - "tab": "General information", - "score": 1085.2385685884692 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.203, - "details": { - "description": "min=0.141, mean=0.203, max=0.246, sum=1.015 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.317, mean=1.386, max=1.471, sum=6.93 (5)", - "tab": "Efficiency", - "score": 1.3859240114613673 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=120.386, mean=151.077, max=189.223, sum=755.383 (5)", - "tab": "General information", - "score": 151.07662629989292 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json b/data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json deleted file mode 100644 index 9e0628c9d..000000000 --- a/data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/ai21_jamba-1.5-mini/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jamba 1.5 Mini", - "id": "ai21/jamba-1.5-mini", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.44747815230961296 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=0.746 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.998, mean=0.998, max=0.998, sum=0.998 (1)", - "tab": "Efficiency", - "score": 0.9981950746455662 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3595.597, mean=3595.597, max=3595.597, sum=3595.597 (1)", - 
"tab": "General information", - "score": 3595.5971830985914 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388, - "details": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.388 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.924, mean=0.924, max=0.924, sum=0.924 (1)", - "tab": "Efficiency", - "score": 0.9243871104717255 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.844, mean=0.844, max=0.844, sum=0.844 (1)", - "tab": "Efficiency", - "score": 0.8436705965995789 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2333.076, mean=2333.076, max=2333.076, sum=2333.076 (1)", - "tab": "General information", - "score": 2333.076 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=152.394, mean=152.394, max=152.394, sum=152.394 (1)", - "tab": "General information", - "score": 152.394 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=0.89 
(1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.786, mean=0.786, max=0.786, sum=0.786 (1)", - "tab": "Efficiency", - "score": 0.7863723936080933 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=261.348, mean=261.348, max=261.348, sum=261.348 (1)", - "tab": "General information", - "score": 261.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.582, - "details": { - "description": "min=0.33, mean=0.582, max=0.9, sum=2.911 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.783, mean=0.81, max=0.83, sum=4.049 (5)", - "tab": "Efficiency", - "score": 0.8097888966024968 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.58, mean=508.138, max=678.64, sum=2540.69 (5)", - "tab": "General information", - "score": 508.1380701754386 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318, - "details": { - "description": "min=0.233, mean=0.318, max=0.386, sum=2.227 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.462, mean=1.636, max=2.034, sum=11.452 (7)", - "tab": "Efficiency", - "score": 1.63604986000122 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 
62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=979.415, mean=1458.376, max=2550.115, sum=10208.634 (7)", - "tab": "General information", - "score": 1458.376275861588 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=0.691 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.892, mean=1.892, max=1.892, sum=1.892 (1)", - "tab": "Efficiency", - "score": 1.8916997435092926 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1163.818, mean=1163.818, max=1163.818, sum=1163.818 (1)", - "tab": "General information", - "score": 1163.818 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503, - "details": { - "description": "min=0.365, mean=0.503, max=0.842, sum=2.514 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.805, mean=0.864, max=1.071, sum=4.322 (5)", - "tab": "Efficiency", - "score": 0.8644844750252041 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 
0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=212.453, mean=1601.843, max=6618.612, sum=8009.215 (5)", - "tab": "General information", - "score": 1601.842950915631 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.632, - "details": { - "description": "min=0.632, mean=0.632, max=0.632, sum=0.632 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)", - "tab": "Efficiency", - "score": 0.8172814860258615 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1085.239, mean=1085.239, max=1085.239, sum=1085.239 (1)", - "tab": "General information", - "score": 1085.2385685884692 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.179, - "details": { - "description": "min=0.116, mean=0.179, max=0.21, sum=0.895 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.965, mean=0.978, max=0.99, sum=4.888 (5)", - "tab": "Efficiency", - "score": 0.9776749755042665 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=120.386, mean=151.077, max=189.223, sum=755.383 (5)", - "tab": "General information", - "score": 151.07662629989292 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - 
"language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json b/data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json deleted file mode 100644 index 9e1241a8e..000000000 --- a/data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json +++ /dev/null @@ -1,642 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/ai21_jamba-instruct/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jamba Instruct", - "id": "ai21/jamba-instruct", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6515730337078651 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.658, - "details": { - "description": "min=0.658, mean=0.658, max=0.658, sum=0.658 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.947, mean=0.947, max=0.947, sum=0.947 (1)", - "tab": "Efficiency", - "score": 0.9470622405199938 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=2555.434, mean=2555.434, max=2555.434, sum=2555.434 (1)", - "tab": "General information", - "score": 2555.4338028169013 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.384, - "details": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.384 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.809, mean=0.809, max=0.809, sum=0.809 (1)", - "tab": "Efficiency", - "score": 0.8087365460395813 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.535, mean=0.535, max=0.535, sum=0.535 (1)", - "tab": "Efficiency", - "score": 0.5348668487071991 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1774.04, mean=1774.04, max=1774.04, sum=1774.04 (1)", - "tab": "General information", - "score": 1774.04 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=118.377, mean=118.377, max=118.377, sum=118.377 (1)", - "tab": "General information", - "score": 118.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=0.796 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)", - "tab": "Efficiency", - "score": 0.30006033515930175 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General 
information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=195.75, mean=195.75, max=195.75, sum=195.75 (1)", - "tab": "General information", - "score": 195.75 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.582, - "details": { - "description": "min=0.36, mean=0.582, max=0.91, sum=2.909 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.253, mean=0.265, max=0.275, sum=1.327 (5)", - "tab": "Efficiency", - "score": 0.2654710942151254 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=315.59, mean=403.74, max=559.719, sum=2018.699 (5)", - "tab": "General information", - "score": 403.7398596491228 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.237, mean=0.38, max=0.607, sum=2.663 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.917, mean=3.242, max=5.09, sum=22.692 (7)", - "tab": "Efficiency", - "score": 3.24175411841349 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=796.795, mean=1321.422, max=2516.154, sum=9249.956 (7)", - "tab": "General information", - "score": 1321.42226282263 - 
}, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.846, mean=3.846, max=3.846, sum=3.846 (1)", - "tab": "Efficiency", - "score": 3.8455032846927644 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=823.394, mean=823.394, max=823.394, sum=823.394 (1)", - "tab": "General information", - "score": 823.394 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54, - "details": { - "description": "min=0.304, mean=0.54, max=0.874, sum=2.7 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.351, mean=0.641, max=1.337, sum=3.204 (5)", - "tab": "Efficiency", - "score": 0.6408480782672099 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=177.042, mean=1127.163, max=4612.308, sum=5635.817 (5)", - "tab": "General information", - "score": 1127.1634769381612 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - 
"function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.519, mean=0.519, max=0.519, sum=0.519 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.311 (1)", - "tab": "Efficiency", - "score": 0.31133864366747516 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=765.622, mean=765.622, max=765.622, sum=765.622 (1)", - "tab": "General information", - "score": 765.6222664015904 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.164, - "details": { - "description": "min=0.099, mean=0.164, max=0.205, sum=0.656 (4)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.586, mean=0.635, max=0.686, sum=2.542 (4)", - "tab": "Efficiency", - "score": 0.6354023076110767 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=585.25, max=832, sum=2341 (4)", - "tab": "General information", - "score": 585.25 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=129.229, mean=143.261, max=154.278, sum=573.045 (4)", - "tab": "General information", - "score": 143.26129939115307 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json b/data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json deleted file mode 100644 index b68794dd1..000000000 --- a/data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "helm_lite/allenai_olmo-7b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo 7B", - "id": "allenai/olmo-7b", - "developer": "allenai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.052, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6540574282147316 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.597, - "details": { - "description": "min=0.597, mean=0.597, max=0.597, sum=0.597 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.032, mean=1.032, max=1.032, sum=1.032 (1)", - "tab": "Efficiency", - "score": 1.0318688553823552 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.259, - "details": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.259 (1)", - "tab": "Accuracy", - "NaturalQuestions 
(open-book) - Observed inference time (s)": { - "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)", - "tab": "Efficiency", - "score": 0.9419968054294586 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.397 (1)", - "tab": "Efficiency", - "score": 0.3968301827907562 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.703, mean=4.703, max=4.703, sum=4.703 (1)", - "tab": "General information", - "score": 4.703 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.001, mean=1495.001, max=1495.001, sum=1495.001 (1)", - "tab": "General information", - "score": 1495.001 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.998, mean=0.998, max=0.998, sum=0.998 (1)", - "tab": "General information", - "score": 0.998 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.222, - "details": { - "description": "min=0.222, mean=0.222, max=0.222, sum=0.222 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.29 (1)", - "tab": "Efficiency", - "score": 0.2902843647003174 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=251.556, mean=251.556, max=251.556, sum=251.556 (1)", - "tab": "General information", - "score": 251.556 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - 
"tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.305, - "details": { - "description": "min=0.26, mean=0.305, max=0.38, sum=1.525 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.309, mean=0.326, max=0.346, sum=1.629 (5)", - "tab": "Efficiency", - "score": 0.325820258140564 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.029, - "details": { - "description": "min=0, mean=0.029, max=0.088, sum=0.205 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.79, mean=2.257, max=2.808, sum=15.8 (7)", - "tab": "Efficiency", - "score": 2.2571195842818583 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=3.173, mean=6.976, max=8, sum=48.831 (7)", - "tab": "General information", - "score": 6.9758530942741475 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=860.23, mean=1111.07, max=1508.423, sum=7777.488 (7)", - "tab": "General information", - "score": 1111.0696790674758 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": 
"False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.044, - "details": { - "description": "min=0.044, mean=0.044, max=0.044, sum=0.044 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.41, mean=2.41, max=2.41, sum=2.41 (1)", - "tab": "Efficiency", - "score": 2.4104921889305113 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=939.582, mean=939.582, max=939.582, sum=939.582 (1)", - "tab": "General information", - "score": 939.582 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.341, - "details": { - "description": "min=0.158, mean=0.341, max=0.6, sum=1.704 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.368, mean=0.502, max=0.929, sum=2.508 (5)", - "tab": "Efficiency", - "score": 0.5016753114389487 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.298, mean=3.86, max=5, sum=19.298 (5)", - "tab": "General information", - "score": 3.859591836734694 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.003, max=0.014, sum=0.014 (5)", - "tab": "General information", - "score": 0.002857142857142857 - }, - "LegalBench - # prompt tokens": { - "description": "min=206.779, mean=559.92, max=1493.837, sum=2799.602 (5)", - "tab": "General information", - "score": 559.9203981649337 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229, - "details": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.229 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.478 (1)", - "tab": "Efficiency", - "score": 0.47797848879698496 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=994.588, mean=994.588, max=994.588, sum=994.588 (1)", - "tab": "General information", - "score": 994.5884691848906 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.097, - "details": { - "description": "min=0.009, mean=0.097, max=0.157, sum=0.487 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.661, mean=0.771, max=0.925, sum=3.855 (5)", - "tab": "Efficiency", - "score": 0.7709201743273374 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=129.879, mean=144.948, max=167.177, sum=724.741 (5)", - "tab": "General information", - "score": 144.94816676861905 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json b/data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json deleted file mode 100644 index 084734ba7..000000000 --- a/data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json +++ /dev/null @@ -1,644 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/amazon_nova-lite-v1:0/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "Amazon Nova Lite", - "id": "amazon/nova-lite-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.708, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.9832833957553059 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.227 (1)", - "tab": "Efficiency", - "score": 0.22699436619718286 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3550.577, mean=3550.577, max=3550.577, sum=3550.577 (1)", - "tab": "General information", - "score": 3550.5774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.701, mean=4.701, max=4.701, sum=4.701 (1)", - "tab": "General information", - "score": 4.701408450704226 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.352, - "details": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.352 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.206, mean=0.206, max=0.206, sum=0.206 (1)", - "tab": "Efficiency", - "score": 0.20557699999999976 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.155, mean=0.155, max=0.155, sum=0.155 (1)", - "tab": 
"Efficiency", - "score": 0.15455700000000017 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1773.944, mean=1773.944, max=1773.944, sum=1773.944 (1)", - "tab": "General information", - "score": 1773.944 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=4.835, mean=4.835, max=4.835, sum=4.835 (1)", - "tab": "General information", - "score": 4.835 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=153.254, mean=153.254, max=153.254, sum=153.254 (1)", - "tab": "General information", - "score": 153.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.084, mean=4.084, max=4.084, sum=4.084 (1)", - "tab": "General information", - "score": 4.084 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=0.928 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.113, mean=0.113, max=0.113, sum=0.113 (1)", - "tab": "Efficiency", - "score": 0.11279599999999983 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=282.21, mean=282.21, max=282.21, sum=282.21 (1)", - "tab": "General information", - "score": 282.21 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.52, mean=0.693, max=0.92, sum=3.465 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.124, mean=0.13, max=0.136, sum=0.651 (5)", - "tab": "Efficiency", - "score": 0.13027701754385965 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=399.38, mean=500.274, max=652.07, sum=2501.37 (5)", - "tab": "General information", - "score": 500.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.579, mean=0.779, max=0.911, sum=5.45 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.693, mean=0.836, max=1.148, sum=5.85 (7)", - "tab": "Efficiency", - "score": 0.8356917305438115 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=925.556, mean=1394.735, max=2468.942, sum=9763.147 (7)", - "tab": "General information", - "score": 1394.7353092779651 - }, - "MATH - # output tokens": { - "description": "min=61.4, mean=78.742, max=112.526, sum=551.195 (7)", - "tab": "General information", - "score": 78.74214942544197 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] 
- }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=0.829 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.063, mean=1.063, max=1.063, sum=1.063 (1)", - "tab": "Efficiency", - "score": 1.0628889999999993 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=957.869, mean=957.869, max=957.869, sum=957.869 (1)", - "tab": "General information", - "score": 957.869 - }, - "GSM8K - # output tokens": { - "description": "min=84.074, mean=84.074, max=84.074, sum=84.074 (1)", - "tab": "General information", - "score": 84.074 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.368, mean=0.659, max=0.947, sum=3.297 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.118, mean=0.156, max=0.261, sum=0.782 (5)", - "tab": "Efficiency", - "score": 0.15639281489418358 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=241.632, mean=1581.083, max=6449.798, sum=7905.414 (5)", - "tab": "General information", - "score": 1581.0827222540588 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.488, max=2.6, sum=7.439 (5)", - "tab": "General information", - "score": 1.4878474114441418 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=0.696 (1)", 
- "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.132 (1)", - "tab": "Efficiency", - "score": 0.1322564612326044 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1066.861, mean=1066.861, max=1066.861, sum=1066.861 (1)", - "tab": "General information", - "score": 1066.8608349900596 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.204, - "details": { - "description": "min=0.126, mean=0.204, max=0.25, sum=1.021 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.304, mean=0.336, max=0.406, sum=1.68 (5)", - "tab": "Efficiency", - "score": 0.3359064091413061 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=163.93, mean=208.694, max=268.662, sum=1043.469 (5)", - "tab": "General information", - "score": 208.69386660804403 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.457, mean=29.543, max=42.627, sum=147.715 (5)", - "tab": "General information", - "score": 29.542975799051845 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json b/data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json deleted file mode 100644 index fb66c7744..000000000 --- a/data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json +++ /dev/null @@ -1,644 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/amazon_nova-micro-v1:0/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Micro", - "id": "amazon/nova-micro-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - 
"dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.998876404494382 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.744, - "details": { - "description": "min=0.744, mean=0.744, max=0.744, sum=0.744 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.196, mean=0.196, max=0.196, sum=0.196 (1)", - "tab": "Efficiency", - "score": 0.19638591549295767 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3550.577, mean=3550.577, max=3550.577, sum=3550.577 (1)", - "tab": "General information", - "score": 3550.5774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=3.961, mean=3.961, max=3.961, sum=3.961 (1)", - "tab": "General information", - "score": 3.96056338028169 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.285, - "details": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.19, mean=0.19, max=0.19, sum=0.19 (1)", - "tab": "Efficiency", - "score": 0.1897639999999999 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.133, mean=0.133, max=0.133, sum=0.133 (1)", - "tab": "Efficiency", - "score": 0.1334880000000001 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - 
"NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1773.944, mean=1773.944, max=1773.944, sum=1773.944 (1)", - "tab": "General information", - "score": 1773.944 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.911, mean=5.911, max=5.911, sum=5.911 (1)", - "tab": "General information", - "score": 5.911 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=153.254, mean=153.254, max=153.254, sum=153.254 (1)", - "tab": "General information", - "score": 153.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.515, mean=3.515, max=3.515, sum=3.515 (1)", - "tab": "General information", - "score": 3.515 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.888, - "details": { - "description": "min=0.888, mean=0.888, max=0.888, sum=0.888 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.104, mean=0.104, max=0.104, sum=0.104 (1)", - "tab": "Efficiency", - "score": 0.10389599999999993 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=282.21, mean=282.21, max=282.21, sum=282.21 (1)", - "tab": "General information", - "score": 282.21 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.42, mean=0.64, max=0.9, sum=3.2 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.113, mean=0.116, max=0.118, sum=0.579 (5)", - "tab": "Efficiency", - "score": 0.11572105263157897 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=399.38, mean=500.274, max=652.07, sum=2501.37 (5)", - "tab": "General information", - "score": 500.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.558, mean=0.76, max=0.895, sum=5.32 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.577, mean=0.79, max=1.132, sum=5.529 (7)", - "tab": "Efficiency", - "score": 0.7898264142267815 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=925.556, mean=1394.735, max=2468.942, sum=9763.147 (7)", - "tab": "General information", - "score": 1394.7353092779651 - }, - "MATH - # output tokens": { - "description": "min=75.368, mean=103.346, max=152.2, sum=723.421 (7)", - "tab": "General information", - "score": 103.34588937061396 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.794, - "details": { - "description": 
"min=0.794, mean=0.794, max=0.794, sum=0.794 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=0.895, mean=0.895, max=0.895, sum=0.895 (1)", - "tab": "Efficiency", - "score": 0.8952520000000004 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=957.869, mean=957.869, max=957.869, sum=957.869 (1)", - "tab": "General information", - "score": 957.869 - }, - "GSM8K - # output tokens": { - "description": "min=103.892, mean=103.892, max=103.892, sum=103.892 (1)", - "tab": "General information", - "score": 103.892 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615, - "details": { - "description": "min=0.368, mean=0.615, max=0.874, sum=3.074 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.108, mean=0.143, max=0.254, sum=0.713 (5)", - "tab": "Efficiency", - "score": 0.14263605160429277 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=241.632, mean=1581.083, max=6449.798, sum=7905.414 (5)", - "tab": "General information", - "score": 1581.0827222540588 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.665, max=2.926, sum=8.323 (5)", - "tab": "General information", - "score": 1.6646275687271896 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.608, - "details": { - "description": "min=0.608, mean=0.608, max=0.608, sum=0.608 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.118, mean=0.118, max=0.118, sum=0.118 (1)", - "tab": "Efficiency", - "score": 0.11825049701789252 - }, - "MedQA - # eval": { - "description": 
"min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1066.861, mean=1066.861, max=1066.861, sum=1066.861 (1)", - "tab": "General information", - "score": 1066.8608349900596 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.192, - "details": { - "description": "min=0.112, mean=0.192, max=0.241, sum=0.96 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.239, mean=0.268, max=0.333, sum=1.34 (5)", - "tab": "Efficiency", - "score": 0.26807757063388915 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=163.93, mean=208.694, max=268.662, sum=1043.469 (5)", - "tab": "General information", - "score": 208.69386660804403 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.38, mean=25.875, max=28.916, sum=129.377 (5)", - "tab": "General information", - "score": 25.875419597797826 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json b/data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json deleted file mode 100644 index c7f9d86e2..000000000 --- a/data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json +++ /dev/null @@ -1,644 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/amazon_nova-pro-v1:0/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Pro", - "id": "amazon/nova-pro-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many 
models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.9342571785268414 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=0.791 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.246, mean=0.246, max=0.246, sum=0.246 (1)", - "tab": "Efficiency", - "score": 0.24631830985915482 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3550.577, mean=3550.577, max=3550.577, sum=3550.577 (1)", - "tab": "General information", - "score": 3550.5774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.651, mean=4.651, max=4.651, sum=4.651 (1)", - "tab": "General information", - "score": 4.650704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405, - "details": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.405 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.266 (1)", - "tab": "Efficiency", - "score": 0.26591999999999993 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.203, mean=0.203, max=0.203, sum=0.203 (1)", - "tab": "Efficiency", - "score": 0.203244 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General 
information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1773.944, mean=1773.944, max=1773.944, sum=1773.944 (1)", - "tab": "General information", - "score": 1773.944 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.646, mean=5.646, max=5.646, sum=5.646 (1)", - "tab": "General information", - "score": 5.646 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=153.254, mean=153.254, max=153.254, sum=153.254 (1)", - "tab": "General information", - "score": 153.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.207, mean=4.207, max=4.207, sum=4.207 (1)", - "tab": "General information", - "score": 4.207 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=0.96 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.129 (1)", - "tab": "Efficiency", - "score": 0.12889800000000004 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=282.21, mean=282.21, max=282.21, sum=282.21 (1)", - "tab": "General information", - "score": 282.21 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.758, - "details": { - "description": "min=0.63, mean=0.758, max=0.93, sum=3.792 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.141, mean=0.145, max=0.152, 
sum=0.725 (5)", - "tab": "Efficiency", - "score": 0.1449304210526316 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=399.38, mean=500.274, max=652.07, sum=2501.37 (5)", - "tab": "General information", - "score": 500.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.821, - "details": { - "description": "min=0.7, mean=0.821, max=0.93, sum=5.749 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.139, mean=1.695, max=2.518, sum=11.863 (7)", - "tab": "Efficiency", - "score": 1.6947358347418935 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=925.556, mean=1394.735, max=2468.942, sum=9763.147 (7)", - "tab": "General information", - "score": 1394.7353092779651 - }, - "MATH - # output tokens": { - "description": "min=66.088, mean=98.114, max=154.135, sum=686.8 (7)", - "tab": "General information", - "score": 98.11425246180445 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=0.87 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.566, mean=1.566, max=1.566, sum=1.566 (1)", - "tab": "Efficiency", - "score": 1.5656869999999996 - }, - "GSM8K - # eval": { - "description": 
"min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=957.869, mean=957.869, max=957.869, sum=957.869 (1)", - "tab": "General information", - "score": 957.869 - }, - "GSM8K - # output tokens": { - "description": "min=73.847, mean=73.847, max=73.847, sum=73.847 (1)", - "tab": "General information", - "score": 73.847 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.444, mean=0.736, max=0.958, sum=3.681 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.139, mean=0.166, max=0.232, sum=0.83 (5)", - "tab": "Efficiency", - "score": 0.16605967288111284 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=241.632, mean=1581.083, max=6449.798, sum=7905.414 (5)", - "tab": "General information", - "score": 1581.0827222540588 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.387, max=2.358, sum=6.936 (5)", - "tab": "General information", - "score": 1.3871102825182848 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.142, mean=0.142, max=0.142, sum=0.142 (1)", - "tab": "Efficiency", - "score": 0.14219284294234621 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1066.861, mean=1066.861, max=1066.861, sum=1066.861 (1)", - "tab": "General information", - "score": 1066.8608349900596 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229, - "details": { - "description": "min=0.184, mean=0.229, max=0.281, sum=1.144 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.483, mean=0.504, max=0.519, sum=2.52 (5)", - "tab": "Efficiency", - "score": 0.5040968109611562 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=163.93, mean=208.694, max=268.662, sum=1043.469 (5)", - "tab": "General information", - "score": 208.69386660804403 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.903, mean=25.328, max=25.92, sum=126.641 (5)", - "tab": "General information", - "score": 25.32825594509864 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json b/data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json deleted file mode 100644 index ab0989b58..000000000 --- a/data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-2.0/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 2.0", - "id": "anthropic/claude-2.0", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.489, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, 
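
Every file removed by this patch shares the schema_version 0.2.0 layout visible above: source_metadata and model_info headers plus an evaluation_results list, where score_details.score carries the headline number for each scenario. A minimal Python sketch of reading one such record back, using only fields shown in this diff; the path names a file this patch deletes, so it would have to be read from a pre-deletion revision:

    import json

    # One of the per-model result files removed by this patch; check out a
    # revision prior to this commit for the file to exist on disk.
    path = "data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json"
    with open(path) as f:
        record = json.load(f)

    assert record["schema_version"] == "0.2.0"
    model = record["model_info"]["id"]  # e.g. "amazon/nova-pro-v1:0"
    for result in record["evaluation_results"]:
        name = result["evaluation_name"]          # "MMLU", "GSM8K", ...
        score = result["score_details"]["score"]  # headline scenario score
        print(f"{model}\t{name}\t{score}")
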
- "tab": "Efficiency", - "score": 0.14701622971285894 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=0.718 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=4.811, mean=4.811, max=4.811, sum=4.811 (1)", - "tab": "Efficiency", - "score": 4.8114360809326175 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)", - "tab": "General information", - "score": 3709.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=10.561, mean=10.561, max=10.561, sum=10.561 (1)", - "tab": "General information", - "score": 10.56056338028169 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428, - "details": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.428 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=2.984, mean=2.984, max=2.984, sum=2.984 (1)", - "tab": "Efficiency", - "score": 2.9841483016268606 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.149, mean=1.149, max=1.149, sum=1.149 (1)", - "tab": "Efficiency", - "score": 1.1486653406620027 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)", - "tab": "General information", - "score": 4.964 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1734.363, mean=1734.363, max=1734.363, sum=1734.363 (1)", - "tab": "General information", - "score": 1734.363 - }, - "NaturalQuestions 
(open-book) - # output tokens": { - "description": "min=7.605, mean=7.605, max=7.605, sum=7.605 (1)", - "tab": "General information", - "score": 7.605 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)", - "tab": "General information", - "score": 189.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=7.206, mean=7.206, max=7.206, sum=7.206 (1)", - "tab": "General information", - "score": 7.206 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=1.558, mean=1.558, max=1.558, sum=1.558 (1)", - "tab": "Efficiency", - "score": 1.5584912838935852 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=328.79, mean=328.79, max=328.79, sum=328.79 (1)", - "tab": "General information", - "score": 328.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.639, - "details": { - "description": "min=0.38, mean=0.639, max=0.9, sum=3.196 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.609, mean=1.728, max=1.936, sum=8.641 (5)", - "tab": "Efficiency", - "score": 1.7282055348597072 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - 
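
The per-metric details in these records pack their aggregates into a single string of the form "min=..., mean=..., max=..., sum=... (n)" rather than separate numeric fields, so downstream code has to parse the string. A regex sketch for recovering the numbers; the function name and pattern are ours, not part of the schema:

    import re

    _DESC = re.compile(
        r"min=(?P<min>[-\d.]+), mean=(?P<mean>[-\d.]+), "
        r"max=(?P<max>[-\d.]+), sum=(?P<sum>[-\d.]+) \((?P<n>\d+)\)"
    )

    def parse_description(desc: str) -> dict:
        """Split an aggregate string into its numeric components."""
        m = _DESC.fullmatch(desc)
        if m is None:
            raise ValueError(f"unrecognized description: {desc!r}")
        out = {k: float(v) for k, v in m.groupdict().items()}
        out["n"] = int(out["n"])
        return out

    # Taken verbatim from the MMLU record above:
    print(parse_description("min=0.38, mean=0.639, max=0.9, sum=3.196 (5)"))
    # -> {'min': 0.38, 'mean': 0.639, 'max': 0.9, 'sum': 3.196, 'n': 5}
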
"description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=435.26, mean=543.747, max=684.596, sum=2718.736 (5)", - "tab": "General information", - "score": 543.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603, - "details": { - "description": "min=0.491, mean=0.603, max=0.8, sum=4.219 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=5.057, mean=6.211, max=7.33, sum=43.477 (7)", - "tab": "Efficiency", - "score": 6.211058685420826 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=947.259, mean=1361.814, max=2379.808, sum=9532.699 (7)", - "tab": "General information", - "score": 1361.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=76.07, mean=96.474, max=115.288, sum=675.315 (7)", - "tab": "General information", - "score": 96.47352327848044 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.583, - "details": { - "description": "min=0.583, mean=0.583, max=0.583, sum=0.583 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.857, mean=4.857, max=4.857, sum=4.857 (1)", - "tab": "Efficiency", - "score": 4.857238686800003 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K 
- truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)", - "tab": "General information", - "score": 1012.712 - }, - "GSM8K - # output tokens": { - "description": "min=78.704, mean=78.704, max=78.704, sum=78.704 (1)", - "tab": "General information", - "score": 78.704 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.387, mean=0.643, max=0.947, sum=3.216 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=1.703, mean=2.782, max=6.2, sum=13.911 (5)", - "tab": "Efficiency", - "score": 2.782158235233088 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.99 (5)", - "tab": "General information", - "score": 4.797959183673469 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=280.653, mean=1621.356, max=6484.969, sum=8106.779 (5)", - "tab": "General information", - "score": 1621.3558670820687 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=3.338, max=11.058, sum=16.692 (5)", - "tab": "General information", - "score": 3.338449275778001 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.652, mean=0.652, max=0.652, sum=0.652 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=2.254, mean=2.254, max=2.254, sum=2.254 (1)", - "tab": "Efficiency", - "score": 2.2539968865055213 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1092.437, mean=1092.437, max=1092.437, sum=1092.437 (1)", - "tab": "General information", - "score": 1092.4373757455269 - }, - "MedQA - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219, - "details": { - "description": "min=0.159, mean=0.219, max=0.268, sum=1.095 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.692, mean=1.995, max=2.443, sum=9.976 (5)", - "tab": "Efficiency", - "score": 1.9951115173159082 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=197.406, mean=218.573, max=240.974, sum=1092.866 (5)", - "tab": "General information", - "score": 218.57322077152472 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.254, mean=25.653, max=26.374, sum=128.266 (5)", - "tab": "General information", - "score": 25.65316323214559 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json b/data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json deleted file mode 100644 index 2adbb62af..000000000 --- a/data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-2.1/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 2.1", - "id": "anthropic/claude-2.1", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.08012484394506866 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
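
"Mean win rate" is the one cross-model aggregate in these files: per its own evaluation_description, it averages, over the scenario columns, how many of the other models this model outperforms. A sketch of that computation under one plausible reading (fraction of strictly lower-scoring models per column, averaged over columns); the three-model, two-scenario table below is cut down from scores appearing in this diff:

    def mean_win_rate(scores: dict, model: str) -> float:
        """scores: model id -> {scenario: score}; every scenario in these
        files has lower_is_better false, so a higher score always wins."""
        rates = []
        for scenario, own in scores[model].items():
            others = [s[scenario] for m, s in scores.items()
                      if m != model and scenario in s]
            if others:  # fraction of other models strictly outperformed
                rates.append(sum(own > o for o in others) / len(others))
        return sum(rates) / len(rates)

    scores = {
        "amazon/nova-pro-v1:0": {"MMLU": 0.758, "GSM8K": 0.870},
        "anthropic/claude-2.0": {"MMLU": 0.639, "GSM8K": 0.583},
        "anthropic/claude-2.1": {"MMLU": 0.643, "GSM8K": 0.604},
    }
    print(mean_win_rate(scores, "anthropic/claude-2.1"))  # 0.5
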
"evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "description": "min=0.677, mean=0.677, max=0.677, sum=0.677 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=5.376, mean=5.376, max=5.376, sum=5.376 (1)", - "tab": "Efficiency", - "score": 5.376147254755799 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)", - "tab": "General information", - "score": 3709.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=12.431, mean=12.431, max=12.431, sum=12.431 (1)", - "tab": "General information", - "score": 12.430985915492958 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375, - "details": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.375 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=4.161, mean=4.161, max=4.161, sum=4.161 (1)", - "tab": "Efficiency", - "score": 4.16052336707216 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.753, mean=1.753, max=1.753, sum=1.753 (1)", - "tab": "Efficiency", - "score": 1.753281570672989 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)", - "tab": "General information", - "score": 4.964 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1734.363, mean=1734.363, max=1734.363, sum=1734.363 (1)", - "tab": "General information", - "score": 1734.363 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=19.738, mean=19.738, max=19.738, sum=19.738 (1)", - "tab": "General information", - "score": 19.738 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, 
sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)", - "tab": "General information", - "score": 189.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=11.053, mean=11.053, max=11.053, sum=11.053 (1)", - "tab": "General information", - "score": 11.053 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.872, mean=0.872, max=0.872, sum=0.872 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=1.809, mean=1.809, max=1.809, sum=1.809 (1)", - "tab": "Efficiency", - "score": 1.8090401072502136 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=328.79, mean=328.79, max=328.79, sum=328.79 (1)", - "tab": "General information", - "score": 328.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.4, mean=0.643, max=0.92, sum=3.216 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=2.043, mean=2.371, max=2.615, sum=11.855 (5)", - "tab": "Efficiency", - "score": 2.370939975420634 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt 
tokens": { - "description": "min=435.26, mean=543.747, max=684.596, sum=2718.736 (5)", - "tab": "General information", - "score": 543.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.632, - "details": { - "description": "min=0.5, mean=0.632, max=0.852, sum=4.425 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=9.158, mean=9.672, max=10.737, sum=67.703 (7)", - "tab": "Efficiency", - "score": 9.671810739168015 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=947.259, mean=1361.814, max=2379.808, sum=9532.699 (7)", - "tab": "General information", - "score": 1361.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=79.825, mean=96.72, max=120.842, sum=677.038 (7)", - "tab": "General information", - "score": 96.71972910810119 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.604, - "details": { - "description": "min=0.604, mean=0.604, max=0.604, sum=0.604 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=7.706, mean=7.706, max=7.706, sum=7.706 (1)", - "tab": "Efficiency", - "score": 7.7061755385398865 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)", - "tab": 
"General information", - "score": 1012.712 - }, - "GSM8K - # output tokens": { - "description": "min=98.553, mean=98.553, max=98.553, sum=98.553 (1)", - "tab": "General information", - "score": 98.553 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.406, mean=0.643, max=0.874, sum=3.214 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=2.23, mean=3.223, max=6.58, sum=16.113 (5)", - "tab": "Efficiency", - "score": 3.2225898594048035 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.99 (5)", - "tab": "General information", - "score": 4.797959183673469 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=280.653, mean=1621.356, max=6484.969, sum=8106.779 (5)", - "tab": "General information", - "score": 1621.3558670820687 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.455, max=2.137, sum=7.277 (5)", - "tab": "General information", - "score": 1.4554741431234763 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { - "description": "min=0.644, mean=0.644, max=0.644, sum=0.644 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=2.482, mean=2.482, max=2.482, sum=2.482 (1)", - "tab": "Efficiency", - "score": 2.482170646754695 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1092.437, mean=1092.437, max=1092.437, sum=1092.437 (1)", - "tab": "General information", - "score": 1092.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - 
"source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.204, - "details": { - "description": "min=0.148, mean=0.204, max=0.233, sum=1.021 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=2.478, mean=2.756, max=3.455, sum=13.78 (5)", - "tab": "Efficiency", - "score": 2.7559348208894425 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=197.406, mean=218.573, max=240.974, sum=1092.866 (5)", - "tab": "General information", - "score": 218.57322077152472 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.439, mean=25.235, max=26.058, sum=126.175 (5)", - "tab": "General information", - "score": 25.235038327725952 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json b/data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json deleted file mode 100644 index ff757a7ad..000000000 --- a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json +++ /dev/null @@ -1,644 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-haiku-20241022/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Haiku 20241022", - "id": "anthropic/claude-3-5-haiku-20241022", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.29044943820224717 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763, - "details": { - "description": "min=0.763, mean=0.763, max=0.763, sum=0.763 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.304, mean=1.304, max=1.304, sum=1.304 (1)", - "tab": "Efficiency", - "score": 1.3044010672770756 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3662.741, mean=3662.741, max=3662.741, sum=3662.741 (1)", - "tab": "General information", - "score": 3662.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.031, mean=7.031, max=7.031, sum=7.031 (1)", - "tab": "General information", - "score": 7.030985915492958 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344, - "details": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.344 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.41, mean=1.41, max=1.41, sum=1.41 (1)", - "tab": "Efficiency", - "score": 1.4098961477279663 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)", - "tab": "Efficiency", - "score": 0.7985508556365967 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1726.799, mean=1726.799, max=1726.799, sum=1726.799 (1)", - "tab": "General information", - "score": 1726.799 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=16.792, mean=16.792, max=16.792, sum=16.792 (1)", - "tab": "General information", - "score": 16.792 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 
(1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=134.259, mean=134.259, max=134.259, sum=134.259 (1)", - "tab": "General information", - "score": 134.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=18.429, mean=18.429, max=18.429, sum=18.429 (1)", - "tab": "General information", - "score": 18.429 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.9, mean=0.9, max=0.9, sum=0.9 (1)", - "tab": "Efficiency", - "score": 0.8996305031776428 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=263.79, mean=263.79, max=263.79, sum=263.79 (1)", - "tab": "General information", - "score": 263.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671, - "details": { - "description": "min=0.47, mean=0.671, max=0.94, sum=3.356 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.909, mean=1.002, max=1.196, sum=5.012 (5)", - "tab": "Efficiency", - "score": 1.0023672421856928 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=370.26, mean=478.747, max=619.596, sum=2393.736 (5)", - "tab": "General information", - "score": 478.747298245614 - }, - "MMLU - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.737, mean=0.872, max=0.988, sum=6.102 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.671, mean=5.707, max=14.928, sum=39.947 (7)", - "tab": "Efficiency", - "score": 5.706647422047061 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=873.259, mean=1287.814, max=2305.808, sum=9014.699 (7)", - "tab": "General information", - "score": 1287.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=165.86, mean=202.645, max=236.769, sum=1418.512 (7)", - "tab": "General information", - "score": 202.6446145676256 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.915, mean=3.915, max=3.915, sum=3.915 (1)", - "tab": "Efficiency", - "score": 3.915386771917343 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.712, mean=938.712, max=938.712, sum=938.712 (1)", - "tab": "General information", - "score": 938.712 - }, - "GSM8K - # output tokens": { - "description": "min=185.342, mean=185.342, max=185.342, sum=185.342 (1)", - "tab": "General 
information", - "score": 185.342 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.631, - "details": { - "description": "min=0, mean=0.631, max=0.947, sum=3.155 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.62, mean=1.383, max=2.1, sum=6.914 (5)", - "tab": "Efficiency", - "score": 1.3828645188221382 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=232.653, mean=1568.242, max=6432.398, sum=7841.208 (5)", - "tab": "General information", - "score": 1568.241581367783 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=6.998, max=29.403, sum=34.988 (5)", - "tab": "General information", - "score": 6.997580266743151 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "description": "min=0.722, mean=0.722, max=0.722, sum=0.722 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.99, mean=0.99, max=0.99, sum=0.99 (1)", - "tab": "Efficiency", - "score": 0.9896539864435822 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1027.437, mean=1027.437, max=1027.437, sum=1027.437 (1)", - "tab": "General information", - "score": 1027.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.135, - "details": { - "description": "min=0.077, mean=0.135, max=0.2, sum=0.675 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.889, mean=1.087, max=1.411, sum=5.434 (5)", - "tab": "Efficiency", - "score": 1.0867067574964768 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=131.406, mean=152.573, max=174.974, sum=762.866 (5)", - "tab": "General information", - "score": 152.5732207715247 - }, - "WMT 2014 - # output tokens": { - "description": "min=33.417, mean=46.766, max=62.029, sum=233.828 (5)", - "tab": "General information", - "score": 46.76561018504359 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json deleted file mode 100644 index 2c4b0d7d1..000000000 --- a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20240620/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Sonnet 20240620", - "id": "anthropic/claude-3-5-sonnet-20240620", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.27392009987515603 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=0.746 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=3.5, mean=3.5, max=3.5, sum=3.5 (1)", - "tab": "Efficiency", - "score": 3.5003784911733278 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3672.741, mean=3672.741, max=3672.741, sum=3672.741 (1)", - "tab": "General information", - "score": 3672.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.854, mean=7.854, max=7.854, sum=7.854 (1)", - "tab": "General information", - "score": 7.853521126760564 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "description": "min=0.502, mean=0.502, max=0.502, sum=0.502 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.834, mean=1.834, max=1.834, sum=1.834 (1)", - "tab": "Efficiency", - "score": 1.8338699455261231 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.739, mean=0.739, max=0.739, sum=0.739 (1)", - "tab": "Efficiency", - "score": 0.738832370519638 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1736.799, mean=1736.799, max=1736.799, sum=1736.799 (1)", - "tab": "General information", - "score": 1736.799 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=11.135, mean=11.135, max=11.135, sum=11.135 (1)", - "tab": "General information", - "score": 11.135 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 
(1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=144.259, mean=144.259, max=144.259, sum=144.259 (1)", - "tab": "General information", - "score": 144.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.069, mean=6.069, max=6.069, sum=6.069 (1)", - "tab": "General information", - "score": 6.069 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.972, - "details": { - "description": "min=0.972, mean=0.972, max=0.972, sum=0.972 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.774, mean=0.774, max=0.774, sum=0.774 (1)", - "tab": "Efficiency", - "score": 0.7740971641540527 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=272.79, mean=272.79, max=272.79, sum=272.79 (1)", - "tab": "General information", - "score": 272.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.799, - "details": { - "description": "min=0.59, mean=0.799, max=0.96, sum=3.997 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.765, mean=0.824, max=0.973, sum=4.121 (5)", - "tab": "Efficiency", - "score": 0.8242833791364703 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=379.26, mean=487.747, max=628.596, sum=2438.736 (5)", - "tab": "General information", - "score": 487.747298245614 - }, - "MMLU - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "description": "min=0.579, mean=0.813, max=0.953, sum=5.69 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.231, mean=3.012, max=3.921, sum=21.081 (7)", - "tab": "Efficiency", - "score": 3.0116338881061275 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=897.259, mean=1311.814, max=2329.808, sum=9182.699 (7)", - "tab": "General information", - "score": 1311.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=93.333, mean=143.948, max=207.442, sum=1007.635 (7)", - "tab": "General information", - "score": 143.9478793136688 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=0.949 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.163, mean=3.163, max=3.163, sum=3.163 (1)", - "tab": "Efficiency", - "score": 3.162740940093994 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.712, mean=938.712, max=938.712, sum=938.712 (1)", - "tab": "General information", - "score": 938.712 - }, - "GSM8K - # output tokens": { - "description": "min=165.163, mean=165.163, max=165.163, sum=165.163 (1)", - "tab": "General 
information", - "score": 165.163 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.707, - "details": { - "description": "min=0.455, mean=0.707, max=0.968, sum=3.533 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.66, mean=1.474, max=4.297, sum=7.369 (5)", - "tab": "Efficiency", - "score": 1.473749651523724 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=223.653, mean=1566.242, max=6437.398, sum=7831.208 (5)", - "tab": "General information", - "score": 1566.241581367783 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.328, max=2.053, sum=6.638 (5)", - "tab": "General information", - "score": 1.3276925283235337 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=0.825 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.199, mean=1.199, max=1.199, sum=1.199 (1)", - "tab": "Efficiency", - "score": 1.1990809397953406 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1036.437, mean=1036.437, max=1036.437, sum=1036.437 (1)", - "tab": "General information", - "score": 1036.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" 
- ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229, - "details": { - "description": "min=0.181, mean=0.229, max=0.27, sum=1.145 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.838, mean=1.923, max=2.007, sum=9.616 (5)", - "tab": "Efficiency", - "score": 1.9232725335746241 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=141.406, mean=162.573, max=184.974, sum=812.866 (5)", - "tab": "General information", - "score": 162.5732207715247 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.282, mean=25.852, max=26.592, sum=129.259 (5)", - "tab": "General information", - "score": 25.85177875057348 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json deleted file mode 100644 index 4b9824f13..000000000 --- a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20241022/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Sonnet 20241022", - "id": "anthropic/claude-3-5-sonnet-20241022", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.846, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.2994132334581773 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=0.77 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=41.561, mean=41.561, max=41.561, sum=41.561 (1)", - "tab": "Efficiency", - "score": 41.56126285405226 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3662.741, mean=3662.741, max=3662.741, sum=3662.741 (1)", - "tab": "General information", - "score": 3662.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.031, mean=7.031, max=7.031, sum=7.031 (1)", - "tab": "General information", - "score": 7.030985915492958 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.467, - "details": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=4.722, mean=4.722, max=4.722, sum=4.722 (1)", - "tab": "Efficiency", - "score": 4.721950803041458 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.659, mean=0.659, max=0.659, sum=0.659 (1)", - "tab": "Efficiency", - "score": 0.6590276186466217 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1726.799, mean=1726.799, max=1726.799, sum=1726.799 (1)", - "tab": "General information", - "score": 1726.799 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=14.702, mean=14.702, max=14.702, sum=14.702 (1)", - "tab": "General information", - "score": 14.702 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - 
"score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=134.259, mean=134.259, max=134.259, sum=134.259 (1)", - "tab": "General information", - "score": 134.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=8.63, mean=8.63, max=8.63, sum=8.63 (1)", - "tab": "General information", - "score": 8.63 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.966, - "details": { - "description": "min=0.966, mean=0.966, max=0.966, sum=0.966 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=1.256, mean=1.256, max=1.256, sum=1.256 (1)", - "tab": "Efficiency", - "score": 1.2558565106391906 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=263.79, mean=263.79, max=263.79, sum=263.79 (1)", - "tab": "General information", - "score": 263.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.63, mean=0.809, max=0.96, sum=4.047 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.66, mean=0.673, max=0.689, sum=3.367 (5)", - "tab": "Efficiency", - "score": 0.6733581468766195 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=370.26, mean=478.747, max=619.596, sum=2393.736 (5)", - "tab": "General information", - "score": 478.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - 
"abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.904, - "details": { - "description": "min=0.789, mean=0.904, max=0.985, sum=6.326 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.355, mean=4.052, max=4.718, sum=28.364 (7)", - "tab": "Efficiency", - "score": 4.0520609326088035 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=887.259, mean=1301.814, max=2319.808, sum=9112.699 (7)", - "tab": "General information", - "score": 1301.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=127.663, mean=168.831, max=213.077, sum=1181.819 (7)", - "tab": "General information", - "score": 168.831271579864 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.956, - "details": { - "description": "min=0.956, mean=0.956, max=0.956, sum=0.956 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.518, mean=3.518, max=3.518, sum=3.518 (1)", - "tab": "Efficiency", - "score": 3.5175547733306884 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.712, mean=938.712, max=938.712, sum=938.712 (1)", - "tab": "General information", - "score": 938.712 - }, - "GSM8K - # output tokens": { - "description": "min=141.152, mean=141.152, max=141.152, sum=141.152 (1)", - "tab": "General information", - "score": 141.152 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": 
"LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.647, - "details": { - "description": "min=0.283, mean=0.647, max=0.989, sum=3.237 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.559, mean=1.013, max=1.649, sum=5.065 (5)", - "tab": "Efficiency", - "score": 1.0130474324650445 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=232.653, mean=1568.242, max=6432.398, sum=7841.208 (5)", - "tab": "General information", - "score": 1568.241581367783 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=3.7, max=13.488, sum=18.498 (5)", - "tab": "General information", - "score": 3.6996529470816006 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)", - "tab": "Efficiency", - "score": 0.8153728936348947 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1027.437, mean=1027.437, max=1027.437, sum=1027.437 (1)", - "tab": "General information", - "score": 1027.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.226, - "details": { - "description": "min=0.174, mean=0.226, max=0.266, sum=1.128 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.838, mean=0.86, max=0.889, sum=4.301 (5)", - "tab": "Efficiency", - "score": 0.8602394085223064 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=141.406, mean=162.573, max=184.974, sum=812.866 (5)", - "tab": "General information", - "score": 162.5732207715247 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.825, mean=25.177, max=25.958, sum=125.887 (5)", - "tab": "General information", - "score": 25.177411492582966 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json b/data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json deleted file mode 100644 index 8eac62865..000000000 --- a/data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-haiku-20240307/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3 Haiku 20240307", - "id": "anthropic/claude-3-haiku-20240307", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.263, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5421473158551811 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.244, - "details": { - "description": "min=0.244, mean=0.244, max=0.244, sum=0.244 
(1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.133, mean=1.133, max=1.133, sum=1.133 (1)", - "tab": "Efficiency", - "score": 1.1334171402622277 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)", - "tab": "General information", - "score": 3709.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=44.265, mean=44.265, max=44.265, sum=44.265 (1)", - "tab": "General information", - "score": 44.264788732394365 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144, - "details": { - "description": "min=0.144, mean=0.144, max=0.144, sum=0.144 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.941, mean=0.941, max=0.941, sum=0.941 (1)", - "tab": "Efficiency", - "score": 0.9411524205207825 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.865, mean=0.865, max=0.865, sum=0.865 (1)", - "tab": "Efficiency", - "score": 0.8646892714500427 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1781.799, mean=1781.799, max=1781.799, sum=1781.799 (1)", - "tab": "General information", - "score": 1781.799 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=33.024, mean=33.024, max=33.024, sum=33.024 (1)", - "tab": "General information", - "score": 33.024 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)", - "tab": "General information", - "score": 189.259 - }, - 
"NaturalQuestions (closed-book) - # output tokens": { - "description": "min=50.787, mean=50.787, max=50.787, sum=50.787 (1)", - "tab": "General information", - "score": 50.787 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=0.838 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)", - "tab": "Efficiency", - "score": 0.6164444308280945 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=263.79, mean=263.79, max=263.79, sum=263.79 (1)", - "tab": "General information", - "score": 263.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - "description": "min=0.42, mean=0.662, max=0.95, sum=3.312 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.686, mean=0.697, max=0.721, sum=3.485 (5)", - "tab": "Efficiency", - "score": 0.6970766685050831 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=370.26, mean=478.747, max=619.596, sum=2393.736 (5)", - "tab": "General information", - "score": 478.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - 
"source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.131, - "details": { - "description": "min=0, mean=0.131, max=0.504, sum=0.916 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.672, mean=0.895, max=1.288, sum=6.265 (7)", - "tab": "Efficiency", - "score": 0.8950275982044664 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=948.259, mean=1362.814, max=2380.808, sum=9539.699 (7)", - "tab": "General information", - "score": 1362.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=3.158, mean=29.033, max=87.17, sum=203.231 (7)", - "tab": "General information", - "score": 29.032964841043174 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.699, - "details": { - "description": "min=0.699, mean=0.699, max=0.699, sum=0.699 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.228, mean=1.228, max=1.228, sum=1.228 (1)", - "tab": "Efficiency", - "score": 1.2278449382781982 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)", - "tab": "General information", - "score": 1012.712 - }, - "GSM8K - # output tokens": { - "description": "min=77.518, mean=77.518, max=77.518, sum=77.518 (1)", - "tab": "General information", - "score": 77.518 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.034, mean=0.46, max=0.779, sum=2.301 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.455, mean=0.719, max=0.988, sum=3.593 (5)", - "tab": "Efficiency", - "score": 0.7186767522236834 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=214.653, mean=1557.242, max=6428.398, sum=7786.208 (5)", - "tab": "General information", - "score": 1557.241581367783 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=9.565, max=28.352, sum=47.824 (5)", - "tab": "General information", - "score": 9.56470087480281 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=0.702 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.653, mean=0.653, max=0.653, sum=0.653 (1)", - "tab": "Efficiency", - "score": 0.6529203475588121 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1027.437, mean=1027.437, max=1027.437, sum=1027.437 (1)", - "tab": "General information", - "score": 1027.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148, - "details": { - "description": "min=0.018, mean=0.148, max=0.208, sum=0.74 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - 
"description": "min=0.627, mean=0.711, max=0.891, sum=3.556 (5)", - "tab": "Efficiency", - "score": 0.7111122513056886 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=198.406, mean=219.573, max=241.974, sum=1097.866 (5)", - "tab": "General information", - "score": 219.57322077152472 - }, - "WMT 2014 - # output tokens": { - "description": "min=27.598, mean=48.613, max=93.673, sum=243.065 (5)", - "tab": "General information", - "score": 48.6129454044961 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json b/data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json deleted file mode 100644 index d590c786e..000000000 --- a/data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-opus-20240229/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3 Opus 20240229", - "id": "anthropic/claude-3-opus-20240229", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.09124843945068664 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.351, - "details": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.351 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=3.996, mean=3.996, max=3.996, sum=3.996 (1)", - "tab": "Efficiency", - "score": 
3.9963467248728577 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)", - "tab": "General information", - "score": 3709.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=13.589, mean=13.589, max=13.589, sum=13.589 (1)", - "tab": "General information", - "score": 13.588732394366197 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.441, - "details": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.441 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=4.273, mean=4.273, max=4.273, sum=4.273 (1)", - "tab": "Efficiency", - "score": 4.273005393266678 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.647, mean=1.647, max=1.647, sum=1.647 (1)", - "tab": "Efficiency", - "score": 1.6471402559280395 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1781.799, mean=1781.799, max=1781.799, sum=1781.799 (1)", - "tab": "General information", - "score": 1781.799 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=39.248, mean=39.248, max=39.248, sum=39.248 (1)", - "tab": "General information", - "score": 39.248 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)", - "tab": "General information", - "score": 189.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.66, mean=5.66, max=5.66, sum=5.66 (1)", - "tab": "General information", - "score": 5.66 - } - } - }, - 
"generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.956, - "details": { - "description": "min=0.956, mean=0.956, max=0.956, sum=0.956 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=2.168, mean=2.168, max=2.168, sum=2.168 (1)", - "tab": "Efficiency", - "score": 2.167769320487976 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=263.79, mean=263.79, max=263.79, sum=263.79 (1)", - "tab": "General information", - "score": 263.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.6, mean=0.768, max=0.96, sum=3.839 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=4.003, mean=4.19, max=4.373, sum=20.948 (5)", - "tab": "Efficiency", - "score": 4.189554240862528 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=370.26, mean=478.747, max=619.596, sum=2393.736 (5)", - "tab": "General information", - "score": 478.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, 
- "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.526, mean=0.76, max=0.889, sum=5.322 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=6.095, mean=7.542, max=9.041, sum=52.793 (7)", - "tab": "Efficiency", - "score": 7.541890628266922 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=948.259, mean=1362.814, max=2380.808, sum=9539.699 (7)", - "tab": "General information", - "score": 1362.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=82.965, mean=113.906, max=138.263, sum=797.345 (7)", - "tab": "General information", - "score": 113.90635737624721 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=0.924 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=7.469, mean=7.469, max=7.469, sum=7.469 (1)", - "tab": "Efficiency", - "score": 7.469249876976013 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)", - "tab": "General information", - "score": 1012.712 - }, - "GSM8K - # output tokens": { - "description": "min=115.934, mean=115.934, max=115.934, sum=115.934 (1)", - "tab": "General information", - "score": 115.934 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - 
"description": "min=0.153, mean=0.662, max=0.989, sum=3.31 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=1.391, mean=2.57, max=4.856, sum=12.851 (5)", - "tab": "Efficiency", - "score": 2.570133829482505 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=214.653, mean=1557.242, max=6428.398, sum=7786.208 (5)", - "tab": "General information", - "score": 1557.241581367783 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.605, max=2.932, sum=8.023 (5)", - "tab": "General information", - "score": 1.6045285459659269 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.775, mean=0.775, max=0.775, sum=0.775 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=2.65, mean=2.65, max=2.65, sum=2.65 (1)", - "tab": "Efficiency", - "score": 2.6499544673601156 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1027.437, mean=1027.437, max=1027.437, sum=1027.437 (1)", - "tab": "General information", - "score": 1027.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.24, - "details": { - "description": "min=0.188, mean=0.24, max=0.285, sum=1.199 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=2.279, mean=2.447, max=2.661, sum=12.233 (5)", - "tab": "Efficiency", - "score": 2.4465377724275283 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - 
"tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=198.406, mean=219.573, max=241.974, sum=1097.866 (5)", - "tab": "General information", - "score": 219.57322077152472 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.332, mean=25.837, max=26.616, sum=129.185 (5)", - "tab": "General information", - "score": 25.837047426976607 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json b/data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json deleted file mode 100644 index 90baddbf7..000000000 --- a/data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-sonnet-20240229/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3 Sonnet 20240229", - "id": "anthropic/claude-3-sonnet-20240229", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.377, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.27500624219725345 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111, - "details": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.111 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.239, mean=2.239, max=2.239, sum=2.239 (1)", - "tab": "Efficiency", - "score": 2.2392607588163562 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": 
"min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)", - "tab": "General information", - "score": 3709.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=30.372, mean=30.372, max=30.372, sum=30.372 (1)", - "tab": "General information", - "score": 30.371830985915494 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.028, - "details": { - "description": "min=0.028, mean=0.028, max=0.028, sum=0.028 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.828, mean=1.828, max=1.828, sum=1.828 (1)", - "tab": "Efficiency", - "score": 1.828468058347702 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.226, mean=1.226, max=1.226, sum=1.226 (1)", - "tab": "Efficiency", - "score": 1.2262272393703462 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1781.799, mean=1781.799, max=1781.799, sum=1781.799 (1)", - "tab": "General information", - "score": 1781.799 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=31.113, mean=31.113, max=31.113, sum=31.113 (1)", - "tab": "General information", - "score": 31.113 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)", - "tab": "General information", - "score": 189.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=26.563, mean=26.563, max=26.563, sum=26.563 (1)", - "tab": "General information", - "score": 26.563 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.918, - "details": { - "description": "min=0.918, mean=0.918, max=0.918, sum=0.918 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=1.032, mean=1.032, max=1.032, sum=1.032 (1)", - "tab": "Efficiency", - "score": 1.031575677871704 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=263.79, mean=263.79, max=263.79, sum=263.79 (1)", - "tab": "General information", - "score": 263.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.39, mean=0.652, max=0.94, sum=3.26 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.228, mean=1.278, max=1.341, sum=6.391 (5)", - "tab": "Efficiency", - "score": 1.2781797420267473 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=370.26, mean=478.747, max=619.596, sum=2393.736 (5)", - "tab": "General information", - "score": 478.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.084, - "details": { - "description": "min=0, mean=0.084, max=0.337, sum=0.591 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.092, mean=2.33, max=2.633, sum=16.311 (7)", - "tab": "Efficiency", - "score": 2.3301560711519222 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=948.259, mean=1362.814, max=2380.808, sum=9539.699 (7)", - "tab": "General information", - "score": 1362.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=44.263, mean=52.374, max=62.256, sum=366.62 (7)", - "tab": "General information", - "score": 52.37429092508652 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=0.907 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.213, mean=3.213, max=3.213, sum=3.213 (1)", - "tab": "Efficiency", - "score": 3.2127642614841463 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)", - "tab": "General information", - "score": 1012.712 - }, - "GSM8K - # output tokens": { - "description": "min=114.663, mean=114.663, max=114.663, sum=114.663 (1)", - "tab": "General information", - "score": 114.663 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0.029, mean=0.49, max=0.958, sum=2.448 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.683, mean=1.316, max=2.689, sum=6.58 (5)", - "tab": 
"Efficiency", - "score": 1.3159105889028733 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=214.653, mean=1557.242, max=6428.398, sum=7786.208 (5)", - "tab": "General information", - "score": 1557.241581367783 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=9.202, max=27.753, sum=46.009 (5)", - "tab": "General information", - "score": 9.201869121421694 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.143, mean=1.143, max=1.143, sum=1.143 (1)", - "tab": "Efficiency", - "score": 1.1428523476033752 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1027.437, mean=1027.437, max=1027.437, sum=1027.437 (1)", - "tab": "General information", - "score": 1027.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.218, - "details": { - "description": "min=0.169, mean=0.218, max=0.25, sum=1.091 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.066, mean=1.139, max=1.228, sum=5.697 (5)", - "tab": "Efficiency", - "score": 1.1393479201068188 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { 
- "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=198.406, mean=219.573, max=241.974, sum=1097.866 (5)", - "tab": "General information", - "score": 219.57322077152472 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.517, mean=26.056, max=27.078, sum=130.278 (5)", - "tab": "General information", - "score": 26.05551068588469 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json b/data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json deleted file mode 100644 index c3ca60cb8..000000000 --- a/data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-instant-1.2/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude Instant 1.2", - "id": "anthropic/claude-instant-1.2", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.399, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.4998377028714107 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - "details": { - "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.491, mean=1.491, max=1.491, sum=1.491 (1)", - "tab": "Efficiency", - "score": 1.490500447447871 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt 
tokens": { - "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)", - "tab": "General information", - "score": 3709.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=17.149, mean=17.149, max=17.149, sum=17.149 (1)", - "tab": "General information", - "score": 17.149295774647886 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.343, - "details": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.343 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.975, mean=0.975, max=0.975, sum=0.975 (1)", - "tab": "Efficiency", - "score": 0.9746438981543135 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=0.674 (1)", - "tab": "Efficiency", - "score": 0.6736472499370575 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)", - "tab": "General information", - "score": 4.964 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1734.363, mean=1734.363, max=1734.363, sum=1734.363 (1)", - "tab": "General information", - "score": 1734.363 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.217, mean=8.217, max=8.217, sum=8.217 (1)", - "tab": "General information", - "score": 8.217 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)", - "tab": "General information", - "score": 189.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.113, mean=5.113, max=5.113, sum=5.113 (1)", - "tab": "General information", - "score": 5.113 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.844, - "details": { - "description": "min=0.844, mean=0.844, max=0.844, sum=0.844 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.597, mean=0.597, max=0.597, sum=0.597 (1)", - "tab": "Efficiency", - "score": 0.596853446483612 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=328.79, mean=328.79, max=328.79, sum=328.79 (1)", - "tab": "General information", - "score": 328.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.631, - "details": { - "description": "min=0.37, mean=0.631, max=0.9, sum=3.154 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.59, mean=0.614, max=0.636, sum=3.069 (5)", - "tab": "Efficiency", - "score": 0.613885824571576 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=435.26, mean=543.747, max=684.596, sum=2718.736 (5)", - "tab": "General information", - "score": 543.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.499, - "details": { - "description": "min=0.365, mean=0.499, max=0.704, sum=3.491 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.247, mean=1.403, 
max=1.528, sum=9.821 (7)", - "tab": "Efficiency", - "score": 1.4029501960147133 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=947.259, mean=1361.814, max=2379.808, sum=9532.699 (7)", - "tab": "General information", - "score": 1361.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=54.491, mean=65.956, max=76.513, sum=461.691 (7)", - "tab": "General information", - "score": 65.95586481608514 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.721, - "details": { - "description": "min=0.721, mean=0.721, max=0.721, sum=0.721 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.474, mean=1.474, max=1.474, sum=1.474 (1)", - "tab": "Efficiency", - "score": 1.474282945394516 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)", - "tab": "General information", - "score": 1012.712 - }, - "GSM8K - # output tokens": { - "description": "min=105.998, mean=105.998, max=105.998, sum=105.998 (1)", - "tab": "General information", - "score": 105.998 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586, - "details": { - "description": "min=0.341, mean=0.586, max=0.937, sum=2.931 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.629, mean=0.911, max=1.974, sum=4.555 (5)", - "tab": "Efficiency", - "score": 0.9110085331512334 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - 
"LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.99 (5)", - "tab": "General information", - "score": 4.797959183673469 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=280.653, mean=1621.356, max=6484.969, sum=8106.779 (5)", - "tab": "General information", - "score": 1621.3558670820687 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.646, max=2.219, sum=8.23 (5)", - "tab": "General information", - "score": 1.6459798365122615 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, mean=0.559, max=0.559, sum=0.559 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.763, mean=0.763, max=0.763, sum=0.763 (1)", - "tab": "Efficiency", - "score": 0.7633721221749399 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1092.437, mean=1092.437, max=1092.437, sum=1092.437 (1)", - "tab": "General information", - "score": 1092.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.194, - "details": { - "description": "min=0.138, mean=0.194, max=0.24, sum=0.971 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.726, mean=0.772, max=0.838, sum=3.859 (5)", - "tab": "Efficiency", - "score": 0.7717107724915095 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=197.406, 
mean=218.573, max=240.974, sum=1092.866 (5)", - "tab": "General information", - "score": 218.57322077152472 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.177, mean=25.579, max=26.326, sum=127.893 (5)", - "tab": "General information", - "score": 25.578513056277718 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json b/data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json deleted file mode 100644 index da3e6b3b3..000000000 --- a/data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-v1.3/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude v1.3", - "id": "anthropic/claude-v1.3", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.09352059925093632 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.723, mean=0.723, max=0.723, sum=0.723 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=6.114, mean=6.114, max=6.114, sum=6.114 (1)", - "tab": "Efficiency", - "score": 6.113923052666893 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)", - "tab": "General information", - "score": 3709.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": 
"min=9.338, mean=9.338, max=9.338, sum=9.338 (1)", - "tab": "General information", - "score": 9.338028169014084 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.409, - "details": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.409 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=3.523, mean=3.523, max=3.523, sum=3.523 (1)", - "tab": "Efficiency", - "score": 3.5226667501174913 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=2.059, mean=2.059, max=2.059, sum=2.059 (1)", - "tab": "Efficiency", - "score": 2.0589215233325957 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)", - "tab": "General information", - "score": 4.964 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1734.363, mean=1734.363, max=1734.363, sum=1734.363 (1)", - "tab": "General information", - "score": 1734.363 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=4.973, mean=4.973, max=4.973, sum=4.973 (1)", - "tab": "General information", - "score": 4.973 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)", - "tab": "General information", - "score": 189.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.908, mean=0.908, max=0.908, sum=0.908 (1)", - "tab": 
"Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=3.375, mean=3.375, max=3.375, sum=3.375 (1)", - "tab": "Efficiency", - "score": 3.375496371269226 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=328.79, mean=328.79, max=328.79, sum=328.79 (1)", - "tab": "General information", - "score": 328.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.631, - "details": { - "description": "min=0.35, mean=0.631, max=0.93, sum=3.155 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.228, mean=1.482, max=1.741, sum=7.41 (5)", - "tab": "Efficiency", - "score": 1.4820951028288456 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=435.26, mean=543.747, max=684.596, sum=2718.736 (5)", - "tab": "General information", - "score": 543.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54, - "details": { - "description": "min=0.368, mean=0.54, max=0.826, sum=3.783 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.85, mean=6.109, max=8.225, sum=42.762 (7)", - "tab": "Efficiency", - "score": 6.10879439056091 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - 
"MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=947.259, mean=1361.814, max=2379.808, sum=9532.699 (7)", - "tab": "General information", - "score": 1361.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=53.133, mean=79.493, max=97.564, sum=556.452 (7)", - "tab": "General information", - "score": 79.49312981320325 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.784, - "details": { - "description": "min=0.784, mean=0.784, max=0.784, sum=0.784 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=6.653, mean=6.653, max=6.653, sum=6.653 (1)", - "tab": "Efficiency", - "score": 6.653211696863174 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)", - "tab": "General information", - "score": 1012.712 - }, - "GSM8K - # output tokens": { - "description": "min=104.726, mean=104.726, max=104.726, sum=104.726 (1)", - "tab": "General information", - "score": 104.726 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.629, - "details": { - "description": "min=0.417, mean=0.629, max=0.916, sum=3.147 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=1.081, mean=3.536, max=8.614, sum=17.681 (5)", - "tab": "Efficiency", - "score": 3.536136101917547 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.99 (5)", - "tab": "General information", - "score": 4.797959183673469 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": 
"General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=280.653, mean=1621.356, max=6484.969, sum=8106.779 (5)", - "tab": "General information", - "score": 1621.3558670820687 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.354, max=2.232, sum=6.771 (5)", - "tab": "General information", - "score": 1.3542176968306323 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618, - "details": { - "description": "min=0.618, mean=0.618, max=0.618, sum=0.618 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=3.39, mean=3.39, max=3.39, sum=3.39 (1)", - "tab": "Efficiency", - "score": 3.3901417141643244 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1092.437, mean=1092.437, max=1092.437, sum=1092.437 (1)", - "tab": "General information", - "score": 1092.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219, - "details": { - "description": "min=0.152, mean=0.219, max=0.28, sum=1.093 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.391, mean=2.232, max=3.755, sum=11.161 (5)", - "tab": "Efficiency", - "score": 2.232213549153336 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=197.406, mean=218.573, max=240.974, sum=1092.866 (5)", - "tab": "General information", - "score": 218.57322077152472 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.004, mean=25.611, max=26.28, sum=128.057 (5)", - "tab": "General information", - 
"score": 25.611364027374215 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json b/data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json deleted file mode 100644 index a431f3338..000000000 --- a/data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/cohere_command-light/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Command Light", - "id": "cohere/command-light", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.46863920099875156 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.629, - "details": { - "description": "min=0.629, mean=0.629, max=0.629, sum=0.629 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.896, mean=0.896, max=0.896, sum=0.896 (1)", - "tab": "Efficiency", - "score": 0.8961316760157195 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.941, mean=1.941, max=1.941, sum=1.941 (1)", - "tab": "General information", - "score": 1.9408450704225353 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1660.485, mean=1660.485, max=1660.485, sum=1660.485 (1)", - "tab": "General information", - "score": 1660.4845070422534 - }, - "NarrativeQA - # output tokens": { - "description": "min=10.814, mean=10.814, max=10.814, sum=10.814 (1)", - "tab": "General information", - "score": 10.814084507042253 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.195, - "details": { - "description": "min=0.195, mean=0.195, max=0.195, sum=0.195 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.08, mean=1.08, max=1.08, sum=1.08 (1)", - "tab": "Efficiency", - "score": 1.0799305574893951 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.696, mean=0.696, max=0.696, sum=0.696 (1)", - "tab": "Efficiency", - "score": 0.6957695767879486 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.617, mean=4.617, max=4.617, sum=4.617 (1)", - "tab": "General information", - "score": 4.617 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1557.639, mean=1557.639, max=1557.639, sum=1557.639 (1)", - "tab": "General information", - "score": 1557.639 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=10.869, mean=10.869, max=10.869, sum=10.869 (1)", - "tab": "General information", - "score": 10.869 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=115.191, mean=115.191, max=115.191, sum=115.191 (1)", - "tab": "General information", - "score": 115.191 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=17.348, mean=17.348, max=17.348, sum=17.348 (1)", - "tab": "General information", - "score": 17.348 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398, - "details": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.398 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.705, mean=0.705, max=0.705, sum=0.705 (1)", - "tab": "Efficiency", - "score": 0.7049956932067871 - }, - "OpenbookQA - # 
eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=246.682, mean=246.682, max=246.682, sum=246.682 (1)", - "tab": "General information", - "score": 246.682 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386, - "details": { - "description": "min=0.25, mean=0.386, max=0.57, sum=1.928 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.405, mean=0.749, max=1.412, sum=3.747 (5)", - "tab": "Efficiency", - "score": 0.7494988910942747 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=2406.301 (5)", - "tab": "General information", - "score": 481.26021052631575 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.098, - "details": { - "description": "min=0.026, mean=0.098, max=0.167, sum=0.687 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.821, mean=2.374, max=2.948, sum=16.62 (7)", - "tab": "Efficiency", - "score": 2.374249639604042 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.962, mean=6.878, max=8, sum=48.146 (7)", - "tab": "General information", - "score": 6.877964141122035 - }, - "MATH - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=925.333, mean=1177.329, max=1534.058, sum=8241.302 (7)", - "tab": "General information", - "score": 1177.3289276411065 - }, - "MATH - # output tokens": { - "description": "min=83.228, mean=106.589, max=137.692, sum=746.121 (7)", - "tab": "General information", - "score": 106.58875792143844 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.149, - "details": { - "description": "min=0.149, mean=0.149, max=0.149, sum=0.149 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.751, mean=1.751, max=1.751, sum=1.751 (1)", - "tab": "Efficiency", - "score": 1.7514978868961335 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=942.424, mean=942.424, max=942.424, sum=942.424 (1)", - "tab": "General information", - "score": 942.424 - }, - "GSM8K - # output tokens": { - "description": "min=80.184, mean=80.184, max=80.184, sum=80.184 (1)", - "tab": "General information", - "score": 80.184 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397, - "details": { - "description": "min=0.173, mean=0.397, max=0.874, sum=1.983 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.423, mean=0.783, max=1.232, sum=3.916 (5)", - "tab": "Efficiency", - "score": 0.7831334660572837 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.388, mean=3.878, max=5, sum=19.388 (5)", - "tab": "General information", - "score": 3.8775510204081636 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.003, max=0.014, sum=0.014 (5)", - "tab": "General information", - "score": 0.002857142857142857 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.295, mean=566.501, max=1529.327, 
sum=2832.507 (5)", - "tab": "General information", - "score": 566.5014751745068 - }, - "LegalBench - # output tokens": { - "description": "min=1.074, mean=6.64, max=23.614, sum=33.198 (5)", - "tab": "General information", - "score": 6.63968330089529 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.312, - "details": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.312 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.896, mean=0.896, max=0.896, sum=0.896 (1)", - "tab": "Efficiency", - "score": 0.895831539901066 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1016.738, mean=1016.738, max=1016.738, sum=1016.738 (1)", - "tab": "General information", - "score": 1016.7375745526839 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.023, - "details": { - "description": "min=0.0, mean=0.023, max=0.064, sum=0.113 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.712, mean=0.797, max=0.934, sum=3.983 (5)", - "tab": "Efficiency", - "score": 0.7965989762712353 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=129.757, mean=149.459, max=178.821, sum=747.297 (5)", - "tab": "General information", - "score": 149.45941179844013 - }, - "WMT 2014 - # output tokens": { - "description": "min=30.895, mean=39.885, max=47.65, sum=199.426 (5)", - "tab": "General information", - "score": 39.88511765942805 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - 
"fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json b/data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json deleted file mode 100644 index d0f464767..000000000 --- a/data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/cohere_command-r-plus/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Command R Plus", - "id": "cohere/command-r-plus", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.441, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6927215980024969 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=0.735 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.659, mean=0.659, max=0.659, sum=0.659 (1)", - "tab": "Efficiency", - "score": 0.6590185803426823 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3442.654, mean=3442.654, max=3442.654, sum=3442.654 (1)", - "tab": "General information", - "score": 3442.6535211267606 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.343, - "details": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.343 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.48 (1)", - "tab": "Efficiency", - "score": 0.48011646389961243 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.217, mean=0.217, max=0.217, sum=0.217 (1)", - "tab": "Efficiency", - "score": 0.21743906450271605 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2069.055, mean=2069.055, max=2069.055, sum=2069.055 (1)", - "tab": "General information", - "score": 2069.055 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=160.159, mean=160.159, max=160.159, sum=160.159 (1)", - "tab": "General information", - "score": 160.159 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.526, mean=0.526, max=0.526, sum=0.526 (1)", - "tab": "Efficiency", - "score": 0.5261325912475586 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General 
information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=260.678, mean=260.678, max=260.678, sum=260.678 (1)", - "tab": "General information", - "score": 260.678 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.21, mean=0.59, max=0.89, sum=2.951 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.26, mean=0.359, max=0.481, sum=1.797 (5)", - "tab": "Efficiency", - "score": 0.3594088048349347 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.66, mean=499.49, max=661.579, sum=2497.449 (5)", - "tab": "General information", - "score": 499.48978947368425 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403, - "details": { - "description": "min=0.25, mean=0.403, max=0.607, sum=2.822 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.358, mean=1.792, max=2.877, sum=12.543 (7)", - "tab": "Efficiency", - "score": 1.7917883168992628 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=974.156, mean=1406.107, max=2423.596, sum=9842.752 (7)", - "tab": "General information", - "score": 
1406.1074103714861 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.738, - "details": { - "description": "min=0.738, mean=0.738, max=0.738, sum=0.738 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.592, mean=3.592, max=3.592, sum=3.592 (1)", - "tab": "Efficiency", - "score": 3.5923334171772003 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1158.893, mean=1158.893, max=1158.893, sum=1158.893 (1)", - "tab": "General information", - "score": 1158.893 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672, - "details": { - "description": "min=0.428, mean=0.672, max=0.947, sum=3.358 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.193, mean=0.351, max=0.927, sum=1.754 (5)", - "tab": "Efficiency", - "score": 0.3508069759610481 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=223.126, mean=1582.617, max=6507.029, sum=7913.085 (5)", - "tab": "General information", - "score": 1582.6169819753743 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - 
"corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "description": "min=0.567, mean=0.567, max=0.567, sum=0.567 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.631, mean=0.631, max=0.631, sum=0.631 (1)", - "tab": "Efficiency", - "score": 0.6308214294744533 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1062.905, mean=1062.905, max=1062.905, sum=1062.905 (1)", - "tab": "General information", - "score": 1062.9045725646124 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.203, - "details": { - "description": "min=0.156, mean=0.203, max=0.233, sum=1.017 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.59, mean=0.644, max=0.742, sum=3.221 (5)", - "tab": "Efficiency", - "score": 0.6441886008863676 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=114.404, mean=127.944, max=146.584, sum=639.721 (5)", - "tab": "General information", - "score": 127.94422599021257 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json b/data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json deleted file mode 100644 index 51821d155..000000000 --- a/data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json +++ 
/dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/cohere_command-r/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Command R", - "id": "cohere/command-r", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.9644069912609239 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=0.742 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.389 (1)", - "tab": "Efficiency", - "score": 0.3886059089445732 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3442.654, mean=3442.654, max=3442.654, sum=3442.654 (1)", - "tab": "General information", - "score": 3442.6535211267606 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.352, - "details": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.352 (1)", - "tab": "Accuracy", - "NaturalQuestions 
(open-book) - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.288 (1)", - "tab": "Efficiency", - "score": 0.2875482747554779 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.165, mean=0.165, max=0.165, sum=0.165 (1)", - "tab": "Efficiency", - "score": 0.16523362946510314 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2069.055, mean=2069.055, max=2069.055, sum=2069.055 (1)", - "tab": "General information", - "score": 2069.055 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=160.159, mean=160.159, max=160.159, sum=160.159 (1)", - "tab": "General information", - "score": 160.159 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=0.782 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.15, mean=0.15, max=0.15, sum=0.15 (1)", - "tab": "Efficiency", - "score": 0.14960159301757814 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=260.678, mean=260.678, max=260.678, sum=260.678 (1)", - "tab": "General information", - "score": 260.678 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - 
}, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "description": "min=0.33, mean=0.567, max=0.82, sum=2.836 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.162, mean=0.173, max=0.185, sum=0.867 (5)", - "tab": "Efficiency", - "score": 0.17335561692923832 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.66, mean=499.49, max=661.579, sum=2497.449 (5)", - "tab": "General information", - "score": 499.48978947368425 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.266, - "details": { - "description": "min=0.158, mean=0.266, max=0.333, sum=1.861 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.659, mean=0.821, max=1.104, sum=5.745 (7)", - "tab": "Efficiency", - "score": 0.8207379439676702 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=974.156, mean=1406.107, max=2423.596, sum=9842.752 (7)", - "tab": "General information", - "score": 1406.1074103714861 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - 
"evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.551, - "details": { - "description": "min=0.551, mean=0.551, max=0.551, sum=0.551 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.04, mean=1.04, max=1.04, sum=1.04 (1)", - "tab": "Efficiency", - "score": 1.0398468203544617 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1158.893, mean=1158.893, max=1158.893, sum=1158.893 (1)", - "tab": "General information", - "score": 1158.893 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0.211, mean=0.507, max=0.905, sum=2.534 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.151, mean=0.235, max=0.5, sum=1.174 (5)", - "tab": "Efficiency", - "score": 0.23478191454837286 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=223.126, mean=1582.617, max=6507.029, sum=7913.085 (5)", - "tab": "General information", - "score": 1582.6169819753743 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555, - "details": { - "description": "min=0.555, mean=0.555, max=0.555, sum=0.555 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.191, mean=0.191, max=0.191, sum=0.191 (1)", - "tab": "Efficiency", - "score": 0.19128861531585634 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1062.905, mean=1062.905, max=1062.905, sum=1062.905 (1)", - "tab": "General information", - "score": 1062.9045725646124 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.149, - "details": { - "description": "min=0.107, mean=0.149, max=0.175, sum=0.746 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.308, mean=0.343, max=0.455, sum=1.715 (5)", - "tab": "Efficiency", - "score": 0.3429552388299011 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=114.404, mean=127.944, max=146.584, sum=639.721 (5)", - "tab": "General information", - "score": 127.94422599021257 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json b/data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json deleted file mode 100644 index 488fa54b9..000000000 --- a/data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/cohere_command/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Command", - "id": "cohere/command", - "developer": "cohere", - "inference_platform": "unknown" - }, - 
"evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.21596754057428214 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.749, - "details": { - "description": "min=0.749, mean=0.749, max=0.749, sum=0.749 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.783, mean=1.783, max=1.783, sum=1.783 (1)", - "tab": "Efficiency", - "score": 1.783306110408944 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.941, mean=1.941, max=1.941, sum=1.941 (1)", - "tab": "General information", - "score": 1.9408450704225353 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1660.485, mean=1660.485, max=1660.485, sum=1660.485 (1)", - "tab": "General information", - "score": 1660.4845070422534 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.442, mean=7.442, max=7.442, sum=7.442 (1)", - "tab": "General information", - "score": 7.44225352112676 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391, - "details": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.391 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.804, mean=1.804, max=1.804, sum=1.804 (1)", - "tab": "Efficiency", - "score": 1.8040301027297974 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.986, mean=0.986, max=0.986, sum=0.986 (1)", - "tab": "Efficiency", - "score": 0.9856750283241272 - }, - "NaturalQuestions (open-book) - # eval": { - 
"description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.617, mean=4.617, max=4.617, sum=4.617 (1)", - "tab": "General information", - "score": 4.617 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1557.639, mean=1557.639, max=1557.639, sum=1557.639 (1)", - "tab": "General information", - "score": 1557.639 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.461, mean=8.461, max=8.461, sum=8.461 (1)", - "tab": "General information", - "score": 8.461 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=115.191, mean=115.191, max=115.191, sum=115.191 (1)", - "tab": "General information", - "score": 115.191 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.679, mean=5.679, max=5.679, sum=5.679 (1)", - "tab": "General information", - "score": 5.679 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.774, - "details": { - "description": "min=0.774, mean=0.774, max=0.774, sum=0.774 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=1.044, mean=1.044, max=1.044, sum=1.044 (1)", - "tab": "Efficiency", - "score": 1.0440752515792846 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=246.682, mean=246.682, max=246.682, sum=246.682 (1)", - "tab": "General information", - "score": 246.682 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525, - "details": { - "description": "min=0.27, mean=0.525, max=0.88, sum=2.626 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.821, mean=1.08, max=1.384, sum=5.399 (5)", - "tab": "Efficiency", - "score": 1.0797608851633573 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=2406.301 (5)", - "tab": "General information", - "score": 481.26021052631575 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.236, - "details": { - "description": "min=0.1, mean=0.236, max=0.349, sum=1.652 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=4.562, mean=5.762, max=6.509, sum=40.337 (7)", - "tab": "Efficiency", - "score": 5.762416239357385 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.962, mean=6.878, max=8, sum=48.146 (7)", - "tab": "General information", - "score": 6.877964141122035 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=925.333, mean=1177.329, max=1534.058, sum=8241.302 (7)", - "tab": "General information", - "score": 1177.3289276411065 - }, - "MATH - # output tokens": { - "description": "min=94.488, mean=116.49, max=135.115, sum=815.428 (7)", - "tab": "General information", - "score": 116.48968047229982 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.452 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.127, mean=4.127, max=4.127, sum=4.127 (1)", - "tab": "Efficiency", - "score": 4.127378141641617 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=942.424, mean=942.424, max=942.424, sum=942.424 (1)", - "tab": "General information", - "score": 942.424 - }, - "GSM8K - # output tokens": { - "description": "min=94.43, mean=94.43, max=94.43, sum=94.43 (1)", - "tab": "General information", - "score": 94.43 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.365, mean=0.578, max=0.884, sum=2.888 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.856, mean=1.165, max=1.842, sum=5.823 (5)", - "tab": "Efficiency", - "score": 1.1646721122881132 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.388, mean=3.878, max=5, sum=19.388 (5)", - "tab": "General information", - "score": 3.8775510204081636 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.003, max=0.014, sum=0.014 (5)", - "tab": "General information", - "score": 0.002857142857142857 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.295, mean=566.501, max=1529.327, sum=2832.507 (5)", - "tab": "General information", - "score": 566.5014751745068 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.79, max=3.055, sum=8.948 (5)", - "tab": "General information", - "score": 1.7895877106155815 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.445, - "details": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.445 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.234, mean=1.234, max=1.234, sum=1.234 (1)", - "tab": "Efficiency", - "score": 1.2344102347584416 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1016.738, mean=1016.738, max=1016.738, sum=1016.738 (1)", - "tab": "General information", - "score": 1016.7375745526839 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.088, - "details": { - "description": "min=0.013, mean=0.088, max=0.151, sum=0.441 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=2.376, mean=2.894, max=3.133, sum=14.469 (5)", - "tab": "Efficiency", - "score": 2.8937741082134893 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=129.757, mean=149.459, max=178.821, sum=747.297 (5)", - "tab": "General information", - "score": 149.45941179844013 - }, - "WMT 2014 - # output tokens": { - "description": "min=27.65, mean=31.8, max=41.789, sum=159.002 (5)", - "tab": "General information", - "score": 31.800405260743236 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json b/data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json deleted file mode 100644 index 9dc0aa32d..000000000 --- a/data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/databricks_dbrx-instruct/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DBRX Instruct", - "id": "databricks/dbrx-instruct", - "developer": "databricks", - 
"inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.289, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5229588014981273 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488, - "details": { - "description": "min=0.488, mean=0.488, max=0.488, sum=0.488 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.645, mean=1.645, max=1.645, sum=1.645 (1)", - "tab": "Efficiency", - "score": 1.6445875322315056 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3522.67, mean=3522.67, max=3522.67, sum=3522.67 (1)", - "tab": "General information", - "score": 3522.6704225352114 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.284, - "details": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.284 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.175, mean=1.175, max=1.175, sum=1.175 (1)", - "tab": "Efficiency", - "score": 1.1746999933719635 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.665, mean=0.665, max=0.665, sum=0.665 (1)", - "tab": "Efficiency", - "score": 0.6648788969516755 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, 
max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1762.593, mean=1762.593, max=1762.593, sum=1762.593 (1)", - "tab": "General information", - "score": 1762.593 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=173.127, mean=173.127, max=173.127, sum=173.127 (1)", - "tab": "General information", - "score": 173.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=0.91 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.328 (1)", - "tab": "Efficiency", - "score": 0.3277706532478333 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.782, mean=242.782, max=242.782, sum=242.782 (1)", - "tab": "General information", - "score": 242.782 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.34, mean=0.643, max=0.93, sum=3.215 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.39, mean=0.412, max=0.432, sum=2.062 (5)", - "tab": "Efficiency", - "score": 0.41247134314921857 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)", - "tab": "General information", - "score": 460.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358, - "details": { - "description": "min=0.015, mean=0.358, max=0.553, sum=2.509 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.531, mean=2.305, max=3.852, sum=16.138 (7)", - "tab": "Efficiency", - "score": 2.305378989452493 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)", - "tab": "General information", - "score": 1323.910874184069 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671, - "details": { - "description": 
"min=0.671, mean=0.671, max=0.671, sum=0.671 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.384, mean=2.384, max=2.384, sum=2.384 (1)", - "tab": "Efficiency", - "score": 2.3839432048797606 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)", - "tab": "General information", - "score": 1020.035 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426, - "details": { - "description": "min=0.053, mean=0.426, max=0.755, sum=2.13 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.366, mean=0.733, max=1.771, sum=3.667 (5)", - "tab": "Efficiency", - "score": 0.73349196183029 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=253.442, mean=1570.163, max=6357.388, sum=7850.815 (5)", - "tab": "General information", - "score": 1570.162971355988 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.694, mean=0.694, max=0.694, sum=0.694 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)", - "tab": "Efficiency", - "score": 0.4383622557221066 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - 
"MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)", - "tab": "General information", - "score": 1020.4135188866799 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.131, - "details": { - "description": "min=0.035, mean=0.131, max=0.192, sum=0.656 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.849, mean=1.059, max=1.342, sum=5.297 (5)", - "tab": "Efficiency", - "score": 1.0594140760888837 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=169.901, mean=193.043, max=213.185, sum=965.213 (5)", - "tab": "General information", - "score": 193.04258583116683 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json b/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json deleted file mode 100644 index 201ddf6e5..000000000 --- a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/deepseek-ai_deepseek-llm-67b-chat/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek LLM Chat 67B", - "id": "deepseek-ai/deepseek-llm-67b-chat", - "developer": "deepseek-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.30021223470661673 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581, - "details": { - "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=3.36, mean=3.36, max=3.36, sum=3.36 (1)", - "tab": "Efficiency", - "score": 3.359551859573579 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.946, mean=4.946, max=4.946, sum=4.946 (1)", - "tab": "General information", - "score": 4.946478873239436 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3583.146, mean=3583.146, max=3583.146, sum=3583.146 (1)", - "tab": "General information", - "score": 3583.1464788732396 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412, - "details": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=2.237, mean=2.237, max=2.237, sum=2.237 (1)", - "tab": "Efficiency", - "score": 2.2367931361198425 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.857, mean=0.857, max=0.857, sum=0.857 (1)", - "tab": "Efficiency", - "score": 0.8567402980327606 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.841, mean=4.841, max=4.841, sum=4.841 (1)", - "tab": "General information", - "score": 4.841 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.024, mean=0.024, max=0.024, sum=0.024 (1)", - "tab": "General information", - 
"score": 0.024 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2192.734, mean=2192.734, max=2192.734, sum=2192.734 (1)", - "tab": "General information", - "score": 2192.734 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=199.39, mean=199.39, max=199.39, sum=199.39 (1)", - "tab": "General information", - "score": 199.39 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=0.88 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.417 (1)", - "tab": "Efficiency", - "score": 0.41702947664260864 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=253.206, mean=253.206, max=253.206, sum=253.206 (1)", - "tab": "General information", - "score": 253.206 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.641, - "details": { - "description": "min=0.44, mean=0.641, max=0.91, sum=3.203 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.48, mean=0.508, max=0.551, sum=2.542 (5)", - "tab": "Efficiency", - "score": 
0.508463426874395 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=382.07, mean=490.941, max=646.667, sum=2454.707 (5)", - "tab": "General information", - "score": 490.9413333333334 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615, - "details": { - "description": "min=0.456, mean=0.615, max=0.748, sum=4.304 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.389, mean=4.443, max=6.234, sum=31.098 (7)", - "tab": "Efficiency", - "score": 4.442596748084942 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=1012.548, mean=1443.29, max=2448.25, sum=10103.027 (7)", - "tab": "General information", - "score": 1443.2895059403625 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795, - "details": { - "description": "min=0.795, mean=0.795, max=0.795, sum=0.795 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.877, mean=5.877, max=5.877, sum=5.877 (1)", - "tab": "Efficiency", - "score": 5.876643376111984 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - 
"score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1233.708, mean=1233.708, max=1233.708, sum=1233.708 (1)", - "tab": "General information", - "score": 1233.708 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "description": "min=0.45, mean=0.637, max=0.821, sum=3.183 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.524, mean=0.942, max=2.301, sum=4.71 (5)", - "tab": "Efficiency", - "score": 0.9420770218153176 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2.006, mean=4.201, max=5, sum=21.006 (5)", - "tab": "General information", - "score": 4.201224489795918 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=269.379, mean=990.259, max=3325.551, sum=4951.297 (5)", - "tab": "General information", - "score": 990.259348667894 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.628, - "details": { - "description": "min=0.628, mean=0.628, max=0.628, sum=0.628 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.83, mean=0.83, max=0.83, sum=0.83 (1)", - "tab": "Efficiency", - "score": 0.8296676231899982 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1084.235, mean=1084.235, 
max=1084.235, sum=1084.235 (1)", - "tab": "General information", - "score": 1084.234592445328 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.186, - "details": { - "description": "min=0.11, mean=0.186, max=0.236, sum=0.932 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.381, mean=1.429, max=1.464, sum=7.147 (5)", - "tab": "Efficiency", - "score": 1.429440071817079 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=203.736, mean=220.291, max=255.861, sum=1101.453 (5)", - "tab": "General information", - "score": 220.29060445022174 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json b/data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json deleted file mode 100644 index b5f8e240f..000000000 --- a/data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/deepseek-ai_deepseek-v3/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek v3", - "id": "deepseek-ai/deepseek-v3", - "developer": "deepseek-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.11454431960049938 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - 
"score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=0.796 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=6.44, mean=6.44, max=6.44, sum=6.44 (1)", - "tab": "Efficiency", - "score": 6.440373906954913 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3487.045, mean=3487.045, max=3487.045, sum=3487.045 (1)", - "tab": "General information", - "score": 3487.045070422535 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.467, - "details": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=5.606, mean=5.606, max=5.606, sum=5.606 (1)", - "tab": "Efficiency", - "score": 5.605930573940277 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=2.183, mean=2.183, max=2.183, sum=2.183 (1)", - "tab": "Efficiency", - "score": 2.1832692058086396 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1756.178, mean=1756.178, max=1756.178, sum=1756.178 (1)", - "tab": "General information", - "score": 1756.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, 
sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=131.205, mean=131.205, max=131.205, sum=131.205 (1)", - "tab": "General information", - "score": 131.205 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.954, - "details": { - "description": "min=0.954, mean=0.954, max=0.954, sum=0.954 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=1.746, mean=1.746, max=1.746, sum=1.746 (1)", - "tab": "Efficiency", - "score": 1.746311339378357 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=245.494, mean=245.494, max=245.494, sum=245.494 (1)", - "tab": "General information", - "score": 245.494 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.803, - "details": { - "description": "min=0.65, mean=0.803, max=0.92, sum=4.016 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.545, mean=0.564, max=0.585, sum=2.818 (5)", - "tab": "Efficiency", - "score": 0.5636642604125173 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - 
"description": "min=373.01, mean=465.871, max=613.535, sum=2329.355 (5)", - "tab": "General information", - "score": 465.8710175438597 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.912, - "details": { - "description": "min=0.816, mean=0.912, max=0.985, sum=6.385 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=7.691, mean=9.449, max=13.451, sum=66.142 (7)", - "tab": "Efficiency", - "score": 9.448914254379945 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=853.923, mean=1245.725, max=2184.846, sum=8720.075 (7)", - "tab": "General information", - "score": 1245.7249665607071 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=0.94 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=9.77, mean=9.77, max=9.77, sum=9.77 (1)", - "tab": "Efficiency", - "score": 9.76988450360298 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=961.041, mean=961.041, max=961.041, sum=961.041 (1)", - "tab": "General information", - "score": 961.041 - }, - "GSM8K - # output 
tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.425, mean=0.718, max=0.968, sum=3.589 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.566, mean=3.113, max=6.6, sum=15.563 (5)", - "tab": "Efficiency", - "score": 3.1125569474549435 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=198.516, mean=1498.765, max=6226.967, sum=7493.826 (5)", - "tab": "General information", - "score": 1498.7652695311654 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=0.809 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.79, mean=1.79, max=1.79, sum=1.79 (1)", - "tab": "Efficiency", - "score": 1.790037025751224 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=985.93, mean=985.93, max=985.93, sum=985.93 (1)", - "tab": "General information", - "score": 985.9304174950298 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.209, - "details": { - "description": "min=0.163, mean=0.209, max=0.252, sum=1.046 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=2.231, mean=2.677, max=3.02, sum=13.384 (5)", - "tab": "Efficiency", - "score": 2.6768779265693037 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=103.739, mean=118.596, max=138.616, sum=592.982 (5)", - "tab": "General information", - "score": 118.59634548478361 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json b/data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json deleted file mode 100644 index eabdc0bbd..000000000 --- a/data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-1.0-pro-002/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.0 Pro 002", - "id": "google/gemini-1.0-pro-002", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6464918851435706 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=0.751 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.679, mean=0.679, max=0.679, sum=0.679 (1)", - "tab": "Efficiency", - "score": 0.6791302858934104 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3447.994, mean=3447.994, max=3447.994, sum=3447.994 (1)", - "tab": "General information", - "score": 3447.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391, - "details": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.391 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.609, mean=0.609, max=0.609, sum=0.609 (1)", - "tab": "Efficiency", - "score": 0.6086829407215119 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.597, mean=0.597, max=0.597, sum=0.597 (1)", - "tab": "Efficiency", - "score": 0.5965619602203369 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1978.347, mean=1978.347, max=1978.347, sum=1978.347 (1)", - "tab": "General information", - "score": 1978.347 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) 
- # prompt tokens": { - "description": "min=153.995, mean=153.995, max=153.995, sum=153.995 (1)", - "tab": "General information", - "score": 153.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=0.788 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.43 (1)", - "tab": "Efficiency", - "score": 0.4301223816871643 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534, - "details": { - "description": "min=0.27, mean=0.534, max=0.81, sum=2.672 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.397, mean=0.407, max=0.417, sum=2.033 (5)", - "tab": "Efficiency", - "score": 0.4066482855060644 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - 
"econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.665, - "details": { - "description": "min=0.553, mean=0.665, max=0.859, sum=4.654 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.402, mean=1.585, max=2.083, sum=11.094 (7)", - "tab": "Efficiency", - "score": 1.5848151401531698 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=0.816 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.513, mean=1.513, max=1.513, sum=1.513 (1)", - "tab": "Efficiency", - "score": 1.513066102743149 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475, - "details": { - "description": "min=0.118, mean=0.475, max=0.811, sum=2.376 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.447, mean=0.609, max=1.08, sum=3.043 (5)", - "tab": "Efficiency", - "score": 0.6085789782066453 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=209.916, mean=1558.239, max=6423.569, sum=7791.193 (5)", - "tab": "General information", - "score": 1558.2386051001386 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.483, - "details": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.483 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.431 (1)", - "tab": "Efficiency", - "score": 0.4310008814610333 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.194, - "details": { - "description": 
"min=0.144, mean=0.194, max=0.231, sum=0.972 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.705, mean=0.803, max=0.924, sum=4.014 (5)", - "tab": "Efficiency", - "score": 0.8027491282517494 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=90.732, mean=120.97, max=147.366, sum=604.851 (5)", - "tab": "General information", - "score": 120.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json b/data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json deleted file mode 100644 index 991b81669..000000000 --- a/data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-1.5-flash-001/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Flash 001", - "id": "google/gemini-1.5-flash-001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.681960049937578 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.647, mean=0.647, max=0.647, sum=0.647 (1)", 
- "tab": "Efficiency", - "score": 0.6474363112991507 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3447.994, mean=3447.994, max=3447.994, sum=3447.994 (1)", - "tab": "General information", - "score": 3447.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.332, - "details": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.495, mean=0.495, max=0.495, sum=0.495 (1)", - "tab": "Efficiency", - "score": 0.49524100852012637 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.432 (1)", - "tab": "Efficiency", - "score": 0.431587886095047 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1978.347, mean=1978.347, max=1978.347, sum=1978.347 (1)", - "tab": "General information", - "score": 1978.347 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=153.995, mean=153.995, max=153.995, sum=153.995 (1)", - "tab": "General information", - "score": 153.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=0.928 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.504, mean=0.504, max=0.504, sum=0.504 (1)", - "tab": "Efficiency", - "score": 0.5038927392959595 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703, - "details": { - "description": "min=0.58, mean=0.703, max=0.93, sum=3.514 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.525, mean=0.568, max=0.62, sum=2.842 (5)", - "tab": "Efficiency", - "score": 0.5683523873948214 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - 
"metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753, - "details": { - "description": "min=0.632, mean=0.753, max=0.889, sum=5.269 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.303, mean=1.592, max=2.086, sum=11.144 (7)", - "tab": "Efficiency", - "score": 1.592031592636459 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=0.785 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.758, mean=1.758, max=1.758, sum=1.758 (1)", - "tab": "Efficiency", - "score": 1.7575640678405762 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.425, mean=0.661, max=0.968, 
sum=3.305 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.409, mean=0.604, max=0.842, sum=3.02 (5)", - "tab": "Efficiency", - "score": 0.6040551961526522 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=209.916, mean=1558.239, max=6423.569, sum=7791.193 (5)", - "tab": "General information", - "score": 1558.2386051001386 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "details": { - "description": "min=0.68, mean=0.68, max=0.68, sum=0.68 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.399, mean=0.399, max=0.399, sum=0.399 (1)", - "tab": "Efficiency", - "score": 0.3993651843165971 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.225, - "details": { - "description": "min=0.186, mean=0.225, max=0.253, sum=1.126 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.581, mean=0.637, max=0.75, sum=3.186 (5)", - "tab": "Efficiency", - "score": 0.6372637821067911 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - 
"description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=90.732, mean=120.97, max=147.366, sum=604.851 (5)", - "tab": "General information", - "score": 120.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json b/data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json deleted file mode 100644 index 725c639a2..000000000 --- a/data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-1.5-flash-002/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Flash 002", - "id": "google/gemini-1.5-flash-002", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.573, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.8933333333333333 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=0.746 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.443 (1)", - "tab": "Efficiency", - "score": 0.4433113621039824 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3437.994, mean=3437.994, max=3437.994, sum=3437.994 (1)", - "tab": "General information", - "score": 3437.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323, - "details": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.323 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.379 (1)", - "tab": "Efficiency", - "score": 0.37945408272743225 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.37 (1)", - "tab": "Efficiency", - "score": 0.36984835290908813 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1968.347, mean=1968.347, max=1968.347, sum=1968.347 (1)", - "tab": "General information", - "score": 1968.347 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=143.995, mean=143.995, max=143.995, sum=143.995 (1)", - "tab": "General information", - "score": 143.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=0.914 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.303 (1)", - "tab": "Efficiency", - "score": 0.302696533203125 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "description": "min=0.56, mean=0.679, max=0.81, sum=3.395 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.291, mean=0.296, max=0.299, sum=1.482 (5)", - "tab": "Efficiency", - "score": 0.296430273214976 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.816, mean=0.908, max=0.985, sum=6.354 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.701, mean=0.848, max=1.036, sum=5.939 
(7)", - "tab": "Efficiency", - "score": 0.8483759753773942 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328, - "details": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.328 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)", - "tab": "Efficiency", - "score": 0.8591284859287847 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.42, mean=0.67, max=0.979, sum=3.35 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.284, mean=0.347, max=0.541, sum=1.736 (5)", - "tab": "Efficiency", - "score": 0.34728255842366473 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": 
"General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=199.916, mean=1548.239, max=6413.569, sum=7741.193 (5)", - "tab": "General information", - "score": 1548.2386051001386 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.302 (1)", - "tab": "Efficiency", - "score": 0.30154310163873327 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.212, - "details": { - "description": "min=0.179, mean=0.212, max=0.232, sum=1.062 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.407, mean=0.424, max=0.444, sum=2.119 (5)", - "tab": "Efficiency", - "score": 0.42385545386168993 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=80.732, mean=110.97, max=137.366, sum=554.851 (5)", - "tab": "General information", - "score": 110.97025108961614 - }, - "WMT 2014 - # output 
tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json b/data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json deleted file mode 100644 index 8b7eab026..000000000 --- a/data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-1.5-pro-001/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Pro 001", - "id": "google/gemini-1.5-pro-001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.4783520599250936 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.835, mean=0.835, max=0.835, sum=0.835 (1)", - "tab": "Efficiency", - "score": 0.8351484166930544 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3447.994, mean=3447.994, max=3447.994, sum=3447.994 (1)", - "tab": "General information", - "score": 3447.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - 
}, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378, - "details": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.378 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.717, mean=0.717, max=0.717, sum=0.717 (1)", - "tab": "Efficiency", - "score": 0.7170397922992706 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.634, mean=0.634, max=0.634, sum=0.634 (1)", - "tab": "Efficiency", - "score": 0.6341883151531219 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1978.347, mean=1978.347, max=1978.347, sum=1978.347 (1)", - "tab": "General information", - "score": 1978.347 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=153.995, mean=153.995, max=153.995, sum=153.995 (1)", - "tab": "General information", - "score": 153.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=0.902 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)", - "tab": "Efficiency", - "score": 0.6239193634986877 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, 
max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.62, mean=0.772, max=0.93, sum=3.858 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.65, mean=0.69, max=0.763, sum=3.451 (5)", - "tab": "Efficiency", - "score": 0.6902154895882857 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.692, mean=0.825, max=0.956, sum=5.773 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.006, mean=2.701, max=3.274, sum=18.91 (7)", - "tab": "Efficiency", - "score": 2.701360058859101 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - 
"score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=0.836 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.206, mean=3.206, max=3.206, sum=3.206 (1)", - "tab": "Efficiency", - "score": 3.205789808034897 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757, - "details": { - "description": "min=0.46, mean=0.757, max=1, sum=3.786 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.577, mean=0.775, max=1.078, sum=3.876 (5)", - "tab": "Efficiency", - "score": 0.7752882438000996 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=209.916, mean=1558.239, max=6423.569, sum=7791.193 (5)", - "tab": "General information", - "score": 1558.2386051001386 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, 
sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.692, - "details": { - "description": "min=0.692, mean=0.692, max=0.692, sum=0.692 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=0.53 (1)", - "tab": "Efficiency", - "score": 0.5296737767785669 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.189, - "details": { - "description": "min=0.118, mean=0.189, max=0.252, sum=0.946 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.029, mean=1.14, max=1.4, sum=5.7 (5)", - "tab": "Efficiency", - "score": 1.1399874632845124 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=90.732, mean=120.97, max=147.366, sum=604.851 (5)", - "tab": "General information", - "score": 120.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json 
b/data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json deleted file mode 100644 index ebd3081fb..000000000 --- a/data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-1.5-pro-002/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Pro 002", - "id": "google/gemini-1.5-pro-002", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.49837702871410733 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.912, mean=0.912, max=0.912, sum=0.912 (1)", - "tab": "Efficiency", - "score": 0.9118197140368548 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3437.994, mean=3437.994, max=3437.994, sum=3437.994 (1)", - "tab": "General information", - "score": 3437.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455, - "details": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.455 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)", - "tab": "Efficiency", - "score": 0.6156208164691925 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.539, mean=0.539, max=0.539, sum=0.539 (1)", - "tab": "Efficiency", - "score": 0.5389571013450623 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1968.347, mean=1968.347, max=1968.347, sum=1968.347 (1)", - "tab": "General information", - "score": 1968.347 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=143.995, mean=143.995, max=143.995, sum=143.995 (1)", - "tab": "General information", - "score": 143.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.952, - "details": { - "description": "min=0.952, mean=0.952, max=0.952, sum=0.952 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.453 (1)", - "tab": "Efficiency", - "score": 0.45284647941589357 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - 
"description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795, - "details": { - "description": "min=0.67, mean=0.795, max=0.94, sum=3.973 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.453, mean=0.977, max=1.671, sum=4.883 (5)", - "tab": "Efficiency", - "score": 0.9766287260557476 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.789, mean=0.92, max=1, sum=6.44 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.157, mean=3.273, max=4.064, sum=22.911 (7)", - "tab": "Efficiency", - "score": 3.2730091876347354 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - 
"subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.161, mean=3.161, max=3.161, sum=3.161 (1)", - "tab": "Efficiency", - "score": 3.1614130451679228 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.747, - "details": { - "description": "min=0.439, mean=0.747, max=0.968, sum=3.735 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.489, mean=0.596, max=0.915, sum=2.982 (5)", - "tab": "Efficiency", - "score": 0.596480936304943 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=199.916, mean=1548.239, max=6413.569, sum=7741.193 (5)", - "tab": "General information", - "score": 1548.2386051001386 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.771, - "details": { - "description": "min=0.771, mean=0.771, max=0.771, sum=0.771 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=0.53 (1)", - "tab": "Efficiency", - "score": 0.5296175953882115 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.231, - "details": { - "description": "min=0.192, mean=0.231, max=0.261, sum=1.156 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.049, mean=1.108, max=1.147, sum=5.541 (5)", - "tab": "Efficiency", - "score": 1.1081515031376248 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=80.732, mean=110.97, max=137.366, sum=554.851 (5)", - "tab": "General information", - "score": 110.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json b/data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json deleted file mode 100644 index b96b71c0c..000000000 --- a/data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json +++ /dev/null @@ -1,644 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-2.0-flash-exp/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - 
"source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 2.0 Flash Experimental", - "id": "google/gemini-2.0-flash-exp", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7398626716604245 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.512, mean=0.512, max=0.512, sum=0.512 (1)", - "tab": "Efficiency", - "score": 0.5123653337359428 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3437.994, mean=3437.994, max=3437.994, sum=3437.994 (1)", - "tab": "General information", - "score": 3437.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443, - "details": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.443 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.462, mean=0.462, max=0.462, sum=0.462 (1)", - "tab": "Efficiency", - "score": 0.4622749860286713 - }, 
- "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.417 (1)", - "tab": "Efficiency", - "score": 0.4170585689544678 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1968.347, mean=1968.347, max=1968.347, sum=1968.347 (1)", - "tab": "General information", - "score": 1968.347 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=143.995, mean=143.995, max=143.995, sum=143.995 (1)", - "tab": "General information", - "score": 143.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.946, - "details": { - "description": "min=0.946, mean=0.946, max=0.946, sum=0.946 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.391 (1)", - "tab": "Efficiency", - "score": 0.39134009742736814 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - 
"source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.56, mean=0.717, max=0.83, sum=3.583 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.405, mean=0.409, max=0.414, sum=2.043 (5)", - "tab": "Efficiency", - "score": 0.4086059420652557 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.788, mean=0.901, max=0.985, sum=6.309 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.049, mean=1.506, max=2.041, sum=10.543 (7)", - "tab": "Efficiency", - "score": 1.5061902186836522 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.946, - "details": { - "description": "min=0.946, mean=0.946, max=0.946, sum=0.946 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "Efficiency", - "score": 1.4374724824428557 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674, - "details": { - "description": "min=0.237, mean=0.674, max=0.989, sum=3.371 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.454, mean=0.547, max=0.655, sum=2.737 (5)", - "tab": "Efficiency", - "score": 0.5473698430089784 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=216.916, mean=1559.239, max=6418.569, sum=7796.193 (5)", - "tab": "General information", - "score": 1559.2386051001386 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - 
"details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=0.73 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.407 (1)", - "tab": "Efficiency", - "score": 0.4071517047540805 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.212, - "details": { - "description": "min=0.154, mean=0.212, max=0.242, sum=1.059 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.639, mean=0.725, max=0.883, sum=3.624 (5)", - "tab": "Efficiency", - "score": 0.7247073432282998 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=80.732, mean=110.97, max=137.366, sum=554.851 (5)", - "tab": "General information", - "score": 110.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json b/data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json deleted file mode 100644 index ea107cc9e..000000000 --- a/data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemma-2-27b-it/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma 2 Instruct 27B", - "id": "google/gemma-2-27b-it", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win 
rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7407490636704119 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=0.79 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=0.66 (1)", - "tab": "Efficiency", - "score": 0.6603116545878666 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3437.994, mean=3437.994, max=3437.994, sum=3437.994 (1)", - "tab": "General information", - "score": 3437.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.353, - "details": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.486 (1)", - "tab": "Efficiency", - "score": 0.4863240420818329 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.358 (1)", - "tab": "Efficiency", - "score": 0.35805381870269776 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions 
(open-book) - # train": { - "description": "min=4.953, mean=4.953, max=4.953, sum=4.953 (1)", - "tab": "General information", - "score": 4.953 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.009, mean=0.009, max=0.009, sum=0.009 (1)", - "tab": "General information", - "score": 0.009 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1911.526, mean=1911.526, max=1911.526, sum=1911.526 (1)", - "tab": "General information", - "score": 1911.526 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=143.995, mean=143.995, max=143.995, sum=143.995 (1)", - "tab": "General information", - "score": 143.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=0.993 (1)", - "tab": "General information", - "score": 0.993 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.918, - "details": { - "description": "min=0.918, mean=0.918, max=0.918, sum=0.918 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.327 (1)", - "tab": "Efficiency", - "score": 0.3270734968185425 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664, - "details": { - "description": "min=0.44, mean=0.664, max=0.93, sum=3.32 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.317, mean=0.329, max=0.337, sum=1.643 (5)", - "tab": "Efficiency", - "score": 0.3286796834259702 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.513, mean=0.746, max=0.93, sum=5.219 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.515, mean=1.903, max=2.648, sum=13.324 (7)", - "tab": "Efficiency", - "score": 1.9034432935092742 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, 
sum=0.812 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.332, mean=2.332, max=2.332, sum=2.332 (1)", - "tab": "Efficiency", - "score": 2.3315503742694856 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.439, mean=0.7, max=0.979, sum=3.499 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.328, mean=0.44, max=0.796, sum=2.202 (5)", - "tab": "Efficiency", - "score": 0.4403507251683155 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=199.916, mean=1546.699, max=6405.871, sum=7733.495 (5)", - "tab": "General information", - "score": 1546.699013263404 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.451 (1)", - "tab": "Efficiency", - "score": 0.4512898187277094 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # 
train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.214, - "details": { - "description": "min=0.167, mean=0.214, max=0.241, sum=1.072 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.666, mean=0.698, max=0.715, sum=3.492 (5)", - "tab": "Efficiency", - "score": 0.6983992647690125 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=80.732, mean=110.97, max=137.366, sum=554.851 (5)", - "tab": "General information", - "score": 110.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json b/data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json deleted file mode 100644 index 1488d6604..000000000 --- a/data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemma-2-9b-it/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma 2 Instruct 9B", - "id": "google/gemma-2-9b-it", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.8286641697877652 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.593, mean=0.593, max=0.593, sum=0.593 (1)", - "tab": "Efficiency", - "score": 0.5928616705075116 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3437.994, mean=3437.994, max=3437.994, sum=3437.994 (1)", - "tab": "General information", - "score": 3437.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328, - "details": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.328 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.446 (1)", - "tab": "Efficiency", - "score": 0.44568803215026853 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.337 (1)", - "tab": "Efficiency", - "score": 0.337234415769577 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.953, mean=4.953, max=4.953, sum=4.953 (1)", - "tab": "General information", - "score": 4.953 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.009, mean=0.009, max=0.009, sum=0.009 (1)", - "tab": "General information", - "score": 0.009 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1911.526, 
mean=1911.526, max=1911.526, sum=1911.526 (1)", - "tab": "General information", - "score": 1911.526 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=143.995, mean=143.995, max=143.995, sum=143.995 (1)", - "tab": "General information", - "score": 143.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=0.91 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.306 (1)", - "tab": "Efficiency", - "score": 0.3059106550216675 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645, - "details": { - "description": "min=0.42, mean=0.645, max=0.91, sum=3.225 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.299, mean=0.319, max=0.334, sum=1.594 (5)", - "tab": "Efficiency", - "score": 0.3187573717686168 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 
(5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.635, mean=0.724, max=0.907, sum=5.071 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.006, mean=1.344, max=1.765, sum=9.409 (7)", - "tab": "Efficiency", - "score": 1.3440718759718908 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.762, mean=0.762, max=0.762, sum=0.762 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.72, mean=1.72, max=1.72, sum=1.72 (1)", - "tab": "Efficiency", - "score": 1.720498773097992 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": 
"General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.639, - "details": { - "description": "min=0.395, mean=0.639, max=0.937, sum=3.193 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.31, mean=0.384, max=0.652, sum=1.92 (5)", - "tab": "Efficiency", - "score": 0.3840073023663075 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=199.916, mean=1546.699, max=6405.871, sum=7733.495 (5)", - "tab": "General information", - "score": 1546.699013263404 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "details": { - "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.316 (1)", - "tab": "Efficiency", - "score": 0.3161872125288127 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - 
}, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.201, - "details": { - "description": "min=0.155, mean=0.201, max=0.228, sum=1.003 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.526, mean=0.633, max=0.82, sum=3.165 (5)", - "tab": "Efficiency", - "score": 0.6330890842213928 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=80.732, mean=110.97, max=137.366, sum=554.851 (5)", - "tab": "General information", - "score": 110.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json b/data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json deleted file mode 100644 index 810e32965..000000000 --- a/data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemma-7b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma 7B", - "id": "google/gemma-7b", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7896629213483146 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { 
- "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=0.909 (1)", - "tab": "Efficiency", - "score": 0.9086058952438999 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3411.994, mean=3411.994, max=3411.994, sum=3411.994 (1)", - "tab": "General information", - "score": 3411.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336, - "details": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.336 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.591, mean=0.591, max=0.591, sum=0.591 (1)", - "tab": "Efficiency", - "score": 0.5911745510101318 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.343 (1)", - "tab": "Efficiency", - "score": 0.3430815353393555 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.94, mean=4.94, max=4.94, sum=4.94 (1)", - "tab": "General information", - "score": 4.94 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.01, mean=0.01, max=0.01, sum=0.01 (1)", - "tab": "General information", - "score": 0.01 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1879.978, mean=1879.978, max=1879.978, sum=1879.978 (1)", - "tab": "General information", - "score": 1879.978 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { 
- "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=125.995, mean=125.995, max=125.995, sum=125.995 (1)", - "tab": "General information", - "score": 125.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=0.808 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.282 (1)", - "tab": "Efficiency", - "score": 0.28152281618118286 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=240.508, mean=240.508, max=240.508, sum=240.508 (1)", - "tab": "General information", - "score": 240.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.571, - "details": { - "description": "min=0.28, mean=0.571, max=0.87, sum=2.854 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.251, mean=0.273, max=0.293, sum=1.367 (5)", - "tab": "Efficiency", - "score": 0.27346607242550763 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.91, mean=473.531, max=626.553, sum=2367.653 (5)", - "tab": "General information", - "score": 
473.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.3, mean=0.5, max=0.711, sum=3.499 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.995, mean=1.161, max=1.453, sum=8.127 (7)", - "tab": "Efficiency", - "score": 1.1609408722047545 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, mean=0.559, max=0.559, sum=0.559 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.025, mean=2.025, max=2.025, sum=2.025 (1)", - "tab": "Efficiency", - "score": 2.024561887741089 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 
1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581, - "details": { - "description": "min=0.379, mean=0.581, max=0.811, sum=2.904 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.295, mean=0.53, max=1.42, sum=2.652 (5)", - "tab": "Efficiency", - "score": 0.5303036133605687 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.795, max=5, sum=23.973 (5)", - "tab": "General information", - "score": 4.794693877551021 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=193.916, mean=1536.557, max=6379.163, sum=7682.787 (5)", - "tab": "General information", - "score": 1536.5573806103425 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.513, - "details": { - "description": "min=0.513, mean=0.513, max=0.513, sum=0.513 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.314 (1)", - "tab": "Efficiency", - "score": 0.3144090270427302 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1021.481, mean=1021.481, max=1021.481, sum=1021.481 (1)", - "tab": "General information", - "score": 1021.4811133200795 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.187, - "details": { - "description": "min=0.137, mean=0.187, max=0.211, sum=0.937 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.503, mean=0.524, max=0.541, sum=2.618 (5)", - "tab": "Efficiency", - "score": 0.5235538594776801 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=73.732, mean=103.97, max=130.366, sum=519.851 (5)", - "tab": "General information", - "score": 103.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json b/data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json deleted file mode 100644 index 30d0e3442..000000000 --- a/data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_text-bison@001/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PaLM-2 Bison", - "id": "google/text-bison@001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.526, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.47540574282147313 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": 
"min=0.718, mean=0.718, max=0.718, sum=0.718 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.031, mean=1.031, max=1.031, sum=1.031 (1)", - "tab": "Efficiency", - "score": 1.030712524602111 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=4414.234, mean=4414.234, max=4414.234, sum=4414.234 (1)", - "tab": "General information", - "score": 4414.2338028169015 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.997, mean=7.997, max=7.997, sum=7.997 (1)", - "tab": "General information", - "score": 7.997183098591549 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.987, mean=0.987, max=0.987, sum=0.987 (1)", - "tab": "Efficiency", - "score": 0.987217092037201 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.755, mean=0.755, max=0.755, sum=0.755 (1)", - "tab": "Efficiency", - "score": 0.754590849161148 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.906, mean=4.906, max=4.906, sum=4.906 (1)", - "tab": "General information", - "score": 4.906 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.015, mean=0.015, max=0.015, sum=0.015 (1)", - "tab": "General information", - "score": 0.015 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2124.565, mean=2124.565, max=2124.565, sum=2124.565 (1)", - "tab": "General information", - "score": 2124.565 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.358, mean=7.358, max=7.358, sum=7.358 (1)", - "tab": "General information", - "score": 7.358 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=190.187, mean=190.187, max=190.187, sum=190.187 
(1)", - "tab": "General information", - "score": 190.187 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.48, mean=4.48, max=4.48, sum=4.48 (1)", - "tab": "General information", - "score": 4.48 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=0.878 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.788, mean=0.788, max=0.788, sum=0.788 (1)", - "tab": "Efficiency", - "score": 0.7879144654273987 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=253.308, mean=253.308, max=253.308, sum=253.308 (1)", - "tab": "General information", - "score": 253.308 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.608, - "details": { - "description": "min=0.39, mean=0.608, max=0.87, sum=3.038 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.017, mean=1.112, max=1.352, sum=5.561 (5)", - "tab": "Efficiency", - "score": 1.1122005350882547 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=360.7, mean=487.294, max=638.088, sum=2436.468 (5)", - "tab": "General information", - "score": 487.29354385964905 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" 
- } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421, - "details": { - "description": "min=0.25, mean=0.421, max=0.558, sum=2.946 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.161, mean=1.614, max=2.126, sum=11.299 (7)", - "tab": "Efficiency", - "score": 1.6140828338918989 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=1004.274, mean=1439.843, max=2386.942, sum=10078.901 (7)", - "tab": "General information", - "score": 1439.842989280994 - }, - "MATH - # output tokens": { - "description": "min=38.4, mean=66.89, max=88.316, sum=468.232 (7)", - "tab": "General information", - "score": 66.89023408252294 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "details": { - "description": "min=0.61, mean=0.61, max=0.61, sum=0.61 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.44, mean=1.44, max=1.44, sum=1.44 (1)", - "tab": "Efficiency", - "score": 1.4403084371089936 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1109.549, mean=1109.549, max=1109.549, sum=1109.549 (1)", - "tab": "General information", - "score": 1109.549 - }, - "GSM8K - # output tokens": { - "description": "min=94.258, mean=94.258, max=94.258, sum=94.258 (1)", - "tab": "General information", - "score": 94.258 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645, - "details": { - "description": "min=0.466, mean=0.645, max=0.937, sum=3.224 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.53, mean=0.737, max=1.325, sum=3.683 (5)", - "tab": "Efficiency", - "score": 0.7366328867537384 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2.988, mean=4.398, max=5, sum=21.988 (5)", - "tab": "General information", - "score": 4.397551020408163 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=287.432, mean=1387.966, max=5134.504, sum=6939.831 (5)", - "tab": "General information", - "score": 1387.966233478402 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.389, max=2.347, sum=6.947 (5)", - "tab": "General information", - "score": 1.3893499784884555 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547, - "details": { - "description": "min=0.547, mean=0.547, max=0.547, sum=0.547 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.735, mean=0.735, max=0.735, sum=0.735 (1)", - "tab": "Efficiency", - "score": 0.7348999071784806 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1138.622, mean=1138.622, max=1138.622, sum=1138.622 (1)", - "tab": "General information", - "score": 1138.6222664015904 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.241, - "details": { - "description": "min=0.22, mean=0.241, max=0.255, sum=1.204 (5)", - "tab": "Accuracy", - "WMT 
2014 - Observed inference time (s)": { - "description": "min=0.826, mean=0.875, max=0.952, sum=4.377 (5)", - "tab": "Efficiency", - "score": 0.8753595397700126 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=145.755, mean=183.587, max=206.169, sum=917.936 (5)", - "tab": "General information", - "score": 183.58714444104604 - }, - "WMT 2014 - # output tokens": { - "description": "min=28.076, mean=29.981, max=31.366, sum=149.905 (5)", - "tab": "General information", - "score": 29.980943664933477 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json b/data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json deleted file mode 100644 index d5841340f..000000000 --- a/data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_text-unicorn@001/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PaLM-2 Unicorn", - "id": "google/text-unicorn@001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.18023720349563047 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.583, - "details": { - "description": "min=0.583, mean=0.583, max=0.583, sum=0.583 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=3.283, mean=3.283, max=3.283, sum=3.283 (1)", - "tab": "Efficiency", - "score": 3.283053755424392 - }, - 
"NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=4414.234, mean=4414.234, max=4414.234, sum=4414.234 (1)", - "tab": "General information", - "score": 4414.2338028169015 - }, - "NarrativeQA - # output tokens": { - "description": "min=16.544, mean=16.544, max=16.544, sum=16.544 (1)", - "tab": "General information", - "score": 16.543661971830986 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435, - "details": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.435 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=2.564, mean=2.564, max=2.564, sum=2.564 (1)", - "tab": "Efficiency", - "score": 2.564493465423584 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.56, mean=1.56, max=1.56, sum=1.56 (1)", - "tab": "Efficiency", - "score": 1.5603588831424713 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.906, mean=4.906, max=4.906, sum=4.906 (1)", - "tab": "General information", - "score": 4.906 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.015, mean=0.015, max=0.015, sum=0.015 (1)", - "tab": "General information", - "score": 0.015 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2124.565, mean=2124.565, max=2124.565, sum=2124.565 (1)", - "tab": "General information", - "score": 2124.565 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=13.327, mean=13.327, max=13.327, sum=13.327 (1)", - "tab": "General information", - "score": 13.327 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=190.187, mean=190.187, max=190.187, sum=190.187 (1)", - "tab": "General information", - "score": 190.187 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=9.803, mean=9.803, max=9.803, sum=9.803 (1)", - "tab": "General information", - "score": 9.803 - } 
- } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "Efficiency", - "score": 0.9994440112113953 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=253.308, mean=253.308, max=253.308, sum=253.308 (1)", - "tab": "General information", - "score": 253.308 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.53, mean=0.702, max=0.96, sum=3.509 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.198, mean=1.262, max=1.332, sum=6.31 (5)", - "tab": "Efficiency", - "score": 1.2620431824148748 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=360.7, mean=487.294, max=638.088, sum=2436.468 (5)", - "tab": "General information", - "score": 487.29354385964905 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674, - "details": { - "description": "min=0.526, mean=0.674, max=0.867, sum=4.716 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=4.016, mean=4.636, max=5.654, sum=32.454 (7)", - "tab": "Efficiency", - "score": 4.636334307701402 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=1004.274, mean=1439.843, max=2386.942, sum=10078.901 (7)", - "tab": "General information", - "score": 1439.842989280994 - }, - "MATH - # output tokens": { - "description": "min=59.9, mean=80.458, max=98.342, sum=563.207 (7)", - "tab": "General information", - "score": 80.45819114472725 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=0.831 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.437, mean=5.437, max=5.437, sum=5.437 (1)", - "tab": "Efficiency", - "score": 5.4373185629844665 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1109.549, mean=1109.549, max=1109.549, sum=1109.549 (1)", - "tab": "General information", - "score": 1109.549 - }, - "GSM8K - # output tokens": { - "description": "min=93.764, mean=93.764, max=93.764, sum=93.764 (1)", - "tab": "General information", - "score": 93.764 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "description": "min=0.452, mean=0.677, max=0.926, sum=3.387 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.859, mean=1.437, max=3.198, sum=7.187 (5)", - "tab": "Efficiency", - "score": 1.4374773445647835 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2.988, mean=4.398, max=5, sum=21.988 (5)", - "tab": "General information", - "score": 4.397551020408163 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=287.432, mean=1387.966, max=5134.504, sum=6939.831 (5)", - "tab": "General information", - "score": 1387.966233478402 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.364, max=2.2, sum=6.821 (5)", - "tab": "General information", - "score": 1.3642506811989101 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.178, mean=1.178, max=1.178, sum=1.178 (1)", - "tab": "Efficiency", - "score": 1.1783231205305096 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1138.622, mean=1138.622, max=1138.622, sum=1138.622 (1)", - "tab": "General information", - "score": 1138.6222664015904 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.236, mean=0.26, max=0.279, sum=1.298 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.706, mean=1.801, max=1.909, sum=9.006 (5)", - "tab": 
"Efficiency", - "score": 1.801295139912888 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=145.755, mean=183.587, max=206.169, sum=917.936 (5)", - "tab": "General information", - "score": 183.58714444104604 - }, - "WMT 2014 - # output tokens": { - "description": "min=28.596, mean=30.567, max=31.734, sum=152.836 (5)", - "tab": "General information", - "score": 30.567241263954735 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json b/data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json deleted file mode 100644 index 079c14180..000000000 --- a/data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-2-13b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 13B", - "id": "meta/llama-2-13b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.233, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7253183520599251 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=0.741 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.795, mean=0.795, max=0.795, sum=0.795 (1)", - "tab": "Efficiency", - "score": 0.7950913200915699 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # 
train": { - "description": "min=4.408, mean=4.408, max=4.408, sum=4.408 (1)", - "tab": "General information", - "score": 4.408450704225352 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3669.808, mean=3669.808, max=3669.808, sum=3669.808 (1)", - "tab": "General information", - "score": 3669.8084507042254 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.371, - "details": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.371 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=0.579 (1)", - "tab": "Efficiency", - "score": 0.5793666501045227 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.384 (1)", - "tab": "Efficiency", - "score": 0.3839698841571808 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.831, mean=4.831, max=4.831, sum=4.831 (1)", - "tab": "General information", - "score": 4.831 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2289.357, mean=2289.357, max=2289.357, sum=2289.357 (1)", - "tab": "General information", - "score": 2289.357 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.986, mean=0.986, max=0.986, sum=0.986 (1)", - "tab": "General information", - "score": 0.986 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": 
"OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.634, mean=0.634, max=0.634, sum=0.634 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.347 (1)", - "tab": "Efficiency", - "score": 0.34700755834579466 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=282.574, mean=282.574, max=282.574, sum=282.574 (1)", - "tab": "General information", - "score": 282.574 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.505, - "details": { - "description": "min=0.28, mean=0.505, max=0.84, sum=2.527 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.359, mean=0.374, max=0.383, sum=1.872 (5)", - "tab": "Efficiency", - "score": 0.37437369656144526 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102, - "details": { - "description": "min=0, mean=0.102, max=0.193, sum=0.715 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.083, mean=1.516, max=1.771, sum=10.613 (7)", - "tab": "Efficiency", - "score": 1.5161172209789922 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)", - "tab": "General information", - "score": 1438.6362030100095 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.266, - "details": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.266 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.737, mean=1.737, max=1.737, sum=1.737 (1)", - "tab": "Efficiency", - "score": 1.7367573575973512 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)", - "tab": "General information", - "score": 1207.746 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591, - "details": { - "description": "min=0.338, mean=0.591, max=0.779, sum=2.955 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.331, mean=0.438, max=0.729, sum=2.189 (5)", - "tab": 
"Efficiency", - "score": 0.43780977145306127 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=1.886, mean=4.177, max=5, sum=20.886 (5)", - "tab": "General information", - "score": 4.177142857142857 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.001, max=0.004, sum=0.004 (5)", - "tab": "General information", - "score": 0.0008163265306122449 - }, - "LegalBench - # prompt tokens": { - "description": "min=222.137, mean=1027.35, max=3642.378, sum=5136.751 (5)", - "tab": "General information", - "score": 1027.3502076083553 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392, - "details": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.459 (1)", - "tab": "Efficiency", - "score": 0.4588449499005115 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1234.901, mean=1234.901, max=1234.901, sum=1234.901 (1)", - "tab": "General information", - "score": 1234.9005964214712 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.167, - "details": { - "description": "min=0.074, mean=0.167, max=0.209, sum=0.836 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.557, mean=0.691, max=0.814, sum=3.456 (5)", - "tab": "Efficiency", - "score": 0.6911807014709866 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, 
- "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=127.523, mean=142.288, max=164.972, sum=711.438 (5)", - "tab": "General information", - "score": 142.28751290334915 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json b/data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json deleted file mode 100644 index 8faa07285..000000000 --- a/data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-2-70b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 70B", - "id": "meta/llama-2-70b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.3882646691635456 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763, - "details": { - "description": "min=0.763, mean=0.763, max=0.763, sum=0.763 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.871, mean=1.871, max=1.871, sum=1.871 (1)", - "tab": "Efficiency", - "score": 1.8709671289148464 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.408, mean=4.408, max=4.408, sum=4.408 (1)", - "tab": "General information", - "score": 4.408450704225352 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3669.808, 
mean=3669.808, max=3669.808, sum=3669.808 (1)", - "tab": "General information", - "score": 3669.8084507042254 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.46 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.278, mean=1.278, max=1.278, sum=1.278 (1)", - "tab": "Efficiency", - "score": 1.277897496700287 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.818, mean=0.818, max=0.818, sum=0.818 (1)", - "tab": "Efficiency", - "score": 0.8177921280860901 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.831, mean=4.831, max=4.831, sum=4.831 (1)", - "tab": "General information", - "score": 4.831 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2289.357, mean=2289.357, max=2289.357, sum=2289.357 (1)", - "tab": "General information", - "score": 2289.357 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.996, mean=0.996, max=0.996, sum=0.996 (1)", - "tab": "General information", - "score": 0.996 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=0.838 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)", - "tab": "Efficiency", - "score": 0.6557973260879517 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=282.574, mean=282.574, max=282.574, sum=282.574 (1)", - "tab": "General information", - "score": 282.574 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.31, mean=0.58, max=0.92, sum=2.902 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.465, mean=0.501, max=0.56, sum=2.507 (5)", - "tab": "Efficiency", - "score": 0.5013968416013215 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323, - "details": { - "description": "min=0.205, mean=0.323, max=0.489, sum=2.26 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.813, mean=2.443, max=3.147, sum=17.103 (7)", - "tab": "Efficiency", - "score": 2.4432508421434598 - }, - "MATH - # 
eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)", - "tab": "General information", - "score": 1438.6362030100095 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "description": "min=0.567, mean=0.567, max=0.567, sum=0.567 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.737, mean=3.737, max=3.737, sum=3.737 (1)", - "tab": "Efficiency", - "score": 3.737159442663193 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)", - "tab": "General information", - "score": 1207.746 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.444, mean=0.673, max=0.937, sum=3.363 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.448, mean=0.759, max=1.744, sum=3.796 (5)", - "tab": "Efficiency", - "score": 0.7591354159811778 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=1.886, mean=4.177, max=5, sum=20.886 (5)", - "tab": "General information", - "score": 4.177142857142857 - }, - 
"LegalBench - truncated": { - "description": "min=0, mean=0.001, max=0.004, sum=0.004 (5)", - "tab": "General information", - "score": 0.0008163265306122449 - }, - "LegalBench - # prompt tokens": { - "description": "min=222.137, mean=1027.35, max=3642.378, sum=5136.751 (5)", - "tab": "General information", - "score": 1027.3502076083553 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618, - "details": { - "description": "min=0.618, mean=0.618, max=0.618, sum=0.618 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.971, mean=0.971, max=0.971, sum=0.971 (1)", - "tab": "Efficiency", - "score": 0.9713700282170806 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1234.901, mean=1234.901, max=1234.901, sum=1234.901 (1)", - "tab": "General information", - "score": 1234.9005964214712 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.196, - "details": { - "description": "min=0.12, mean=0.196, max=0.233, sum=0.979 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.809, mean=1.074, max=1.477, sum=5.368 (5)", - "tab": "Efficiency", - "score": 1.0736038563633745 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=127.523, mean=142.288, max=164.972, sum=711.438 (5)", - "tab": "General information", - "score": 142.28751290334915 - }, - "WMT 2014 - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json b/data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json deleted file mode 100644 index bb2c02730..000000000 --- a/data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-2-7b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 7B", - "id": "meta/llama-2-7b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.152, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6685767790262173 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.686, mean=0.686, max=0.686, sum=0.686 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.852, mean=0.852, max=0.852, sum=0.852 (1)", - "tab": "Efficiency", - "score": 0.8524049973823655 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.408, mean=4.408, max=4.408, sum=4.408 (1)", - "tab": "General information", - "score": 4.408450704225352 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3669.808, mean=3669.808, max=3669.808, sum=3669.808 (1)", - "tab": "General information", - "score": 3669.8084507042254 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333, - "details": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.584, mean=0.584, max=0.584, sum=0.584 (1)", - "tab": "Efficiency", - "score": 0.584290323972702 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.479, mean=0.479, max=0.479, sum=0.479 (1)", - "tab": "Efficiency", - "score": 0.47909903168678286 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.831, mean=4.831, max=4.831, sum=4.831 (1)", - "tab": "General information", - "score": 4.831 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2289.357, mean=2289.357, max=2289.357, sum=2289.357 (1)", - "tab": "General information", - "score": 2289.357 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.958, mean=0.958, max=0.958, sum=0.958 (1)", - "tab": "General information", - "score": 0.958 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.996, mean=0.996, max=0.996, sum=0.996 (1)", - "tab": "General information", - "score": 0.996 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.544, - "details": { - "description": "min=0.544, mean=0.544, max=0.544, sum=0.544 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.393 (1)", - "tab": "Efficiency", - "score": 0.3927152595520019 - }, - "OpenbookQA - # eval": 
{ - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=282.574, mean=282.574, max=282.574, sum=282.574 (1)", - "tab": "General information", - "score": 282.574 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425, - "details": { - "description": "min=0.27, mean=0.425, max=0.63, sum=2.125 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.314, mean=0.33, max=0.349, sum=1.651 (5)", - "tab": "Efficiency", - "score": 0.33028721380233766 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.097, - "details": { - "description": "min=0.019, mean=0.097, max=0.198, sum=0.68 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.362, mean=2.66, max=5.271, sum=18.621 (7)", - "tab": "Efficiency", - "score": 2.6600816047289086 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", 
- "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)", - "tab": "General information", - "score": 1438.6362030100095 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.154, - "details": { - "description": "min=0.154, mean=0.154, max=0.154, sum=0.154 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.96, mean=1.96, max=1.96, sum=1.96 (1)", - "tab": "Efficiency", - "score": 1.95984334897995 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)", - "tab": "General information", - "score": 1207.746 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "description": "min=0.245, mean=0.502, max=0.747, sum=2.508 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.306, mean=0.428, max=0.76, sum=2.139 (5)", - "tab": "Efficiency", - "score": 0.4277655324222306 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=1.886, mean=4.177, max=5, sum=20.886 (5)", - "tab": "General information", - "score": 4.177142857142857 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.001, max=0.004, sum=0.004 (5)", - "tab": "General information", - "score": 0.0008163265306122449 - }, - "LegalBench - # prompt tokens": { - "description": "min=222.137, mean=1027.35, max=3642.378, sum=5136.751 (5)", - "tab": "General information", - "score": 1027.3502076083553 - }, - 
"LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392, - "details": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Efficiency", - "score": 0.46650436763497993 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1234.901, mean=1234.901, max=1234.901, sum=1234.901 (1)", - "tab": "General information", - "score": 1234.9005964214712 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144, - "details": { - "description": "min=0.046, mean=0.144, max=0.189, sum=0.72 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.582, mean=0.697, max=0.802, sum=3.486 (5)", - "tab": "Efficiency", - "score": 0.697166075241057 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=127.523, mean=142.288, max=164.972, sum=711.438 (5)", - "tab": "General information", - "score": 142.28751290334915 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json b/data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json deleted file mode 100644 index 876850010..000000000 --- a/data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3-70b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3 70B", - "id": "meta/llama-3-70b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.3926217228464419 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "details": { - "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.795, mean=1.795, max=1.795, sum=1.795 (1)", - "tab": "Efficiency", - "score": 1.7946508300136512 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3460.268, mean=3460.268, max=3460.268, sum=3460.268 (1)", - "tab": "General information", - "score": 3460.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on 
NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475, - "details": { - "description": "min=0.475, mean=0.475, max=0.475, sum=0.475 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.212, mean=1.212, max=1.212, sum=1.212 (1)", - "tab": "Efficiency", - "score": 1.211742308139801 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.558, mean=0.558, max=0.558, sum=0.558 (1)", - "tab": "Efficiency", - "score": 0.5584413967132569 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.965, mean=4.965, max=4.965, sum=4.965 (1)", - "tab": "General information", - "score": 4.965 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1658.348, mean=1658.348, max=1658.348, sum=1658.348 (1)", - "tab": "General information", - "score": 1658.348 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=112.12, mean=112.12, max=112.12, sum=112.12 (1)", - "tab": "General information", - "score": 112.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=0.934 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.352 (1)", - "tab": "Efficiency", - "score": 0.35184384298324584 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - 
"score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.776, mean=242.776, max=242.776, sum=242.776 (1)", - "tab": "General information", - "score": 242.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.695, - "details": { - "description": "min=0.43, mean=0.695, max=0.94, sum=3.473 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.387, mean=0.404, max=0.432, sum=2.021 (5)", - "tab": "Efficiency", - "score": 0.40422279727668087 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=366.43, mean=460.686, max=607.421, sum=2303.431 (5)", - "tab": "General information", - "score": 460.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.663, - "details": { - "description": "min=0.433, mean=0.663, max=0.822, sum=4.641 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=14.895, mean=15.819, max=17.569, sum=110.731 (7)", - "tab": "Efficiency", - "score": 15.818764438908431 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 
- } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.805, - "details": { - "description": "min=0.805, mean=0.805, max=0.805, sum=0.805 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.2, mean=4.2, max=4.2, sum=4.2 (1)", - "tab": "Efficiency", - "score": 4.199564570903778 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.466, mean=0.733, max=0.958, sum=3.665 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.416, mean=0.87, max=2.556, sum=4.352 (5)", - "tab": "Efficiency", - "score": 0.8703131128024035 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=192.442, mean=1507.407, max=6287.633, sum=7537.033 (5)", - "tab": "General information", - "score": 1507.4065013565441 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - 
"dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.548, mean=0.548, max=0.548, sum=0.548 (1)", - "tab": "Efficiency", - "score": 0.547684069419239 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1018.274, mean=1018.274, max=1018.274, sum=1018.274 (1)", - "tab": "General information", - "score": 1018.2743538767396 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.225, - "details": { - "description": "min=0.183, mean=0.225, max=0.259, sum=1.123 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.198, mean=1.239, max=1.282, sum=6.195 (5)", - "tab": "Efficiency", - "score": 1.239086973613365 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=90.139, mean=109.868, max=130.33, sum=549.34 (5)", - "tab": "General information", - "score": 109.86804366111025 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json b/data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json deleted file mode 100644 index 87ab72524..000000000 --- a/data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3-8b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - 
"source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3 8B", - "id": "meta/llama-3-8b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7163920099875156 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.754, - "details": { - "description": "min=0.754, mean=0.754, max=0.754, sum=0.754 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)", - "tab": "Efficiency", - "score": 0.7260531909029249 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3460.268, mean=3460.268, max=3460.268, sum=3460.268 (1)", - "tab": "General information", - "score": 3460.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378, - "details": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.378 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.524, mean=0.524, max=0.524, sum=0.524 (1)", - "tab": "Efficiency", - "score": 0.523505747795105 - }, - 
"NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.428 (1)", - "tab": "Efficiency", - "score": 0.42760186743736267 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.965, mean=4.965, max=4.965, sum=4.965 (1)", - "tab": "General information", - "score": 4.965 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1658.348, mean=1658.348, max=1658.348, sum=1658.348 (1)", - "tab": "General information", - "score": 1658.348 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=112.12, mean=112.12, max=112.12, sum=112.12 (1)", - "tab": "General information", - "score": 112.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.766, - "details": { - "description": "min=0.766, mean=0.766, max=0.766, sum=0.766 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.308 (1)", - "tab": "Efficiency", - "score": 0.3076804256439209 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.776, mean=242.776, max=242.776, sum=242.776 (1)", - "tab": "General information", - "score": 242.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - 
} - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602, - "details": { - "description": "min=0.33, mean=0.602, max=0.88, sum=3.008 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.3, mean=0.317, max=0.344, sum=1.583 (5)", - "tab": "Efficiency", - "score": 0.3165063006919727 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=366.43, mean=460.686, max=607.421, sum=2303.431 (5)", - "tab": "General information", - "score": 460.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391, - "details": { - "description": "min=0.233, mean=0.391, max=0.496, sum=2.737 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=5.431, mean=5.651, max=6.121, sum=39.558 (7)", - "tab": "Efficiency", - "score": 5.651119198181415 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.499, - "details": { - "description": "min=0.499, mean=0.499, max=0.499, sum=0.499 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.771, mean=1.771, max=1.771, sum=1.771 (1)", - "tab": "Efficiency", - "score": 1.770608879327774 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "description": "min=0.417, mean=0.637, max=0.874, sum=3.185 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.322, mean=0.465, max=0.989, sum=2.326 (5)", - "tab": "Efficiency", - "score": 0.4651390315970952 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=192.442, mean=1507.407, max=6287.633, sum=7537.033 (5)", - "tab": "General information", - "score": 1507.4065013565441 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581, - "details": { - "description": 
"min=0.581, mean=0.581, max=0.581, sum=0.581 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.361, mean=0.361, max=0.361, sum=0.361 (1)", - "tab": "Efficiency", - "score": 0.36141945306159867 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1018.274, mean=1018.274, max=1018.274, sum=1018.274 (1)", - "tab": "General information", - "score": 1018.2743538767396 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.183, - "details": { - "description": "min=0.133, mean=0.183, max=0.212, sum=0.915 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.547, mean=0.563, max=0.573, sum=2.816 (5)", - "tab": "Efficiency", - "score": 0.5631435248437351 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=90.139, mean=109.868, max=130.33, sum=549.34 (5)", - "tab": "General information", - "score": 109.86804366111025 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json b/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json deleted file mode 100644 index 0bc6225d5..000000000 --- a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.1-405b-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 405B", - "id": "meta/llama-3.1-405b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - 
"evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.3095505617977528 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.749, - "details": { - "description": "min=0.749, mean=0.749, max=0.749, sum=0.749 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.964, mean=2.964, max=2.964, sum=2.964 (1)", - "tab": "Efficiency", - "score": 2.964381891572979 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)", - "tab": "General information", - "score": 3484.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=9.904, mean=9.904, max=9.904, sum=9.904 (1)", - "tab": "General information", - "score": 9.904225352112675 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456, - "details": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.456 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=4.105, mean=4.105, max=4.105, sum=4.105 (1)", - "tab": "Efficiency", - "score": 4.104731038570404 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.946, mean=0.946, max=0.946, sum=0.946 (1)", - "tab": "Efficiency", - "score": 0.9464026074409485 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, 
max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1716.78, mean=1716.78, max=1716.78, sum=1716.78 (1)", - "tab": "General information", - "score": 1716.78 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.741, mean=8.741, max=8.741, sum=8.741 (1)", - "tab": "General information", - "score": 8.741 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)", - "tab": "General information", - "score": 129.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=8.576, mean=8.576, max=8.576, sum=8.576 (1)", - "tab": "General information", - "score": 8.576 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=0.94 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=2.693, mean=2.693, max=2.693, sum=2.693 (1)", - "tab": "Efficiency", - "score": 2.6930377073287963 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)", - "tab": "General information", - "score": 249.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.6, mean=0.759, max=0.94, sum=3.796 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.464, mean=0.529, max=0.598, sum=2.643 (5)", - "tab": "Efficiency", - "score": 0.528599283887629 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)", - "tab": "General information", - "score": 467.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827, - "details": { - "description": "min=0.635, mean=0.827, max=0.97, sum=5.789 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.188, mean=4.118, max=4.906, sum=28.826 (7)", - "tab": "Efficiency", - "score": 4.117939187053165 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=175.942, mean=232.698, max=270.904, sum=1628.884 (7)", - "tab": "General information", - "score": 232.69774473452566 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=0.949 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.737, mean=2.737, max=2.737, sum=2.737 (1)", - "tab": "Efficiency", - "score": 2.737115991592407 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=122.777, mean=122.777, max=122.777, sum=122.777 (1)", - "tab": "General information", - "score": 122.777 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.707, - "details": { - "description": "min=0.433, mean=0.707, max=0.979, sum=3.536 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.492, mean=0.797, max=1.89, sum=3.987 (5)", - "tab": "Efficiency", - "score": 0.7974352428433198 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)", - "tab": "General information", - "score": 1513.8824197238912 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.407, max=3, sum=12.035 (5)", - "tab": "General information", - "score": 2.4069553133514985 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.805, - "details": { - "description": "min=0.805, mean=0.805, max=0.805, sum=0.805 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.951, mean=0.951, max=0.951, sum=0.951 (1)", - "tab": "Efficiency", - "score": 0.9505775325577965 - }, 
- "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)", - "tab": "General information", - "score": 1025.2743538767395 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.238, - "details": { - "description": "min=0.2, mean=0.238, max=0.284, sum=1.191 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.96, mean=1.055, max=1.147, sum=5.277 (5)", - "tab": "Efficiency", - "score": 1.0554436480227387 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=101.139, mean=120.712, max=141.117, sum=603.559 (5)", - "tab": "General information", - "score": 120.71178123566294 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.598, mean=26.056, max=26.819, sum=130.279 (5)", - "tab": "General information", - "score": 26.055818454656674 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json b/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json deleted file mode 100644 index d57074cb2..000000000 --- a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.1-70b-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 70B", - "id": "meta/llama-3.1-70b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.133645443196005 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=0.772 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=3.402, mean=3.402, max=3.402, sum=3.402 (1)", - "tab": "Efficiency", - "score": 3.4022000312805174 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)", - "tab": "General information", - "score": 3484.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=9.034, mean=9.034, max=9.034, sum=9.034 (1)", - "tab": "General information", - "score": 9.033802816901408 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.452 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=3.354, mean=3.354, max=3.354, sum=3.354 (1)", - "tab": "Efficiency", - "score": 3.354476467370987 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=3.534, mean=3.534, max=3.534, sum=3.534 (1)", - "tab": "Efficiency", - "score": 3.534221899032593 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, 
max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1716.78, mean=1716.78, max=1716.78, sum=1716.78 (1)", - "tab": "General information", - "score": 1716.78 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.203, mean=8.203, max=8.203, sum=8.203 (1)", - "tab": "General information", - "score": 8.203 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)", - "tab": "General information", - "score": 129.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=7.222, mean=7.222, max=7.222, sum=7.222 (1)", - "tab": "General information", - "score": 7.222 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=3.875, mean=3.875, max=3.875, sum=3.875 (1)", - "tab": "Efficiency", - "score": 3.8750249314308167 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)", - "tab": "General information", - "score": 249.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - 
"details": { - "description": "min=0.55, mean=0.709, max=0.93, sum=3.545 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=2.836, mean=12.026, max=45.251, sum=60.131 (5)", - "tab": "Efficiency", - "score": 12.026294649132511 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)", - "tab": "General information", - "score": 467.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.579, mean=0.783, max=0.97, sum=5.483 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=5.784, mean=6.527, max=7.228, sum=45.691 (7)", - "tab": "Efficiency", - "score": 6.527233472429779 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=184.733, mean=243.368, max=279.105, sum=1703.574 (7)", - "tab": "General information", - "score": 243.36764411525732 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)", - "tab": "Accuracy", - 
"GSM8K - Observed inference time (s)": { - "description": "min=4.99, mean=4.99, max=4.99, sum=4.99 (1)", - "tab": "Efficiency", - "score": 4.9902911036014554 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=127.086, mean=127.086, max=127.086, sum=127.086 (1)", - "tab": "General information", - "score": 127.086 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687, - "details": { - "description": "min=0.439, mean=0.687, max=1, sum=3.433 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=2.233, mean=3.171, max=3.636, sum=15.855 (5)", - "tab": "Efficiency", - "score": 3.1709040240543165 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)", - "tab": "General information", - "score": 1513.8824197238912 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.538, max=4.032, sum=12.688 (5)", - "tab": "General information", - "score": 2.5376711028251826 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=0.769 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=3.053, mean=3.053, max=3.053, sum=3.053 (1)", - "tab": "Efficiency", - "score": 3.0525233205222704 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA 
- # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)", - "tab": "General information", - "score": 1025.2743538767395 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.223, - "details": { - "description": "min=0.183, mean=0.223, max=0.265, sum=1.114 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.762, mean=0.965, max=1.177, sum=4.824 (5)", - "tab": "Efficiency", - "score": 0.9648550899177766 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=101.139, mean=120.712, max=141.117, sum=603.559 (5)", - "tab": "General information", - "score": 120.71178123566294 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.231, mean=25.786, max=26.692, sum=128.928 (5)", - "tab": "General information", - "score": 25.78567441504817 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json b/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json deleted file mode 100644 index 198d81cd2..000000000 --- a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 8B", - "id": "meta/llama-3.1-8b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on 
average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.303, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5896504369538077 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)", - "tab": "Efficiency", - "score": 0.5813529316808136 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)", - "tab": "General information", - "score": 3484.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.287, mean=7.287, max=7.287, sum=7.287 (1)", - "tab": "General information", - "score": 7.2873239436619714 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.209, - "details": { - "description": "min=0.209, mean=0.209, max=0.209, sum=0.209 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.544, mean=0.544, max=0.544, sum=0.544 (1)", - "tab": "Efficiency", - "score": 0.5441543731689453 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)", - "tab": "Efficiency", - "score": 0.751717613697052 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 
0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1716.78, mean=1716.78, max=1716.78, sum=1716.78 (1)", - "tab": "General information", - "score": 1716.78 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.736, mean=8.736, max=8.736, sum=8.736 (1)", - "tab": "General information", - "score": 8.736 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)", - "tab": "General information", - "score": 129.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=11.732, mean=11.732, max=11.732, sum=11.732 (1)", - "tab": "General information", - "score": 11.732 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=0.74 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=2.937, mean=2.937, max=2.937, sum=2.937 (1)", - "tab": "Efficiency", - "score": 2.9374450149536133 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)", - "tab": "General information", - "score": 249.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.26, mean=0.5, max=0.79, sum=2.501 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.284, mean=0.417, max=0.567, sum=2.086 (5)", - "tab": 
"Efficiency", - "score": 0.41729471965421716 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)", - "tab": "General information", - "score": 467.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703, - "details": { - "description": "min=0.509, mean=0.703, max=0.849, sum=4.92 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.617, mean=1.927, max=2.175, sum=13.492 (7)", - "tab": "Efficiency", - "score": 1.9274194573191807 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=203.384, mean=253.982, max=288.596, sum=1777.872 (7)", - "tab": "General information", - "score": 253.98170179473732 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "details": { - "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.109, mean=2.109, max=2.109, sum=2.109 (1)", - "tab": "Efficiency", - "score": 2.108796592712402 - }, - "GSM8K - # eval": { - "description": "min=1000, 
mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=150.02, mean=150.02, max=150.02, sum=150.02 (1)", - "tab": "General information", - "score": 150.02 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342, - "details": { - "description": "min=0, mean=0.342, max=0.8, sum=1.71 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.409, mean=0.481, max=0.626, sum=2.407 (5)", - "tab": "Efficiency", - "score": 0.4814103188942614 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)", - "tab": "General information", - "score": 1513.8824197238912 - }, - "LegalBench - # output tokens": { - "description": "min=2.032, mean=6.824, max=10.886, sum=34.118 (5)", - "tab": "General information", - "score": 6.823557876005701 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245, - "details": { - "description": "min=0.245, mean=0.245, max=0.245, sum=0.245 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)", - "tab": "Efficiency", - "score": 0.742541556803891 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General 
information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)", - "tab": "General information", - "score": 1025.2743538767395 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.181, - "details": { - "description": "min=0.132, mean=0.181, max=0.219, sum=0.907 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.439, mean=0.565, max=0.727, sum=2.826 (5)", - "tab": "Efficiency", - "score": 0.5651802479746801 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=101.139, mean=120.712, max=141.117, sum=603.559 (5)", - "tab": "General information", - "score": 120.71178123566294 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.354, mean=25.779, max=26.833, sum=128.893 (5)", - "tab": "General information", - "score": 25.778561802263347 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json b/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json deleted file mode 100644 index 722a6f050..000000000 --- a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.2-11b-vision-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo 11B", - "id": "meta/llama-3.2-11b-vision-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325, 
- "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.8754681647940075 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.378 (1)", - "tab": "Efficiency", - "score": 0.37828690300525075 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)", - "tab": "General information", - "score": 3484.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.234, - "details": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.234 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)", - "tab": "Efficiency", - "score": 0.28472757744789123 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.326 (1)", - "tab": "Efficiency", - "score": 0.32630494999885556 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1716.785, mean=1716.785, max=1716.785, sum=1716.785 (1)", - "tab": "General information", - "score": 1716.785 
- }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)", - "tab": "General information", - "score": 129.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=0.724 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.21, mean=0.21, max=0.21, sum=0.21 (1)", - "tab": "Efficiency", - "score": 0.21042356300354004 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)", - "tab": "General information", - "score": 249.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511, - "details": { - "description": "min=0.28, mean=0.511, max=0.78, sum=2.555 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.226, mean=0.406, max=0.726, sum=2.031 (5)", - "tab": "Efficiency", - "score": 0.40622414255142214 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": 
"min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)", - "tab": "General information", - "score": 467.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.579, mean=0.739, max=0.884, sum=5.176 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.715, mean=2.099, max=2.413, sum=14.696 (7)", - "tab": "Efficiency", - "score": 2.099496145662431 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823, - "details": { - "description": "min=0.823, mean=0.823, max=0.823, sum=0.823 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.274, mean=1.274, max=1.274, sum=1.274 (1)", - "tab": "Efficiency", - "score": 1.2738200931549073 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435, - "details": { - "description": "min=0.018, mean=0.435, max=0.905, sum=2.175 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.199, mean=0.277, max=0.438, sum=1.384 (5)", - "tab": "Efficiency", - "score": 0.2767821625533402 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)", - "tab": "General information", - "score": 1513.8824197238912 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.27 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.205 (1)", - "tab": "Efficiency", - "score": 0.20540714263916016 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)", - "tab": "General information", - "score": 1025.2743538767395 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General 
information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.179, - "details": { - "description": "min=0.13, mean=0.179, max=0.217, sum=0.896 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.349, mean=0.383, max=0.412, sum=1.915 (5)", - "tab": "Efficiency", - "score": 0.38295877939459017 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=101.139, mean=120.868, max=141.33, sum=604.34 (5)", - "tab": "General information", - "score": 120.86804366111025 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json b/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json deleted file mode 100644 index 8bef7c4e9..000000000 --- a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo 90B", - "id": "meta/llama-3.2-90b-vision-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.819, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5839825218476904 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.83, mean=0.83, max=0.83, sum=0.83 (1)", - "tab": "Efficiency", - "score": 0.8297326531208736 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)", - "tab": "General information", - "score": 3484.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.457, - "details": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.111, mean=1.111, max=1.111, sum=1.111 (1)", - "tab": "Efficiency", - "score": 1.110703297138214 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.422 (1)", - "tab": "Efficiency", - "score": 0.4218848171234131 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1716.785, mean=1716.785, max=1716.785, sum=1716.785 (1)", - "tab": "General information", - "score": 1716.785 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions 
(closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)", - "tab": "General information", - "score": 129.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)", - "tab": "Efficiency", - "score": 0.28476666021347047 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)", - "tab": "General information", - "score": 249.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703, - "details": { - "description": "min=0.52, mean=0.703, max=0.93, sum=3.514 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.266, mean=0.798, max=2.612, sum=3.992 (5)", - "tab": "Efficiency", - "score": 0.7984467656654225 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)", - "tab": "General 
information", - "score": 467.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.579, mean=0.791, max=0.978, sum=5.54 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=4.64, mean=5.739, max=6.652, sum=40.174 (7)", - "tab": "Efficiency", - "score": 5.739186799526185 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.936, - "details": { - "description": "min=0.936, mean=0.936, max=0.936, sum=0.936 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.889, mean=2.889, max=2.889, sum=2.889 (1)", - "tab": "Efficiency", - "score": 2.8894128675460817 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General 
information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "details": { - "description": "min=0.438, mean=0.68, max=0.989, sum=3.398 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.284, mean=0.478, max=1.152, sum=2.389 (5)", - "tab": "Efficiency", - "score": 0.47773526830658064 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)", - "tab": "General information", - "score": 1513.8824197238912 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=0.769 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.318 (1)", - "tab": "Efficiency", - "score": 0.3180293652930743 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)", - "tab": "General information", - "score": 1025.2743538767395 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, 
- "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.224, - "details": { - "description": "min=0.182, mean=0.224, max=0.266, sum=1.121 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.737, mean=0.816, max=0.848, sum=4.078 (5)", - "tab": "Efficiency", - "score": 0.8156762526912515 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=101.139, mean=120.868, max=141.33, sum=604.34 (5)", - "tab": "General information", - "score": 120.86804366111025 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json b/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json deleted file mode 100644 index cc4cca983..000000000 --- a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.3-70b-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.3 Instruct Turbo 70B", - "id": "meta/llama-3.3-70b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7410112359550561 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=0.791 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.746, mean=0.746, max=0.746, sum=0.746 (1)", - "tab": "Efficiency", - "score": 0.7455473496880329 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)", - "tab": "General information", - "score": 3484.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.606, mean=7.606, max=7.606, sum=7.606 (1)", - "tab": "General information", - "score": 7.605633802816901 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431, - "details": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.431 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.51, mean=0.51, max=0.51, sum=0.51 (1)", - "tab": "Efficiency", - "score": 0.5104404001235961 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.466 (1)", - "tab": "Efficiency", - "score": 0.46574948048591613 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1716.78, mean=1716.78, max=1716.78, sum=1716.78 (1)", - "tab": "General information", - "score": 1716.78 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.503, mean=7.503, max=7.503, sum=7.503 (1)", - "tab": "General information", - "score": 7.503 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - 
"description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)", - "tab": "General information", - "score": 129.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=9.152, mean=9.152, max=9.152, sum=9.152 (1)", - "tab": "General information", - "score": 9.152 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=0.928 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.339 (1)", - "tab": "Efficiency", - "score": 0.3392307605743408 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)", - "tab": "General information", - "score": 249.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.5, mean=0.7, max=0.93, sum=3.499 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.313, mean=0.339, max=0.359, sum=1.695 (5)", - "tab": "Efficiency", - "score": 0.3389431067433274 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)", - "tab": "General information", - "score": 467.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", 
- "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.635, mean=0.808, max=0.963, sum=5.655 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.483, mean=1.779, max=2.037, sum=12.455 (7)", - "tab": "Efficiency", - "score": 1.7792604792087183 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=192.326, mean=245.345, max=274.462, sum=1717.412 (7)", - "tab": "General information", - "score": 245.34459229967183 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.354, mean=1.354, max=1.354, sum=1.354 (1)", - "tab": "Efficiency", - "score": 1.3539768285751344 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=155.609, mean=155.609, max=155.609, sum=155.609 (1)", - "tab": "General information", - "score": 155.609 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "details": { - "description": "min=0.428, mean=0.725, max=0.979, sum=3.627 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.374, mean=0.549, max=1.199, sum=2.745 (5)", - "tab": "Efficiency", - "score": 0.5490109607174599 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)", - "tab": "General information", - "score": 1513.8824197238912 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.404, max=3.032, sum=12.02 (5)", - "tab": "General information", - "score": 2.404037659543955 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761, - "details": { - "description": "min=0.761, mean=0.761, max=0.761, sum=0.761 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.359 (1)", - "tab": "Efficiency", - "score": 0.35867250700357184 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)", - "tab": "General information", - "score": 1025.2743538767395 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219, - 
"details": { - "description": "min=0.18, mean=0.219, max=0.261, sum=1.096 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.587, mean=0.62, max=0.685, sum=3.1 (5)", - "tab": "Efficiency", - "score": 0.6200136459034178 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=96.139, mean=115.712, max=136.117, sum=578.559 (5)", - "tab": "General information", - "score": 115.71178123566294 - }, - "WMT 2014 - # output tokens": { - "description": "min=25.161, mean=26.542, max=27.189, sum=132.708 (5)", - "tab": "General information", - "score": 26.541526800734054 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json b/data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json deleted file mode 100644 index ebea32b6c..000000000 --- a/data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-65b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA 65B", - "id": "meta/llama-65b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.345, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.07451935081148564 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=0.755 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.909, mean=2.909, max=2.909, sum=2.909 (1)", - "tab": 
"Efficiency", - "score": 2.9087761751362975 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.434, mean=1.434, max=1.434, sum=1.434 (1)", - "tab": "General information", - "score": 1.4338028169014085 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1539.586, mean=1539.586, max=1539.586, sum=1539.586 (1)", - "tab": "General information", - "score": 1539.5859154929578 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.433, - "details": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.433 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.361, mean=1.361, max=1.361, sum=1.361 (1)", - "tab": "Efficiency", - "score": 1.3611893365383148 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "Efficiency", - "score": 4.703710767745972 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.129, mean=1407.129, max=1407.129, sum=1407.129 (1)", - "tab": "General information", - "score": 1407.129 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.985, mean=0.985, max=0.985, sum=0.985 (1)", - "tab": "General information", - "score": 0.985 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General 
information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.754, - "details": { - "description": "min=0.754, mean=0.754, max=0.754, sum=0.754 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=4.49, mean=4.49, max=4.49, sum=4.49 (1)", - "tab": "Efficiency", - "score": 4.490233006477356 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=282.574, mean=282.574, max=282.574, sum=282.574 (1)", - "tab": "General information", - "score": 282.574 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.34, mean=0.584, max=0.89, sum=2.919 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.962, mean=3.925, max=5.875, sum=19.627 (5)", - "tab": "Efficiency", - "score": 3.925460591943641 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.257, - "details": { - "description": "min=0.096, mean=0.257, max=0.474, sum=1.802 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=13.711, mean=20.79, max=30.888, sum=145.531 (7)", - "tab": "Efficiency", - "score": 20.790176352238564 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.962, mean=6.897, max=8, sum=48.277 (7)", - "tab": "General information", - "score": 6.896761133603239 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=971.652, mean=1214.707, max=1552.038, sum=8502.951 (7)", - "tab": "General information", - "score": 1214.7073423969382 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.489, - "details": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.489 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=12.339, mean=12.339, max=12.339, sum=12.339 (1)", - "tab": "Efficiency", - "score": 12.338884568691254 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)", - "tab": "General information", - "score": 1207.746 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.018, mean=0.48, max=0.863, sum=2.401 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=1.489, mean=3.974, max=6.264, sum=19.868 (5)", - "tab": "Efficiency", - "score": 3.9735240905509466 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.024, mean=3.805, max=5, sum=19.024 (5)", - "tab": "General information", - "score": 3.8048979591836734 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.006, max=0.031, sum=0.031 (5)", - "tab": "General information", - "score": 0.006122448979591836 - }, - "LegalBench - # prompt tokens": { - "description": "min=222.137, mean=595.161, max=1481.433, sum=2975.806 (5)", - "tab": "General information", - "score": 595.1612280165185 - }, - "LegalBench - # output tokens": { - "description": "min=0.882, mean=0.976, max=1, sum=4.882 (5)", - "tab": "General information", - "score": 0.9763265306122448 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=4.984, mean=4.984, max=4.984, sum=4.984 (1)", - "tab": "Efficiency", - "score": 4.983887912264875 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1234.901, mean=1234.901, max=1234.901, sum=1234.901 (1)", - "tab": "General information", - "score": 1234.9005964214712 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.189, - "details": { - "description": "min=0.102, mean=0.189, max=0.239, sum=0.945 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=2.057, mean=3.603, max=8.087, sum=18.014 
(5)", - "tab": "Efficiency", - "score": 3.6028029962680237 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=127.523, mean=142.288, max=164.972, sum=711.438 (5)", - "tab": "General information", - "score": 142.28751290334915 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json b/data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json deleted file mode 100644 index ee330c2d2..000000000 --- a/data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/microsoft_phi-2/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-2", - "id": "microsoft/phi-2", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.169, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.9032709113607991 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703, - "details": { - "description": "min=0.703, mean=0.703, max=0.703, sum=0.703 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.493 (1)", - "tab": "Efficiency", - "score": 0.49325697791408485 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": 
"min=2.085, mean=2.085, max=2.085, sum=2.085 (1)", - "tab": "General information", - "score": 2.084507042253521 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1705.006, mean=1705.006, max=1705.006, sum=1705.006 (1)", - "tab": "General information", - "score": 1705.0056338028169 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.155, - "details": { - "description": "min=0.155, mean=0.155, max=0.155, sum=0.155 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.47 (1)", - "tab": "Efficiency", - "score": 0.46984758591651915 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.292 (1)", - "tab": "Efficiency", - "score": 0.29179329943656923 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.706, mean=4.706, max=4.706, sum=4.706 (1)", - "tab": "General information", - "score": 4.706 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.036 (1)", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1493.994, mean=1493.994, max=1493.994, sum=1493.994 (1)", - "tab": "General information", - "score": 1493.994 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.254, mean=116.254, max=116.254, sum=116.254 (1)", - "tab": "General information", - "score": 116.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "details": { - "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.262, mean=0.262, max=0.262, sum=0.262 (1)", - "tab": "Efficiency", - "score": 0.2615062308311462 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.216, mean=254.216, max=254.216, sum=254.216 (1)", - "tab": "General information", - "score": 254.216 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518, - "details": { - "description": "min=0.31, mean=0.518, max=0.78, sum=2.592 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.27, mean=0.285, max=0.295, sum=1.426 (5)", - "tab": "Efficiency", - "score": 0.28525047320650343 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=2361.37 (5)", - "tab": "General information", - "score": 472.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.255, - "details": { - "description": "min=0.033, mean=0.255, max=0.465, sum=1.786 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.923, mean=1.129, max=1.577, sum=7.902 (7)", - "tab": "Efficiency", - "score": 1.1288332585709453 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.962, mean=6.916, max=8, sum=48.409 (7)", - "tab": "General information", - "score": 6.915558126084441 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=906.541, mean=1162.126, max=1511.442, sum=8134.881 (7)", - "tab": "General information", - "score": 1162.1258475895563 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581, - "details": { - "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.147, mean=1.147, max=1.147, sum=1.147 (1)", - "tab": "Efficiency", - "score": 1.1468114259243012 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.893, mean=938.893, max=938.893, sum=938.893 (1)", - "tab": "General information", - "score": 938.893 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.334, - "details": { - "description": "min=0.137, mean=0.334, max=0.537, sum=1.672 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.268, mean=0.303, max=0.381, sum=1.517 (5)", - "tab": "Efficiency", - "score": 
0.3034723702962031 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.337, mean=3.867, max=5, sum=19.337 (5)", - "tab": "General information", - "score": 3.8673469387755106 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.003, max=0.014, sum=0.014 (5)", - "tab": "General information", - "score": 0.002857142857142857 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.632, mean=566.249, max=1519.543, sum=2831.243 (5)", - "tab": "General information", - "score": 566.2485439511586 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.41, - "details": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.41 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.275 (1)", - "tab": "Efficiency", - "score": 0.27509861532783886 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1038.833, mean=1038.833, max=1038.833, sum=1038.833 (1)", - "tab": "General information", - "score": 1038.8330019880716 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.038, - "details": { - "description": "min=0.0, mean=0.038, max=0.113, sum=0.189 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.427, mean=0.47, max=0.534, sum=2.35 (5)", - "tab": "Efficiency", - "score": 0.47001117224047206 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=136.93, mean=181.692, max=241.656, sum=908.462 (5)", - "tab": "General information", - "score": 181.69235022556967 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json b/data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json deleted file mode 100644 index 6d945026f..000000000 --- a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/microsoft_phi-3-medium-4k-instruct/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3 14B", - "id": "microsoft/phi-3-medium-4k-instruct", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.12111111111111111 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=0.724 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=29.509, mean=29.509, max=29.509, sum=29.509 (1)", - "tab": "Efficiency", - "score": 29.5092350200868 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.392, mean=4.392, max=4.392, sum=4.392 (1)", - "tab": "General information", - "score": 4.391549295774648 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - 
# prompt tokens": { - "description": "min=3685.707, mean=3685.707, max=3685.707, sum=3685.707 (1)", - "tab": "General information", - "score": 3685.707042253521 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.245, mean=7.245, max=7.245, sum=7.245 (1)", - "tab": "General information", - "score": 7.245070422535211 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.278, - "details": { - "description": "min=0.278, mean=0.278, max=0.278, sum=0.278 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=44.238, mean=44.238, max=44.238, sum=44.238 (1)", - "tab": "Efficiency", - "score": 44.23756227874756 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=49.743, mean=49.743, max=49.743, sum=49.743 (1)", - "tab": "Efficiency", - "score": 49.743374599456786 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.83, mean=4.83, max=4.83, sum=4.83 (1)", - "tab": "General information", - "score": 4.83 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2307.695, mean=2307.695, max=2307.695, sum=2307.695 (1)", - "tab": "General information", - "score": 2307.695 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.371, mean=8.371, max=8.371, sum=8.371 (1)", - "tab": "General information", - "score": 8.371 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=156.383, mean=156.383, max=156.383, sum=156.383 (1)", - "tab": "General information", - "score": 156.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=10.079, mean=10.079, max=10.079, sum=10.079 (1)", - "tab": "General information", - "score": 10.079 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.916, - "details": { - "description": "min=0.916, mean=0.916, max=0.916, sum=0.916 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.385 (1)", - "tab": "Efficiency", - "score": 0.3850016188621521 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=291.574, mean=291.574, max=291.574, sum=291.574 (1)", - "tab": "General information", - "score": 291.574 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.48, mean=0.675, max=0.94, sum=3.375 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.381, mean=0.504, max=0.722, sum=2.52 (5)", - "tab": "Efficiency", - "score": 0.5039482383811682 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=406.65, mean=531.547, max=693.675, sum=2657.735 (5)", - "tab": "General information", - "score": 531.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "description": "min=0.462, mean=0.611, max=0.7, sum=4.277 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=67.969, 
mean=71.561, max=74.993, sum=500.925 (7)", - "tab": "Efficiency", - "score": 71.56076915436368 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)", - "tab": "General information", - "score": 1438.6362030100095 - }, - "MATH - # output tokens": { - "description": "min=357.548, mean=372.128, max=392.767, sum=2604.893 (7)", - "tab": "General information", - "score": 372.1276343562145 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=0.878 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=74.933, mean=74.933, max=74.933, sum=74.933 (1)", - "tab": "Efficiency", - "score": 74.93269198083877 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)", - "tab": "General information", - "score": 1207.746 - }, - "GSM8K - # output tokens": { - "description": "min=400, mean=400, max=400, sum=400 (1)", - "tab": "General information", - "score": 400.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593, - "details": { - "description": "min=0.365, mean=0.593, max=0.811, sum=2.966 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=5.972, mean=7.879, max=14.755, sum=39.397 (5)", - "tab": "Efficiency", - "score": 7.879368148866983 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - 
"LegalBench - # train": { - "description": "min=1.884, mean=4.177, max=5, sum=20.884 (5)", - "tab": "General information", - "score": 4.176734693877551 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.001, max=0.004, sum=0.004 (5)", - "tab": "General information", - "score": 0.0008163265306122449 - }, - "LegalBench - # prompt tokens": { - "description": "min=229.137, mean=1033.818, max=3646.718, sum=5169.092 (5)", - "tab": "General information", - "score": 1033.8183708736613 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.356, max=1.979, sum=6.782 (5)", - "tab": "General information", - "score": 1.3564703389458466 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=0.696 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.792, mean=1.792, max=1.792, sum=1.792 (1)", - "tab": "Efficiency", - "score": 1.7916561092581473 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1243.901, mean=1243.901, max=1243.901, sum=1243.901 (1)", - "tab": "General information", - "score": 1243.9005964214712 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.17, - "details": { - "description": "min=0.086, mean=0.17, max=0.218, sum=0.85 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=19.742, mean=19.987, max=20.079, sum=99.934 (5)", - "tab": "Efficiency", - "score": 19.98681167411759 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - 
"description": "min=135.523, mean=150.288, max=172.972, sum=751.438 (5)", - "tab": "General information", - "score": 150.28751290334915 - }, - "WMT 2014 - # output tokens": { - "description": "min=98.254, mean=99.651, max=100, sum=498.254 (5)", - "tab": "General information", - "score": 99.65089463220676 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json b/data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json deleted file mode 100644 index c7b88764b..000000000 --- a/data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/microsoft_phi-3-small-8k-instruct/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3 7B", - "id": "microsoft/phi-3-small-8k-instruct", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.18641975308641975 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.754, - "details": { - "description": "min=0.754, mean=0.754, max=0.754, sum=0.754 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=30.408, mean=30.408, max=30.408, sum=30.408 (1)", - "tab": "Efficiency", - "score": 30.40753108749927 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3485.67, mean=3485.67, max=3485.67, sum=3485.67 (1)", - "tab": "General information", - "score": 
3485.6704225352114 - }, - "NarrativeQA - # output tokens": { - "description": "min=33.71, mean=33.71, max=33.71, sum=33.71 (1)", - "tab": "General information", - "score": 33.709859154929575 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.324, - "details": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.324 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=68.232, mean=68.232, max=68.232, sum=68.232 (1)", - "tab": "Efficiency", - "score": 68.2322377743721 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=63.003, mean=63.003, max=63.003, sum=63.003 (1)", - "tab": "Efficiency", - "score": 63.00250503087044 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.965, mean=4.965, max=4.965, sum=4.965 (1)", - "tab": "General information", - "score": 4.965 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1675.981, mean=1675.981, max=1675.981, sum=1675.981 (1)", - "tab": "General information", - "score": 1675.981 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=16.786, mean=16.786, max=16.786, sum=16.786 (1)", - "tab": "General information", - "score": 16.786 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.127, mean=129.127, max=129.127, sum=129.127 (1)", - "tab": "General information", - "score": 129.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=36.311, mean=36.311, max=36.311, sum=36.311 (1)", - "tab": "General information", - "score": 36.311 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.912, - "details": { - "description": "min=0.912, mean=0.912, max=0.912, sum=0.912 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.289 (1)", - "tab": "Efficiency", - "score": 0.28856802701950074 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.782, mean=249.782, max=249.782, sum=249.782 (1)", - "tab": "General information", - "score": 249.782 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.44, mean=0.659, max=0.95, sum=3.296 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.275, mean=0.406, max=0.549, sum=2.032 (5)", - "tab": "Efficiency", - "score": 0.406433069689232 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.44, mean=467.72, max=614.43, sum=2338.6 (5)", - "tab": "General information", - "score": 467.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703, - "details": { - "description": "min=0.538, mean=0.703, max=0.933, sum=4.922 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=49.379, mean=60.681, max=73.413, sum=424.765 (7)", - "tab": "Efficiency", - "score": 60.680695580739844 - }, - "MATH - # eval": { - "description": 
"min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.911, max=2197.577, sum=8840.376 (7)", - "tab": "General information", - "score": 1262.9108741840687 - }, - "MATH - # output tokens": { - "description": "min=57.779, mean=115.236, max=283.904, sum=806.654 (7)", - "tab": "General information", - "score": 115.23627800867702 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "GSM8K - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "GSM8K - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "GSM8K - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "GSM8K - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "GSM8K - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.395, mean=0.584, max=0.895, sum=2.92 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=6.293, mean=8.342, max=16.012, sum=41.71 (5)", - "tab": "Efficiency", - "score": 8.34200078530511 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": 
"min=197.442, mean=1512.687, max=6294.008, sum=7563.435 (5)", - "tab": "General information", - "score": 1512.6870529886412 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.192, max=1.538, sum=5.96 (5)", - "tab": "General information", - "score": 1.192017037143267 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672, - "details": { - "description": "min=0.672, mean=0.672, max=0.672, sum=0.672 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.89, mean=0.89, max=0.89, sum=0.89 (1)", - "tab": "Efficiency", - "score": 0.8902683931126983 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1027.414, mean=1027.414, max=1027.414, sum=1027.414 (1)", - "tab": "General information", - "score": 1027.4135188866799 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.154, - "details": { - "description": "min=0.043, mean=0.154, max=0.205, sum=0.772 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=20.252, mean=20.399, max=20.714, sum=101.996 (5)", - "tab": "Efficiency", - "score": 20.399208641134514 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=114.901, mean=138.043, max=158.185, sum=690.213 (5)", - "tab": "General information", - "score": 138.04258583116683 - }, - "WMT 2014 - # output tokens": { - "description": "min=96.311, mean=96.966, max=98.575, sum=484.832 (5)", - "tab": "General information", - "score": 96.96643456568283 - } - } - }, - "generation_config": { - "additional_details": { - 
"language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json b/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json deleted file mode 100644 index fd0f8e02b..000000000 --- a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Instruct v0.3 7B", - "id": "mistralai/mistral-7b-instruct-v0.3", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.196, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6493133583021223 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.716, - "details": { - "description": "min=0.716, mean=0.716, max=0.716, sum=0.716 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.813, mean=0.813, max=0.813, sum=0.813 (1)", - "tab": "Efficiency", - "score": 0.8132137520212522 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3924.33, mean=3924.33, max=3924.33, sum=3924.33 (1)", - "tab": "General information", - "score": 3924.3295774647886 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.107, mean=7.107, max=7.107, sum=7.107 (1)", - "tab": "General information", - "score": 7.107042253521127 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - 
"dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253, - "details": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.253 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.563, mean=0.563, max=0.563, sum=0.563 (1)", - "tab": "Efficiency", - "score": 0.5634698050022126 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.535, mean=0.535, max=0.535, sum=0.535 (1)", - "tab": "Efficiency", - "score": 0.5347676448822022 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2498.79, mean=2498.79, max=2498.79, sum=2498.79 (1)", - "tab": "General information", - "score": 2498.79 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=12.448, mean=12.448, max=12.448, sum=12.448 (1)", - "tab": "General information", - "score": 12.448 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=172.069, mean=172.069, max=172.069, sum=172.069 (1)", - "tab": "General information", - "score": 172.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=20.461, mean=20.461, max=20.461, sum=20.461 (1)", - "tab": "General information", - "score": 20.461 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=0.79 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.256 (1)", - "tab": "Efficiency", - "score": 0.25593132400512697 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General 
information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=289.15, mean=289.15, max=289.15, sum=289.15 (1)", - "tab": "General information", - "score": 289.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.27, mean=0.51, max=0.79, sum=2.551 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.221, mean=0.372, max=0.487, sum=1.862 (5)", - "tab": "Efficiency", - "score": 0.37230395750413864 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=411.44, mean=532.091, max=696.175, sum=2660.455 (5)", - "tab": "General information", - "score": 532.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.289, - "details": { - "description": "min=0.115, mean=0.289, max=0.477, sum=2.02 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.027, mean=2.656, max=3.039, sum=18.593 (7)", - "tab": "Efficiency", - "score": 2.656151831465352 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - 
"description": "min=991.615, mean=1455.266, max=2502.962, sum=10186.865 (7)", - "tab": "General information", - "score": 1455.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=123.616, mean=149.99, max=172.789, sum=1049.933 (7)", - "tab": "General information", - "score": 149.99043902740354 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538, - "details": { - "description": "min=0.538, mean=0.538, max=0.538, sum=0.538 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.95, mean=3.95, max=3.95, sum=3.95 (1)", - "tab": "Efficiency", - "score": 3.949965229511261 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)", - "tab": "General information", - "score": 1187.268 - }, - "GSM8K - # output tokens": { - "description": "min=196.611, mean=196.611, max=196.611, sum=196.611 (1)", - "tab": "General information", - "score": 196.611 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331, - "details": { - "description": "min=0.063, mean=0.331, max=0.733, sum=1.655 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.316, mean=0.489, max=0.855, sum=2.444 (5)", - "tab": "Efficiency", - "score": 0.4887186054518059 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=236.453, mean=1750.748, max=7224.488, sum=8753.741 (5)", - "tab": "General information", - "score": 1750.7482458432962 - }, - "LegalBench - # output tokens": { - "description": 
"min=2, mean=9.174, max=15.242, sum=45.871 (5)", - "tab": "General information", - "score": 9.17419274343898 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517, - "details": { - "description": "min=0.517, mean=0.517, max=0.517, sum=0.517 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.418 (1)", - "tab": "Efficiency", - "score": 0.4182186216767692 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1202.093, mean=1202.093, max=1202.093, sum=1202.093 (1)", - "tab": "General information", - "score": 1202.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.142, - "details": { - "description": "min=0.047, mean=0.142, max=0.184, sum=0.712 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.582, mean=0.775, max=0.872, sum=3.875 (5)", - "tab": "Efficiency", - "score": 0.7750062139801958 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=148.306, mean=162.433, max=181.018, sum=812.166 (5)", - "tab": "General information", - "score": 162.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=28.3, mean=30.51, max=31.912, sum=152.552 (5)", - "tab": "General information", - "score": 30.510483732222053 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json b/data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json deleted file mode 100644 index 8f4801f23..000000000 --- a/data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-7b-v0.1/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral v0.1 7B", - "id": "mistralai/mistral-7b-v0.1", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.8075780274656679 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.716, - "details": { - "description": "min=0.716, mean=0.716, max=0.716, sum=0.716 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.705, mean=0.705, max=0.705, sum=0.705 (1)", - "tab": "Efficiency", - "score": 0.7051956902087574 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.575, mean=4.575, max=4.575, sum=4.575 (1)", - "tab": "General information", - "score": 4.574647887323944 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3627.715, mean=3627.715, max=3627.715, sum=3627.715 (1)", - "tab": "General information", - "score": 3627.7154929577464 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367, - "details": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.367 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.494, mean=0.494, max=0.494, sum=0.494 (1)", - "tab": "Efficiency", - "score": 0.49417281556129455 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.462, mean=0.462, max=0.462, sum=0.462 (1)", - "tab": "Efficiency", - "score": 0.46181689071655274 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.832, mean=4.832, max=4.832, sum=4.832 (1)", - "tab": "General information", - "score": 4.832 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2268.728, mean=2268.728, max=2268.728, sum=2268.728 (1)", - "tab": "General information", - "score": 2268.728 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.988, mean=0.988, max=0.988, sum=0.988 (1)", - "tab": "General information", - "score": 0.988 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=142.069, mean=142.069, max=142.069, sum=142.069 (1)", - "tab": "General information", - "score": 142.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.776, - "details": { - "description": "min=0.776, mean=0.776, max=0.776, sum=0.776 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.325 (1)", - "tab": "Efficiency", - "score": 0.32474704647064206 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - 
"description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)", - "tab": "General information", - "score": 280.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.31, mean=0.584, max=0.85, sum=2.918 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.272, mean=0.291, max=0.304, sum=1.457 (5)", - "tab": "Efficiency", - "score": 0.2914179778851961 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)", - "tab": "General information", - "score": 523.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297, - "details": { - "description": "min=0.067, mean=0.297, max=0.43, sum=2.082 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.992, mean=1.159, max=1.576, sum=8.114 (7)", - "tab": "Efficiency", - "score": 1.159214100149656 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=991.615, mean=1455.266, max=2502.962, 
sum=10186.865 (7)", - "tab": "General information", - "score": 1455.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.377, - "details": { - "description": "min=0.377, mean=0.377, max=0.377, sum=0.377 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.632, mean=1.632, max=1.632, sum=1.632 (1)", - "tab": "Efficiency", - "score": 1.6323128745555877 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)", - "tab": "General information", - "score": 1187.268 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.433, mean=0.58, max=0.789, sum=2.901 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.287, mean=0.353, max=0.577, sum=1.765 (5)", - "tab": "Efficiency", - "score": 0.35307050709631943 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=1.969, mean=4.194, max=5, sum=20.969 (5)", - "tab": "General information", - "score": 4.1938775510204085 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=219.453, mean=998.503, max=3534.259, sum=4992.513 (5)", - "tab": "General information", - "score": 998.5025315575822 - }, - "LegalBench - # output tokens": { - "description": "min=0.992, mean=0.998, max=1, sum=4.992 (5)", - "tab": "General information", - "score": 0.9983673469387755 - } - } - }, - 
"generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525, - "details": { - "description": "min=0.525, mean=0.525, max=0.525, sum=0.525 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.348 (1)", - "tab": "Efficiency", - "score": 0.3478535307093596 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)", - "tab": "General information", - "score": 1193.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.16, - "details": { - "description": "min=0.056, mean=0.16, max=0.201, sum=0.802 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.52, mean=0.561, max=0.701, sum=2.803 (5)", - "tab": "Efficiency", - "score": 0.5605853292576617 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=130.306, mean=144.433, max=163.018, sum=722.166 (5)", - "tab": "General information", - "score": 144.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json b/data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json deleted file mode 100644 index 
d8d60cc37..000000000 --- a/data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-large-2402/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Large 2402", - "id": "mistralai/mistral-large-2402", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.23681647940074904 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.454, - "details": { - "description": "min=0.454, mean=0.454, max=0.454, sum=0.454 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.675, mean=1.675, max=1.675, sum=1.675 (1)", - "tab": "Efficiency", - "score": 1.6750120075655655 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3955.33, mean=3955.33, max=3955.33, sum=3955.33 (1)", - "tab": "General information", - "score": 3955.3295774647886 - }, - "NarrativeQA - # output tokens": { - "description": "min=22.614, mean=22.614, max=22.614, sum=22.614 (1)", - "tab": "General information", - "score": 22.614084507042254 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.311, - "details": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.311 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.666, mean=1.666, max=1.666, sum=1.666 (1)", - "tab": "Efficiency", - "score": 1.665770656108856 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=2.122, mean=2.122, max=2.122, sum=2.122 (1)", - "tab": "Efficiency", - "score": 2.1218616259098053 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2537.79, mean=2537.79, max=2537.79, sum=2537.79 (1)", - "tab": "General information", - "score": 2537.79 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=27.337, mean=27.337, max=27.337, sum=27.337 (1)", - "tab": "General information", - "score": 27.337 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=211.069, mean=211.069, max=211.069, sum=211.069 (1)", - "tab": "General information", - "score": 211.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=27.91, mean=27.91, max=27.91, sum=27.91 (1)", - "tab": "General information", - "score": 27.91 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=0.894 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.569, mean=0.569, max=0.569, sum=0.569 (1)", - "tab": "Efficiency", - "score": 0.5687967395782471 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=280.15, mean=280.15, max=280.15, 
sum=280.15 (1)", - "tab": "General information", - "score": 280.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638, - "details": { - "description": "min=0.38, mean=0.638, max=0.92, sum=3.19 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.226, mean=1.451, max=1.633, sum=7.257 (5)", - "tab": "Efficiency", - "score": 1.4514196366845515 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)", - "tab": "General information", - "score": 523.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.632, mean=0.75, max=0.904, sum=5.253 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.885, mean=5.128, max=5.812, sum=35.896 (7)", - "tab": "Efficiency", - "score": 5.128044104863146 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=1061.615, mean=1525.266, max=2572.962, sum=10676.865 (7)", - "tab": "General information", - "score": 1525.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=101.444, mean=128.216, max=154.897, sum=897.515 (7)", - "tab": "General information", - "score": 128.21647245723133 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - 
"algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.694, mean=0.694, max=0.694, sum=0.694 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=7.095, mean=7.095, max=7.095, sum=7.095 (1)", - "tab": "Efficiency", - "score": 7.095049407720566 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1255.268, mean=1255.268, max=1255.268, sum=1255.268 (1)", - "tab": "General information", - "score": 1255.268 - }, - "GSM8K - # output tokens": { - "description": "min=129.185, mean=129.185, max=129.185, sum=129.185 (1)", - "tab": "General information", - "score": 129.185 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.479, - "details": { - "description": "min=0.1, mean=0.479, max=0.821, sum=2.394 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.985, mean=1.692, max=2.787, sum=8.462 (5)", - "tab": "Efficiency", - "score": 1.6924799473534797 - }, - "LegalBench - # eval": { - "description": "min=50, mean=312.4, max=1000, sum=1562 (5)", - "tab": "General information", - "score": 312.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=219.46, mean=1783.252, max=7251, sum=8916.261 (5)", - "tab": "General information", - "score": 1783.2521685070988 - }, - "LegalBench - # output tokens": { - "description": "min=1.005, mean=8.217, max=25.86, sum=41.087 (5)", - "tab": "General information", - "score": 8.217420478990393 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.499, - "details": { - "description": "min=0.499, mean=0.499, max=0.499, sum=0.499 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.595, mean=0.595, max=0.595, sum=0.595 (1)", - "tab": "Efficiency", - "score": 0.5950325303238856 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)", - "tab": "General information", - "score": 1193.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182, - "details": { - "description": "min=0.098, mean=0.182, max=0.224, sum=0.909 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.69, mean=1.969, max=2.702, sum=9.846 (5)", - "tab": "Efficiency", - "score": 1.969239294333439 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=198.306, mean=212.433, max=231.018, sum=1062.166 (5)", - "tab": "General information", - "score": 212.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=27.272, mean=29.042, max=29.871, sum=145.211 (5)", - "tab": "General information", - "score": 29.04227089386756 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json b/data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json deleted file mode 100644 index d75c9932b..000000000 --- a/data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"helm_lite/mistralai_mistral-large-2407/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Large 2 2407", - "id": "mistralai/mistral-large-2407", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.744, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.4191385767790262 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=0.779 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.728, mean=0.728, max=0.728, sum=0.728 (1)", - "tab": "Efficiency", - "score": 0.7276979574015443 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3914.33, mean=3914.33, max=3914.33, sum=3914.33 (1)", - "tab": "General information", - "score": 3914.3295774647886 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.2, mean=6.2, max=6.2, sum=6.2 (1)", - "tab": "General information", - "score": 6.2 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.453, - "details": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.453 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time 
(s)": { - "description": "min=0.757, mean=0.757, max=0.757, sum=0.757 (1)", - "tab": "Efficiency", - "score": 0.7573216142654419 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.527, mean=0.527, max=0.527, sum=0.527 (1)", - "tab": "Efficiency", - "score": 0.5273597676753998 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2488.79, mean=2488.79, max=2488.79, sum=2488.79 (1)", - "tab": "General information", - "score": 2488.79 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.994, mean=7.994, max=7.994, sum=7.994 (1)", - "tab": "General information", - "score": 7.994 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=162.069, mean=162.069, max=162.069, sum=162.069 (1)", - "tab": "General information", - "score": 162.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.496, mean=6.496, max=6.496, sum=6.496 (1)", - "tab": "General information", - "score": 6.496 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=0.932 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.891, mean=0.891, max=0.891, sum=0.891 (1)", - "tab": "Efficiency", - "score": 0.8910596170425416 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=290.15, mean=290.15, max=290.15, sum=290.15 (1)", - "tab": "General information", - "score": 290.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "details": { - "description": "min=0.52, mean=0.725, max=0.9, sum=3.623 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.684, mean=0.789, max=0.933, sum=3.943 (5)", - "tab": "Efficiency", - "score": 0.7886472435834114 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=412.44, mean=533.091, max=697.175, sum=2665.455 (5)", - "tab": "General information", - "score": 533.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "description": "min=0.342, mean=0.677, max=0.881, sum=4.737 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=4.359, mean=5.441, max=6.464, sum=38.087 (7)", - "tab": "Efficiency", - "score": 5.441067432619708 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=992.615, mean=1456.266, max=2503.962, sum=10193.865 (7)", - "tab": "General information", - "score": 1456.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=129.395, mean=180.319, max=220.298, sum=1262.231 (7)", - "tab": "General information", - "score": 180.3187090913529 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" 
- } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.912, - "details": { - "description": "min=0.912, mean=0.912, max=0.912, sum=0.912 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.431, mean=5.431, max=5.431, sum=5.431 (1)", - "tab": "Efficiency", - "score": 5.431343378543854 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)", - "tab": "General information", - "score": 1187.268 - }, - "GSM8K - # output tokens": { - "description": "min=205.748, mean=205.748, max=205.748, sum=205.748 (1)", - "tab": "General information", - "score": 205.748 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.646, - "details": { - "description": "min=0.229, mean=0.646, max=1, sum=3.23 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.485, mean=0.797, max=0.986, sum=3.987 (5)", - "tab": "Efficiency", - "score": 0.7974768901406878 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=227.453, mean=1741.348, max=7215.488, sum=8706.741 (5)", - "tab": "General information", - "score": 1741.3482458432961 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=3.484, max=8.469, sum=17.42 (5)", - "tab": "General information", - "score": 3.484006654237774 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM 
on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.775, mean=0.775, max=0.775, sum=0.775 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.446 (1)", - "tab": "Efficiency", - "score": 0.4456319799480097 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1203.093, mean=1203.093, max=1203.093, sum=1203.093 (1)", - "tab": "General information", - "score": 1203.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.192, - "details": { - "description": "min=0.14, mean=0.192, max=0.231, sum=0.962 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.075, mean=1.269, max=1.402, sum=6.343 (5)", - "tab": "Efficiency", - "score": 1.2686868536542282 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=139.306, mean=153.433, max=172.018, sum=767.166 (5)", - "tab": "General information", - "score": 153.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=29.153, mean=30.306, max=33.358, sum=151.531 (5)", - "tab": "General information", - "score": 30.30625095580364 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json b/data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json deleted file mode 100644 index 6bb7115e2..000000000 --- a/data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-medium-2312/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Mistral Medium 2312", - "id": "mistralai/mistral-medium-2312", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.268, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.06677902621722846 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449, - "details": { - "description": "min=0.449, mean=0.449, max=0.449, sum=0.449 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=3.898, mean=3.898, max=3.898, sum=3.898 (1)", - "tab": "Efficiency", - "score": 3.898151301666045 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3955.33, mean=3955.33, max=3955.33, sum=3955.33 (1)", - "tab": "General information", - "score": 3955.3295774647886 - }, - "NarrativeQA - # output tokens": { - "description": "min=24.885, mean=24.885, max=24.885, sum=24.885 (1)", - "tab": "General information", - "score": 24.88450704225352 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.29 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=5.342, mean=5.342, max=5.342, sum=5.342 (1)", - "tab": "Efficiency", - "score": 5.342489146232605 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=6.588, mean=6.588, 
max=6.588, sum=6.588 (1)", - "tab": "Efficiency", - "score": 6.588117929935455 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2537.79, mean=2537.79, max=2537.79, sum=2537.79 (1)", - "tab": "General information", - "score": 2537.79 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=32.377, mean=32.377, max=32.377, sum=32.377 (1)", - "tab": "General information", - "score": 32.377 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=211.069, mean=211.069, max=211.069, sum=211.069 (1)", - "tab": "General information", - "score": 211.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=34.263, mean=34.263, max=34.263, sum=34.263 (1)", - "tab": "General information", - "score": 34.263 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=0.83 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=2.12, mean=2.12, max=2.12, sum=2.12 (1)", - "tab": "Efficiency", - "score": 2.1195812821388245 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)", - "tab": "General information", - "score": 280.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=0.968, mean=0.968, max=0.968, sum=0.968 (1)", - "tab": "General information", - "score": 0.968 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618, - "details": { - "description": "min=0.32, mean=0.618, max=0.91, sum=3.089 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.507, mean=2.775, max=3.62, sum=13.874 (5)", - "tab": "Efficiency", - "score": 2.774717758923246 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)", - "tab": "General information", - "score": 523.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=0.93, mean=0.97, max=0.991, sum=4.851 (5)", - "tab": "General information", - "score": 0.9702456140350877 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "details": { - "description": "min=0.4, mean=0.565, max=0.756, sum=3.958 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=6.1, mean=7.086, max=10.207, sum=49.602 (7)", - "tab": "Efficiency", - "score": 7.0860357509079535 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=1061.615, mean=1525.266, max=2572.962, sum=10676.865 (7)", - "tab": "General information", - "score": 1525.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=80, mean=113.328, max=132.25, sum=793.295 (7)", - "tab": "General information", - "score": 113.3278270462481 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=0.706 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=9.719, mean=9.719, max=9.719, sum=9.719 (1)", - "tab": "Efficiency", - "score": 9.718977437496186 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1255.268, mean=1255.268, max=1255.268, sum=1255.268 (1)", - "tab": "General information", - "score": 1255.268 - }, - "GSM8K - # output tokens": { - "description": "min=137.554, mean=137.554, max=137.554, sum=137.554 (1)", - "tab": "General information", - "score": 137.554 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.066, mean=0.452, max=0.692, sum=2.258 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=2.695, mean=3.248, max=3.795, sum=16.242 (5)", - "tab": "Efficiency", - "score": 3.248400288401771 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=287.453, mean=1801.748, max=7275.488, sum=9008.741 (5)", - "tab": "General information", - "score": 1801.7482458432964 - }, - "LegalBench - # output tokens": { - "description": "min=1.008, mean=8.476, max=25.305, sum=42.382 (5)", - "tab": "General information", - "score": 8.47642872361909 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.61, - "details": { - "description": "min=0.61, mean=0.61, max=0.61, sum=0.61 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=2.813, mean=2.813, max=2.813, sum=2.813 (1)", - "tab": "Efficiency", - "score": 2.813041030531138 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)", - "tab": "General information", - "score": 1193.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=0.95, mean=0.95, max=0.95, sum=0.95 (1)", - "tab": "General information", - "score": 0.9502982107355865 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.169, - "details": { - "description": "min=0.07, mean=0.169, max=0.22, sum=0.844 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=3.982, mean=4.948, max=6.067, sum=24.741 (5)", - "tab": "Efficiency", - "score": 4.9482336292575715 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=198.306, mean=212.433, max=231.018, sum=1062.166 (5)", - "tab": "General information", - "score": 212.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=26.33, mean=27.816, max=30.692, sum=139.08 (5)", - "tab": "General information", - "score": 27.81599632971402 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json b/data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json deleted file mode 100644 index 1f2cb2632..000000000 --- a/data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-small-2402/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Small 2402", - "id": "mistralai/mistral-small-2402", - 
"developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.39283395755305867 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.519, mean=0.519, max=0.519, sum=0.519 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.948, mean=0.948, max=0.948, sum=0.948 (1)", - "tab": "Efficiency", - "score": 0.947719474577568 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3955.33, mean=3955.33, max=3955.33, sum=3955.33 (1)", - "tab": "General information", - "score": 3955.3295774647886 - }, - "NarrativeQA - # output tokens": { - "description": "min=21.775, mean=21.775, max=21.775, sum=21.775 (1)", - "tab": "General information", - "score": 21.774647887323944 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.304, - "details": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.304 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.384, mean=1.384, max=1.384, sum=1.384 (1)", - "tab": "Efficiency", - "score": 1.384453837633133 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.442, mean=1.442, max=1.442, sum=1.442 (1)", - "tab": "Efficiency", - "score": 1.4422871778011321 - }, - "NaturalQuestions 
(open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2537.79, mean=2537.79, max=2537.79, sum=2537.79 (1)", - "tab": "General information", - "score": 2537.79 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=21.017, mean=21.017, max=21.017, sum=21.017 (1)", - "tab": "General information", - "score": 21.017 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=211.069, mean=211.069, max=211.069, sum=211.069 (1)", - "tab": "General information", - "score": 211.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=27.473, mean=27.473, max=27.473, sum=27.473 (1)", - "tab": "General information", - "score": 27.473 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=0.53 (1)", - "tab": "Efficiency", - "score": 0.5299914984703064 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)", - "tab": "General information", - "score": 280.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593, - "details": { - "description": "min=0.26, mean=0.593, max=0.89, sum=2.964 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.022, mean=1.262, max=1.477, sum=6.308 (5)", - "tab": "Efficiency", - "score": 1.2616501861371492 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)", - "tab": "General information", - "score": 523.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621, - "details": { - "description": "min=0.367, mean=0.621, max=0.859, sum=4.344 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.895, mean=2.217, max=2.662, sum=15.518 (7)", - "tab": "Efficiency", - "score": 2.216904607788028 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=1061.615, mean=1525.266, max=2572.962, sum=10676.865 (7)", - "tab": "General information", - "score": 1525.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=104.221, mean=125.526, max=154.904, sum=878.68 (7)", - "tab": "General information", - "score": 125.52572529016837 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.734, - "details": { - "description": "min=0.734, mean=0.734, max=0.734, sum=0.734 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.972, mean=2.972, max=2.972, sum=2.972 (1)", - "tab": "Efficiency", - "score": 2.9720949590206147 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1255.268, mean=1255.268, max=1255.268, sum=1255.268 (1)", - "tab": "General information", - "score": 1255.268 - }, - "GSM8K - # output tokens": { - "description": "min=148.06, mean=148.06, max=148.06, sum=148.06 (1)", - "tab": "General information", - "score": 148.06 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.389, - "details": { - "description": "min=0, mean=0.389, max=0.789, sum=1.947 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.609, mean=0.874, max=1.067, sum=4.369 (5)", - "tab": "Efficiency", - "score": 0.8738773620338431 - }, - "LegalBench - # eval": { - "description": "min=50, mean=312.4, max=1000, sum=1562 (5)", - "tab": "General information", - "score": 312.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=219.46, mean=1783.252, max=7251, sum=8916.261 (5)", - "tab": "General information", - "score": 1783.2521685070988 - }, - "LegalBench - # output tokens": { - "description": "min=1.716, mean=12.778, max=30, sum=63.891 (5)", - "tab": "General information", - "score": 12.778290319804961 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - 
"details": { - "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.499, mean=0.499, max=0.499, sum=0.499 (1)", - "tab": "Efficiency", - "score": 0.4987720272413068 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)", - "tab": "General information", - "score": 1193.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.169, - "details": { - "description": "min=0.076, mean=0.169, max=0.215, sum=0.843 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.945, mean=1.189, max=1.429, sum=5.943 (5)", - "tab": "Efficiency", - "score": 1.1885517670659458 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=198.306, mean=212.433, max=231.018, sum=1062.166 (5)", - "tab": "General information", - "score": 212.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=26.479, mean=28.3, max=29.024, sum=141.498 (5)", - "tab": "General information", - "score": 28.29957084416578 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json b/data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json deleted file mode 100644 index e6bfd0332..000000000 --- a/data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mixtral-8x22b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral 8x22B", - "id": "mistralai/mixtral-8x22b", - "developer": "mistralai", - "inference_platform": "unknown" - }, - 
"evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.705, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5349563046192259 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=0.779 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.478, mean=1.478, max=1.478, sum=1.478 (1)", - "tab": "Efficiency", - "score": 1.477503587158633 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3886.33, mean=3886.33, max=3886.33, sum=3886.33 (1)", - "tab": "General information", - "score": 3886.3295774647886 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478, - "details": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.478 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.004, mean=1.004, max=1.004, sum=1.004 (1)", - "tab": "Efficiency", - "score": 1.003950766324997 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.442 (1)", - "tab": "Efficiency", - "score": 0.44196626234054565 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": 
"General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2468.79, mean=2468.79, max=2468.79, sum=2468.79 (1)", - "tab": "General information", - "score": 2468.79 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=142.069, mean=142.069, max=142.069, sum=142.069 (1)", - "tab": "General information", - "score": 142.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=0.882 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.338 (1)", - "tab": "Efficiency", - "score": 0.33846320056915286 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)", - "tab": "General information", - "score": 280.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.701, - "details": { - "description": "min=0.48, mean=0.701, max=0.95, sum=3.507 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.313, mean=0.344, max=0.359, sum=1.722 (5)", - "tab": "Efficiency", - "score": 0.344487278235586 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)", - "tab": "General information", - "score": 523.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "description": "min=0.5, mean=0.656, max=0.822, sum=4.589 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.009, mean=2.509, max=3.121, sum=17.565 (7)", - "tab": "Efficiency", - "score": 2.5093491334109825 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=991.615, mean=1455.266, max=2502.962, sum=10186.865 (7)", - "tab": "General information", - "score": 1455.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=0.8 
(1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.539, mean=3.539, max=3.539, sum=3.539 (1)", - "tab": "Efficiency", - "score": 3.5390553929805755 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)", - "tab": "General information", - "score": 1187.268 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.708, - "details": { - "description": "min=0.441, mean=0.708, max=0.968, sum=3.539 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.372, mean=0.821, max=1.973, sum=4.107 (5)", - "tab": "Efficiency", - "score": 0.8213642223004287 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=219.453, mean=1733.148, max=7207.488, sum=8665.741 (5)", - "tab": "General information", - "score": 1733.148245843296 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.704, mean=0.704, max=0.704, sum=0.704 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.463 (1)", - "tab": "Efficiency", - "score": 0.46328771849038825 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": 
"min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)", - "tab": "General information", - "score": 1193.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.209, - "details": { - "description": "min=0.133, mean=0.209, max=0.243, sum=1.045 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.928, mean=0.963, max=0.982, sum=4.813 (5)", - "tab": "Efficiency", - "score": 0.9626315307056144 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=130.306, mean=144.433, max=163.018, sum=722.166 (5)", - "tab": "General information", - "score": 144.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json b/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json deleted file mode 100644 index 7bf0323b1..000000000 --- a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mixtral-8x7b-32kseqlen/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral 8x7B 32K seqlen", - "id": "mistralai/mixtral-8x7b-32kseqlen", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6727715355805244 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.767, - "details": { - "description": "min=0.767, mean=0.767, max=0.767, sum=0.767 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=0.65 (1)", - "tab": "Efficiency", - "score": 0.649569604766201 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.575, mean=4.575, max=4.575, sum=4.575 (1)", - "tab": "General information", - "score": 4.574647887323944 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3627.715, mean=3627.715, max=3627.715, sum=3627.715 (1)", - "tab": "General information", - "score": 3627.7154929577464 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.427, - "details": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.427 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "Efficiency", - "score": 0.507013471364975 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.513, mean=0.513, max=0.513, sum=0.513 (1)", - "tab": "Efficiency", - "score": 0.5133386459350586 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.832, mean=4.832, max=4.832, sum=4.832 (1)", - "tab": "General information", - "score": 4.832 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions 
(open-book) - # prompt tokens": { - "description": "min=2268.728, mean=2268.728, max=2268.728, sum=2268.728 (1)", - "tab": "General information", - "score": 2268.728 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.991, mean=0.991, max=0.991, sum=0.991 (1)", - "tab": "General information", - "score": 0.991 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=142.069, mean=142.069, max=142.069, sum=142.069 (1)", - "tab": "General information", - "score": 142.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=0.868 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.354 (1)", - "tab": "Efficiency", - "score": 0.3542211503982544 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)", - "tab": "General information", - "score": 280.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.38, mean=0.649, max=0.93, sum=3.245 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.355, mean=0.36, max=0.366, sum=1.802 (5)", - "tab": "Efficiency", - "score": 
0.3604579553102192 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)", - "tab": "General information", - "score": 523.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494, - "details": { - "description": "min=0.289, mean=0.494, max=0.696, sum=3.459 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.128, mean=1.528, max=2.033, sum=10.695 (7)", - "tab": "Efficiency", - "score": 1.527861329055259 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=991.615, mean=1455.266, max=2502.962, sum=10186.865 (7)", - "tab": "General information", - "score": 1455.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.622, - "details": { - "description": "min=0.622, mean=0.622, max=0.622, sum=0.622 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.273, mean=3.273, max=3.273, sum=3.273 (1)", - "tab": "Efficiency", - "score": 3.2728567245006563 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - 
"score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)", - "tab": "General information", - "score": 1187.268 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "details": { - "description": "min=0.428, mean=0.63, max=0.853, sum=3.15 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.369, mean=0.41, max=0.512, sum=2.05 (5)", - "tab": "Efficiency", - "score": 0.40995627823211056 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=1.969, mean=4.194, max=5, sum=20.969 (5)", - "tab": "General information", - "score": 4.1938775510204085 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=219.453, mean=998.503, max=3534.259, sum=4992.513 (5)", - "tab": "General information", - "score": 998.5025315575822 - }, - "LegalBench - # output tokens": { - "description": "min=0.998, mean=1.0, max=1, sum=4.998 (5)", - "tab": "General information", - "score": 0.9995918367346939 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.652, mean=0.652, max=0.652, sum=0.652 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)", - "tab": "Efficiency", - "score": 0.35297762423338996 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": 
"min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)", - "tab": "General information", - "score": 1193.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.19, - "details": { - "description": "min=0.099, mean=0.19, max=0.23, sum=0.949 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.115, mean=1.202, max=1.294, sum=6.011 (5)", - "tab": "Efficiency", - "score": 1.2021687407719377 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=130.306, mean=144.433, max=163.018, sum=722.166 (5)", - "tab": "General information", - "score": 144.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=0.994, mean=0.999, max=1, sum=4.994 (5)", - "tab": "General information", - "score": 0.998798076923077 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json b/data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json deleted file mode 100644 index 7fee5cb57..000000000 --- a/data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_open-mistral-nemo-2407/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral NeMo 2402", - "id": "mistralai/open-mistral-nemo-2407", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5309862671660425 - 
}, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=0.731 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.711, mean=0.711, max=0.711, sum=0.711 (1)", - "tab": "Efficiency", - "score": 0.7111437549053783 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3527.392, mean=3527.392, max=3527.392, sum=3527.392 (1)", - "tab": "General information", - "score": 3527.3915492957744 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.901, mean=6.901, max=6.901, sum=6.901 (1)", - "tab": "General information", - "score": 6.901408450704225 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.265, - "details": { - "description": "min=0.265, mean=0.265, max=0.265, sum=0.265 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.852, mean=0.852, max=0.852, sum=0.852 (1)", - "tab": "Efficiency", - "score": 0.851971923828125 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.877, mean=0.877, max=0.877, sum=0.877 (1)", - "tab": "Efficiency", - "score": 0.8765462198257447 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2032.962, mean=2032.962, max=2032.962, sum=2032.962 (1)", - "tab": "General information", - "score": 2032.962 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.927, mean=5.927, max=5.927, sum=5.927 
(1)", - "tab": "General information", - "score": 5.927 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.405, mean=137.405, max=137.405, sum=137.405 (1)", - "tab": "General information", - "score": 137.405 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.595, mean=3.595, max=3.595, sum=3.595 (1)", - "tab": "General information", - "score": 3.595 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=0.822 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)", - "tab": "Efficiency", - "score": 0.7987758111953736 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.246, mean=248.246, max=248.246, sum=248.246 (1)", - "tab": "General information", - "score": 248.246 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.604, - "details": { - "description": "min=0.29, mean=0.604, max=0.89, sum=3.021 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.635, mean=0.782, max=1.011, sum=3.908 (5)", - "tab": "Efficiency", - "score": 0.7815720957371226 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 
5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=377.89, mean=479.924, max=631.851, sum=2399.621 (5)", - "tab": "General information", - "score": 479.9241754385965 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.668, - "details": { - "description": "min=0.558, mean=0.668, max=0.852, sum=4.679 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.866, mean=1.013, max=1.281, sum=7.093 (7)", - "tab": "Efficiency", - "score": 1.0132869822173503 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=915.846, mean=1317.28, max=2238.885, sum=9220.959 (7)", - "tab": "General information", - "score": 1317.2798769434019 - }, - "MATH - # output tokens": { - "description": "min=97.456, mean=111.745, max=141.433, sum=782.217 (7)", - "tab": "General information", - "score": 111.74533800213115 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=0.782 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.425, mean=1.425, max=1.425, sum=1.425 (1)", - "tab": "Efficiency", - "score": 1.4254731934070588 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General 
information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1134.356, mean=1134.356, max=1134.356, sum=1134.356 (1)", - "tab": "General information", - "score": 1134.356 - }, - "GSM8K - # output tokens": { - "description": "min=187.859, mean=187.859, max=187.859, sum=187.859 (1)", - "tab": "General information", - "score": 187.859 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415, - "details": { - "description": "min=0.232, mean=0.415, max=0.758, sum=2.076 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.715, mean=0.78, max=0.868, sum=3.898 (5)", - "tab": "Efficiency", - "score": 0.7795765090728288 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=200.716, mean=1561.36, max=6486.116, sum=7806.8 (5)", - "tab": "General information", - "score": 1561.3600575619662 - }, - "LegalBench - # output tokens": { - "description": "min=4.94, mean=8.473, max=15.796, sum=42.365 (5)", - "tab": "General information", - "score": 8.473099835809844 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=0.59 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.749, mean=0.749, max=0.749, sum=0.749 (1)", - "tab": "Efficiency", - "score": 0.7488490715178533 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1022.543, mean=1022.543, max=1022.543, sum=1022.543 (1)", - "tab": "General information", - "score": 1022.5427435387674 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.177, - "details": { - "description": "min=0.111, mean=0.177, max=0.211, sum=0.887 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.752, mean=0.782, max=0.819, sum=3.911 (5)", - "tab": "Efficiency", - "score": 0.7821908106898373 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=81.661, mean=110.163, max=135.306, sum=550.814 (5)", - "tab": "General information", - "score": 110.16282784064842 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.622, mean=26.542, max=27.26, sum=132.709 (5)", - "tab": "General information", - "score": 26.541759538920324 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json b/data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json deleted file mode 100644 index 878d33981..000000000 --- a/data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-3.5 Turbo 0613", - "id": "openai/gpt-3.5-turbo-0613", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.956641697877653 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { 
- "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655, - "details": { - "description": "min=0.655, mean=0.655, max=0.655, sum=0.655 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.381 (1)", - "tab": "Efficiency", - "score": 0.3810261323418416 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.946, mean=4.946, max=4.946, sum=4.946 (1)", - "tab": "General information", - "score": 4.946478873239436 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3493.662, mean=3493.662, max=3493.662, sum=3493.662 (1)", - "tab": "General information", - "score": 3493.6619718309857 - }, - "NarrativeQA - # output tokens": { - "description": "min=9.91, mean=9.91, max=9.91, sum=9.91 (1)", - "tab": "General information", - "score": 9.909859154929578 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.335, - "details": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.335 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.305 (1)", - "tab": "Efficiency", - "score": 0.30532183837890625 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.221, mean=0.221, max=0.221, sum=0.221 (1)", - "tab": "Efficiency", - "score": 0.22069251775741577 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.884, mean=4.884, max=4.884, sum=4.884 (1)", - "tab": "General information", - "score": 4.884 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.019, mean=0.019, max=0.019, sum=0.019 (1)", - "tab": "General information", - "score": 0.019 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1649.552, mean=1649.552, max=1649.552, sum=1649.552 (1)", - "tab": "General information", - "score": 1649.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=9.389, mean=9.389, max=9.389, sum=9.389 (1)", - "tab": "General information", - "score": 9.389 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": 
"General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=173.127, mean=173.127, max=173.127, sum=173.127 (1)", - "tab": "General information", - "score": 173.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.576, mean=5.576, max=5.576, sum=5.576 (1)", - "tab": "General information", - "score": 5.576 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=0.838 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.172, mean=0.172, max=0.172, sum=0.172 (1)", - "tab": "Efficiency", - "score": 0.17227248001098633 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.782, mean=242.782, max=242.782, sum=242.782 (1)", - "tab": "General information", - "score": 242.782 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.38, mean=0.614, max=0.88, sum=3.07 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.171, mean=0.175, max=0.177, sum=0.875 (5)", - "tab": "Efficiency", - "score": 0.1750619323630082 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - 
"description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)", - "tab": "General information", - "score": 460.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.533, mean=0.667, max=0.826, sum=4.667 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.741, mean=0.813, max=0.963, sum=5.69 (7)", - "tab": "Efficiency", - "score": 0.8128212395123947 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)", - "tab": "General information", - "score": 1323.910874184069 - }, - "MATH - # output tokens": { - "description": "min=53.5, mean=60.844, max=77.4, sum=425.908 (7)", - "tab": "General information", - "score": 60.844003793024605 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.501, - "details": { - "description": "min=0.501, mean=0.501, max=0.501, sum=0.501 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=0.898, mean=0.898, max=0.898, sum=0.898 (1)", - "tab": "Efficiency", - "score": 0.8983073465824127 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)", - "tab": "General information", - 
"score": 1020.035 - }, - "GSM8K - # output tokens": { - "description": "min=77.29, mean=77.29, max=77.29, sum=77.29 (1)", - "tab": "General information", - "score": 77.29 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.528, - "details": { - "description": "min=0.302, mean=0.528, max=0.747, sum=2.642 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.178, mean=0.202, max=0.277, sum=1.011 (5)", - "tab": "Efficiency", - "score": 0.20213919553681423 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2.09, mean=4.218, max=5, sum=21.09 (5)", - "tab": "General information", - "score": 4.21795918367347 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=253.442, mean=949.517, max=3254.159, sum=4747.586 (5)", - "tab": "General information", - "score": 949.5172570702738 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.387, max=2.032, sum=6.934 (5)", - "tab": "General information", - "score": 1.3868394951957552 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.622, - "details": { - "description": "min=0.622, mean=0.622, max=0.622, sum=0.622 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.194, mean=0.194, max=0.194, sum=0.194 (1)", - "tab": "Efficiency", - "score": 0.19374941736755977 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)", - "tab": "General information", - "score": 1020.4135188866799 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": 
"WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.187, - "details": { - "description": "min=0.1, mean=0.187, max=0.23, sum=0.937 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.367, mean=0.394, max=0.409, sum=1.968 (5)", - "tab": "Efficiency", - "score": 0.39351808213963385 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=169.901, mean=193.043, max=213.185, sum=965.213 (5)", - "tab": "General information", - "score": 193.04258583116683 - }, - "WMT 2014 - # output tokens": { - "description": "min=21.983, mean=25.038, max=26.352, sum=125.192 (5)", - "tab": "General information", - "score": 25.038384118366725 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json b/data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json deleted file mode 100644 index 7ff111f74..000000000 --- a/data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4-0613/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 0613", - "id": "openai/gpt-4-0613", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.867, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5158801498127341 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.976, mean=0.976, max=0.976, sum=0.976 (1)", - "tab": "Efficiency", - "score": 0.9758186582108619 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3522.67, mean=3522.67, max=3522.67, sum=3522.67 (1)", - "tab": "General information", - "score": 3522.6704225352114 - }, - "NarrativeQA - # output tokens": { - "description": "min=8.515, mean=8.515, max=8.515, sum=8.515 (1)", - "tab": "General information", - "score": 8.51549295774648 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.457, - "details": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.908, mean=0.908, max=0.908, sum=0.908 (1)", - "tab": "Efficiency", - "score": 0.9083020164966583 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.512, mean=0.512, max=0.512, sum=0.512 (1)", - "tab": "Efficiency", - "score": 0.5116857671737671 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)", - "tab": "General information", - "score": 4.964 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1717.847, mean=1717.847, max=1717.847, sum=1717.847 (1)", - "tab": "General information", - "score": 1717.847 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.055, mean=8.055, max=8.055, sum=8.055 (1)", - "tab": "General information", - "score": 8.055 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=173.127, mean=173.127, max=173.127, sum=173.127 (1)", - "tab": "General information", - "score": 173.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.832, mean=3.832, max=3.832, sum=3.832 (1)", - "tab": "General information", - "score": 3.832 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=0.96 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.401 (1)", - "tab": "Efficiency", - "score": 0.40061268854141235 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.782, mean=242.782, max=242.782, sum=242.782 (1)", - "tab": "General information", - "score": 242.782 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.55, mean=0.735, max=0.95, sum=3.674 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.364, mean=0.391, max=0.434, sum=1.954 (5)", - "tab": "Efficiency", - "score": 0.39080846048656265 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)", - "tab": "General information", - "score": 460.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 
- } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.673, mean=0.802, max=0.948, sum=5.617 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.95, mean=3.472, max=4.247, sum=24.303 (7)", - "tab": "Efficiency", - "score": 3.4718795228507955 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)", - "tab": "General information", - "score": 1323.910874184069 - }, - "MATH - # output tokens": { - "description": "min=59.674, mean=73.257, max=81.1, sum=512.799 (7)", - "tab": "General information", - "score": 73.25695858608955 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=0.932 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.948, mean=4.948, max=4.948, sum=4.948 (1)", - "tab": "Efficiency", - "score": 4.947624314308166 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)", - "tab": "General information", - "score": 1020.035 - }, - "GSM8K - # output tokens": { - "description": "min=111.209, mean=111.209, max=111.209, sum=111.209 (1)", - "tab": "General information", - "score": 111.209 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "description": "min=0.452, mean=0.713, max=0.905, sum=3.564 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.46, mean=0.558, max=0.886, sum=2.791 (5)", - "tab": "Efficiency", - "score": 0.5582764348578453 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=253.442, mean=1568.687, max=6350.008, sum=7843.435 (5)", - "tab": "General information", - "score": 1568.6870529886412 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.34, max=2.063, sum=6.698 (5)", - "tab": "General information", - "score": 1.3396070557866055 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.414 (1)", - "tab": "Efficiency", - "score": 0.4136932588239787 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)", - "tab": "General information", - "score": 1020.4135188866799 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.211, - "details": { - "description": "min=0.149, mean=0.211, max=0.256, sum=1.053 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.448, mean=1.58, max=1.724, sum=7.899 (5)", - "tab": "Efficiency", - "score": 1.5797039644192494 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=169.901, mean=193.043, max=213.185, sum=965.213 (5)", - "tab": "General information", - "score": 193.04258583116683 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.767, mean=25.424, max=26.121, sum=127.122 (5)", - "tab": "General information", - "score": 25.424382072946933 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json b/data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json deleted file mode 100644 index 060ab8fb5..000000000 --- a/data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4-1106-preview/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 Turbo 1106 preview", - "id": "openai/gpt-4-1106-preview", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.3935580524344569 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - 
"description": "min=0.727, mean=0.727, max=0.727, sum=0.727 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.068, mean=1.068, max=1.068, sum=1.068 (1)", - "tab": "Efficiency", - "score": 1.068114177945634 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3522.67, mean=3522.67, max=3522.67, sum=3522.67 (1)", - "tab": "General information", - "score": 3522.6704225352114 - }, - "NarrativeQA - # output tokens": { - "description": "min=9.885, mean=9.885, max=9.885, sum=9.885 (1)", - "tab": "General information", - "score": 9.88450704225352 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435, - "details": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.435 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.867, mean=0.867, max=0.867, sum=0.867 (1)", - "tab": "Efficiency", - "score": 0.8667134034633637 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.131, mean=1.131, max=1.131, sum=1.131 (1)", - "tab": "Efficiency", - "score": 1.1312835423946381 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1762.593, mean=1762.593, max=1762.593, sum=1762.593 (1)", - "tab": "General information", - "score": 1762.593 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.753, mean=8.753, max=8.753, sum=8.753 (1)", - "tab": "General information", - "score": 8.753 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=173.127, mean=173.127, max=173.127, sum=173.127 (1)", - "tab": 
"General information", - "score": 173.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=14.157, mean=14.157, max=14.157, sum=14.157 (1)", - "tab": "General information", - "score": 14.157 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=0.95 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.512, mean=0.512, max=0.512, sum=0.512 (1)", - "tab": "Efficiency", - "score": 0.5122070140838623 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.782, mean=242.782, max=242.782, sum=242.782 (1)", - "tab": "General information", - "score": 242.782 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.699, - "details": { - "description": "min=0.47, mean=0.699, max=0.96, sum=3.495 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.397, mean=0.447, max=0.515, sum=2.236 (5)", - "tab": "Efficiency", - "score": 0.4471675806380155 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)", - "tab": "General information", - "score": 460.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, 
- { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.711, mean=0.857, max=0.97, sum=5.998 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=10.989, mean=12.704, max=15.09, sum=88.928 (7)", - "tab": "Efficiency", - "score": 12.704059314714486 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)", - "tab": "General information", - "score": 1323.910874184069 - }, - "MATH - # output tokens": { - "description": "min=122.465, mean=161.876, max=186.673, sum=1133.133 (7)", - "tab": "General information", - "score": 161.87607288445722 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.668, - "details": { - "description": "min=0.668, mean=0.668, max=0.668, sum=0.668 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.738, mean=5.738, max=5.738, sum=5.738 (1)", - "tab": "Efficiency", - "score": 5.738402992963791 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)", - "tab": "General information", - "score": 1020.035 - }, - "GSM8K - # output tokens": { - "description": "min=98.073, mean=98.073, max=98.073, sum=98.073 (1)", - "tab": "General information", - "score": 98.073 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] 
- }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.626, - "details": { - "description": "min=0.368, mean=0.626, max=0.989, sum=3.13 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.445, mean=0.603, max=0.98, sum=3.017 (5)", - "tab": "Efficiency", - "score": 0.6033123332286346 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=253.442, mean=1570.163, max=6357.388, sum=7850.815 (5)", - "tab": "General information", - "score": 1570.162971355988 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.458, max=2.695, sum=7.291 (5)", - "tab": "General information", - "score": 1.458208948802524 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)", - "tab": "Efficiency", - "score": 0.3924491192190121 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)", - "tab": "General information", - "score": 1020.4135188866799 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.205, - "details": { - "description": "min=0.156, mean=0.205, max=0.241, sum=1.023 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed 
inference time (s)": { - "description": "min=1.797, mean=2.1, max=2.349, sum=10.502 (5)", - "tab": "Efficiency", - "score": 2.1004491326059744 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=169.901, mean=193.043, max=213.185, sum=965.213 (5)", - "tab": "General information", - "score": 193.04258583116683 - }, - "WMT 2014 - # output tokens": { - "description": "min=26.229, mean=26.996, max=28.59, sum=134.98 (5)", - "tab": "General information", - "score": 26.995945480960394 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json b/data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json deleted file mode 100644 index dae83b652..000000000 --- a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 Turbo 2024-04-09", - "id": "openai/gpt-4-turbo-2024-04-09", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.4568414481897628 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761, - "details": { - "description": "min=0.761, mean=0.761, max=0.761, sum=0.761 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.804, mean=0.804, max=0.804, sum=0.804 (1)", - "tab": "Efficiency", - "score": 
0.8043310716118611 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3495.67, mean=3495.67, max=3495.67, sum=3495.67 (1)", - "tab": "General information", - "score": 3495.6704225352114 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.037, mean=6.037, max=6.037, sum=6.037 (1)", - "tab": "General information", - "score": 6.0366197183098596 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482, - "details": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.482 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.712, mean=0.712, max=0.712, sum=0.712 (1)", - "tab": "Efficiency", - "score": 0.7120162718296051 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)", - "tab": "Efficiency", - "score": 0.6052222681045533 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1728.593, mean=1728.593, max=1728.593, sum=1728.593 (1)", - "tab": "General information", - "score": 1728.593 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.902, mean=5.902, max=5.902, sum=5.902 (1)", - "tab": "General information", - "score": 5.902 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=139.127, mean=139.127, max=139.127, sum=139.127 (1)", - "tab": "General information", - "score": 139.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.263, mean=5.263, max=5.263, sum=5.263 (1)", - "tab": "General information", - "score": 5.263 - } - } - }, - 
"generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.97, - "details": { - "description": "min=0.97, mean=0.97, max=0.97, sum=0.97 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)", - "tab": "Efficiency", - "score": 0.4376141686439514 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.782, mean=249.782, max=249.782, sum=249.782 (1)", - "tab": "General information", - "score": 249.782 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.53, mean=0.711, max=0.96, sum=3.555 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.53, mean=0.55, max=0.572, sum=2.749 (5)", - "tab": "Efficiency", - "score": 0.5498773384847139 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.44, mean=467.72, max=614.43, sum=2338.6 (5)", - "tab": "General information", - "score": 467.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - 
"metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.684, mean=0.833, max=0.97, sum=5.83 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=4.92, mean=6.678, max=8.338, sum=46.748 (7)", - "tab": "Efficiency", - "score": 6.678270916932833 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.911, max=2197.577, sum=8840.376 (7)", - "tab": "General information", - "score": 1262.9108741840687 - }, - "MATH - # output tokens": { - "description": "min=135.163, mean=189.561, max=219.316, sum=1326.926 (7)", - "tab": "General information", - "score": 189.56082409362702 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=0.824 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=6.915, mean=6.915, max=6.915, sum=6.915 (1)", - "tab": "Efficiency", - "score": 6.91472976398468 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.035, mean=959.035, max=959.035, sum=959.035 (1)", - "tab": "General information", - "score": 959.035 - }, - "GSM8K - # output tokens": { - "description": "min=141.712, mean=141.712, max=141.712, sum=141.712 (1)", - "tab": "General information", - "score": 141.712 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": 
{ - "description": "min=0.417, mean=0.727, max=0.947, sum=3.637 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.514, mean=0.608, max=0.803, sum=3.041 (5)", - "tab": "Efficiency", - "score": 0.6081070231398068 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=207.442, mean=1524.163, max=6311.388, sum=7620.815 (5)", - "tab": "General information", - "score": 1524.162971355988 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.325, max=2.032, sum=6.626 (5)", - "tab": "General information", - "score": 1.3251168793919403 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.455 (1)", - "tab": "Efficiency", - "score": 0.4549296101329341 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1027.414, mean=1027.414, max=1027.414, sum=1027.414 (1)", - "tab": "General information", - "score": 1027.4135188866799 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.218, - "details": { - "description": "min=0.169, mean=0.218, max=0.264, sum=1.088 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.131, mean=1.185, max=1.222, sum=5.925 (5)", - "tab": "Efficiency", - "score": 1.1850423664020953 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 
(5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=124.901, mean=148.043, max=168.185, sum=740.213 (5)", - "tab": "General information", - "score": 148.04258583116683 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.744, mean=25.264, max=25.938, sum=126.322 (5)", - "tab": "General information", - "score": 25.26444840571953 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json b/data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json deleted file mode 100644 index c23053f17..000000000 --- a/data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4o-2024-05-13/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o 2024-05-13", - "id": "openai/gpt-4o-2024-05-13", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6270536828963795 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=0.804 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.556, mean=0.556, max=0.556, sum=0.556 (1)", - "tab": "Efficiency", - "score": 0.5561933571184186 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General 
information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3461.668, mean=3461.668, max=3461.668, sum=3461.668 (1)", - "tab": "General information", - "score": 3461.667605633803 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.62, mean=4.62, max=4.62, sum=4.62 (1)", - "tab": "General information", - "score": 4.619718309859155 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.501, - "details": { - "description": "min=0.501, mean=0.501, max=0.501, sum=0.501 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "Efficiency", - "score": 0.5071200861930847 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.461, mean=0.461, max=0.461, sum=0.461 (1)", - "tab": "Efficiency", - "score": 0.46105142664909365 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1724.02, mean=1724.02, max=1724.02, sum=1724.02 (1)", - "tab": "General information", - "score": 1724.02 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.41, mean=5.41, max=5.41, sum=5.41 (1)", - "tab": "General information", - "score": 5.41 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=139.953, mean=139.953, max=139.953, sum=139.953 (1)", - "tab": "General information", - "score": 139.953 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.245, mean=4.245, max=4.245, sum=4.245 (1)", - "tab": "General information", - "score": 4.245 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.966, - "details": { - "description": "min=0.966, mean=0.966, max=0.966, sum=0.966 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.402 (1)", - "tab": "Efficiency", - "score": 0.4019911346435547 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=245.486, mean=245.486, max=245.486, sum=245.486 (1)", - "tab": "General information", - "score": 245.486 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.61, mean=0.748, max=0.95, sum=3.742 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.353, mean=0.39, max=0.416, sum=1.952 (5)", - "tab": "Efficiency", - "score": 0.3904274333485386 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.42, mean=466.992, max=613.228, sum=2334.958 (5)", - "tab": "General information", - "score": 466.9916140350877 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.829, - "details": { - "description": "min=0.632, mean=0.829, max=0.977, sum=5.802 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.334, mean=4.358, max=4.85, sum=30.503 (7)", - "tab": "Efficiency", - "score": 4.357550465458739 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=888.43, mean=1273.32, max=2222.25, sum=8913.243 (7)", - "tab": "General information", - "score": 1273.320452019534 - }, - "MATH - # output tokens": { - "description": "min=187.942, mean=245.482, max=284.788, sum=1718.377 (7)", - "tab": "General information", - "score": 245.4823665454633 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=0.905 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.227, mean=4.227, max=4.227, sum=4.227 (1)", - "tab": "Efficiency", - "score": 4.227096201658249 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=952.617, mean=952.617, max=952.617, sum=952.617 (1)", - "tab": "General information", - "score": 952.617 - }, - "GSM8K - # output tokens": { - "description": "min=213.475, mean=213.475, max=213.475, sum=213.475 (1)", - "tab": "General information", - "score": 213.475 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.441, mean=0.733, max=0.989, sum=3.666 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.36, mean=0.431, max=0.568, sum=2.154 
(5)", - "tab": "Efficiency", - "score": 0.4307274274560104 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=208.179, mean=1512.795, max=6254.98, sum=7563.977 (5)", - "tab": "General information", - "score": 1512.7954037538377 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.249, max=2.021, sum=6.244 (5)", - "tab": "General information", - "score": 1.2488971748171518 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=0.857 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.407 (1)", - "tab": "Efficiency", - "score": 0.4072816490416024 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1009.05, mean=1009.05, max=1009.05, sum=1009.05 (1)", - "tab": "General information", - "score": 1009.0497017892644 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.231, - "details": { - "description": "min=0.176, mean=0.231, max=0.281, sum=1.154 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.775, mean=0.842, max=0.967, sum=4.212 (5)", - "tab": "Efficiency", - "score": 0.8424805298775759 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=79.529, mean=115.006, max=138.497, sum=575.028 (5)", - "tab": "General information", - "score": 115.00557042361216 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.62, mean=25.287, max=26.018, sum=126.434 (5)", - "tab": "General information", - "score": 25.286879683437835 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json b/data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json deleted file mode 100644 index f8d7c3614..000000000 --- a/data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4o-2024-08-06/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o 2024-08-06", - "id": "openai/gpt-4o-2024-08-06", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6728589263420724 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795, - "details": { - "description": "min=0.795, mean=0.795, max=0.795, sum=0.795 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.562, mean=0.562, max=0.562, sum=0.562 (1)", - "tab": "Efficiency", - "score": 0.5615828097706109 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - 
"description": "min=3451.668, mean=3451.668, max=3451.668, sum=3451.668 (1)", - "tab": "General information", - "score": 3451.667605633803 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.076, mean=5.076, max=5.076, sum=5.076 (1)", - "tab": "General information", - "score": 5.076056338028169 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.496, - "details": { - "description": "min=0.496, mean=0.496, max=0.496, sum=0.496 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)", - "tab": "Efficiency", - "score": 0.6156781461238862 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.418 (1)", - "tab": "Efficiency", - "score": 0.4182390425205231 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1714.02, mean=1714.02, max=1714.02, sum=1714.02 (1)", - "tab": "General information", - "score": 1714.02 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.504, mean=6.504, max=6.504, sum=6.504 (1)", - "tab": "General information", - "score": 6.504 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.953, mean=129.953, max=129.953, sum=129.953 (1)", - "tab": "General information", - "score": 129.953 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.032, mean=5.032, max=5.032, sum=5.032 (1)", - "tab": "General information", - "score": 5.032 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.968, - "details": { - "description": "min=0.968, mean=0.968, max=0.968, sum=0.968 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.401 (1)", - "tab": "Efficiency", - "score": 0.40116420984268186 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=245.486, mean=245.486, max=245.486, sum=245.486 (1)", - "tab": "General information", - "score": 245.486 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.738, - "details": { - "description": "min=0.58, mean=0.738, max=0.95, sum=3.691 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.335, mean=0.441, max=0.512, sum=2.204 (5)", - "tab": "Efficiency", - "score": 0.4407063991228739 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.42, mean=466.992, max=613.228, sum=2334.958 (5)", - "tab": "General information", - "score": 466.9916140350877 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.731, mean=0.853, max=0.956, sum=5.968 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.205, mean=4.321, max=6.062, sum=30.245 (7)", - "tab": "Efficiency", - "score": 
4.320655013573451 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=888.43, mean=1273.32, max=2222.25, sum=8913.243 (7)", - "tab": "General information", - "score": 1273.320452019534 - }, - "MATH - # output tokens": { - "description": "min=157.721, mean=210.124, max=243.135, sum=1470.869 (7)", - "tab": "General information", - "score": 210.1241379885811 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.909, - "details": { - "description": "min=0.909, mean=0.909, max=0.909, sum=0.909 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.937, mean=2.937, max=2.937, sum=2.937 (1)", - "tab": "Efficiency", - "score": 2.9373713800907133 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=952.617, mean=952.617, max=952.617, sum=952.617 (1)", - "tab": "General information", - "score": 952.617 - }, - "GSM8K - # output tokens": { - "description": "min=167.729, mean=167.729, max=167.729, sum=167.729 (1)", - "tab": "General information", - "score": 167.729 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.721, - "details": { - "description": "min=0.422, mean=0.721, max=0.979, sum=3.605 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.312, mean=0.38, max=0.526, sum=1.901 (5)", - "tab": "Efficiency", - "score": 0.38022537218958125 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, 
mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=198.179, mean=1502.795, max=6244.98, sum=7513.977 (5)", - "tab": "General information", - "score": 1502.7954037538377 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.298, max=2.021, sum=6.49 (5)", - "tab": "General information", - "score": 1.298021970457479 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=0.863 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.307 (1)", - "tab": "Efficiency", - "score": 0.30731069923158194 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1009.05, mean=1009.05, max=1009.05, sum=1009.05 (1)", - "tab": "General information", - "score": 1009.0497017892644 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.225, - "details": { - "description": "min=0.18, mean=0.225, max=0.267, sum=1.125 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.725, mean=0.768, max=0.804, sum=3.841 (5)", - "tab": "Efficiency", - "score": 0.7681678841877538 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=69.529, mean=105.006, max=128.497, sum=525.028 (5)", - "tab": "General information", - 
"score": 105.00557042361216 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.809, mean=25.367, max=25.988, sum=126.835 (5)", - "tab": "General information", - "score": 25.366906254779018 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json b/data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json deleted file mode 100644 index 3869cb246..000000000 --- a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4o-mini-2024-07-18/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o mini 2024-07-18", - "id": "openai/gpt-4o-mini-2024-07-18", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.701, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7796004993757802 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.473, mean=0.473, max=0.473, sum=0.473 (1)", - "tab": "Efficiency", - "score": 0.47311924612018424 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3451.668, mean=3451.668, max=3451.668, sum=3451.668 (1)", - "tab": "General information", - "score": 3451.667605633803 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.482, mean=4.482, max=4.482, 
sum=4.482 (1)", - "tab": "General information", - "score": 4.48169014084507 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386, - "details": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.386 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.406 (1)", - "tab": "Efficiency", - "score": 0.40617332768440245 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.374 (1)", - "tab": "Efficiency", - "score": 0.3740478873252869 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1714.02, mean=1714.02, max=1714.02, sum=1714.02 (1)", - "tab": "General information", - "score": 1714.02 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.175, mean=5.175, max=5.175, sum=5.175 (1)", - "tab": "General information", - "score": 5.175 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.953, mean=129.953, max=129.953, sum=129.953 (1)", - "tab": "General information", - "score": 129.953 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.847, mean=4.847, max=4.847, sum=4.847 (1)", - "tab": "General information", - "score": 4.847 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=0.92 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": 
"min=0.331, mean=0.331, max=0.331, sum=0.331 (1)", - "tab": "Efficiency", - "score": 0.3309546322822571 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=245.486, mean=245.486, max=245.486, sum=245.486 (1)", - "tab": "General information", - "score": 245.486 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.668, - "details": { - "description": "min=0.42, mean=0.668, max=0.91, sum=3.339 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.292, mean=0.299, max=0.309, sum=1.497 (5)", - "tab": "Efficiency", - "score": 0.2993013315033494 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.42, mean=466.992, max=613.228, sum=2334.958 (5)", - "tab": "General information", - "score": 466.9916140350877 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.605, mean=0.802, max=0.97, sum=5.611 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.312, mean=3.175, max=3.696, sum=22.228 (7)", - "tab": "Efficiency", - "score": 3.175392215033706 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 
(7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=888.43, mean=1273.32, max=2222.25, sum=8913.243 (7)", - "tab": "General information", - "score": 1273.320452019534 - }, - "MATH - # output tokens": { - "description": "min=167.884, mean=238.235, max=276.058, sum=1667.647 (7)", - "tab": "General information", - "score": 238.23525019565412 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=0.843 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.519, mean=2.519, max=2.519, sum=2.519 (1)", - "tab": "Efficiency", - "score": 2.5191967821121217 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=952.617, mean=952.617, max=952.617, sum=952.617 (1)", - "tab": "General information", - "score": 952.617 - }, - "GSM8K - # output tokens": { - "description": "min=215.465, mean=215.465, max=215.465, sum=215.465 (1)", - "tab": "General information", - "score": 215.465 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653, - "details": { - "description": "min=0.414, mean=0.653, max=0.937, sum=3.263 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.337, mean=0.382, max=0.503, sum=1.91 (5)", - "tab": "Efficiency", - "score": 0.38199841220513264 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt 
tokens": { - "description": "min=198.179, mean=1502.795, max=6244.98, sum=7513.977 (5)", - "tab": "General information", - "score": 1502.7954037538377 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.293, max=2.253, sum=6.465 (5)", - "tab": "General information", - "score": 1.2930331277785745 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.748, mean=0.748, max=0.748, sum=0.748 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)", - "tab": "Efficiency", - "score": 0.3318999989132284 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1009.05, mean=1009.05, max=1009.05, sum=1009.05 (1)", - "tab": "General information", - "score": 1009.0497017892644 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.206, - "details": { - "description": "min=0.153, mean=0.206, max=0.254, sum=1.032 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.557, mean=0.583, max=0.598, sum=2.917 (5)", - "tab": "Efficiency", - "score": 0.5833699647787834 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=69.529, mean=105.006, max=128.497, sum=525.028 (5)", - "tab": "General information", - "score": 105.00557042361216 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.748, mean=25.504, max=26.235, sum=127.522 (5)", - "tab": "General information", - "score": 25.504310196513227 - } - } - }, - "generation_config": { - 
"additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json b/data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json deleted file mode 100644 index f3294dd85..000000000 --- a/data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_text-davinci-002/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-3.5 text-davinci-002", - "id": "openai/text-davinci-002", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6860299625468165 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719, - "details": { - "description": "min=0.719, mean=0.719, max=0.719, sum=0.719 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.226, mean=1.226, max=1.226, sum=1.226 (1)", - "tab": "Efficiency", - "score": 1.2258358747186795 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.955, mean=4.955, max=4.955, sum=4.955 (1)", - "tab": "General information", - "score": 4.954929577464789 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3479.563, mean=3479.563, max=3479.563, sum=3479.563 (1)", - "tab": "General information", - "score": 3479.56338028169 - }, - "NarrativeQA - # output tokens": { - "description": "min=8.448, mean=8.448, max=8.448, sum=8.448 (1)", - "tab": "General information", - "score": 8.447887323943663 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - 
"dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394, - "details": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.394 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.886, mean=0.886, max=0.886, sum=0.886 (1)", - "tab": "Efficiency", - "score": 0.8863302536010742 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.683, mean=0.683, max=0.683, sum=0.683 (1)", - "tab": "Efficiency", - "score": 0.6834516413211823 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.885, mean=4.885, max=4.885, sum=4.885 (1)", - "tab": "General information", - "score": 4.885 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "General information", - "score": 0.02 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1617.729, mean=1617.729, max=1617.729, sum=1617.729 (1)", - "tab": "General information", - "score": 1617.729 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.632, mean=6.632, max=6.632, sum=6.632 (1)", - "tab": "General information", - "score": 6.632 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.254, mean=116.254, max=116.254, sum=116.254 (1)", - "tab": "General information", - "score": 116.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.116, mean=4.116, max=4.116, sum=4.116 (1)", - "tab": "General information", - "score": 4.116 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=0.796 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.174 (1)", - "tab": "Efficiency", - "score": 0.1743956871032715 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 
(1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.21, mean=254.21, max=254.21, sum=254.21 (1)", - "tab": "General information", - "score": 254.21 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.568, - "details": { - "description": "min=0.26, mean=0.568, max=0.84, sum=2.841 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.175, mean=0.177, max=0.181, sum=0.887 (5)", - "tab": "Efficiency", - "score": 0.17730724048614502 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=2361.37 (5)", - "tab": "General information", - "score": 472.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428, - "details": { - "description": "min=0.288, mean=0.428, max=0.548, sum=2.997 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.257, mean=5.188, max=9.459, sum=36.316 (7)", - "tab": "Efficiency", - "score": 5.188020693120383 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - 
# prompt tokens": { - "description": "min=906.556, mean=1375.735, max=2449.942, sum=9630.147 (7)", - "tab": "General information", - "score": 1375.7353092779654 - }, - "MATH - # output tokens": { - "description": "min=76.721, mean=136.822, max=259.175, sum=957.754 (7)", - "tab": "General information", - "score": 136.82193804427587 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.479, - "details": { - "description": "min=0.479, mean=0.479, max=0.479, sum=0.479 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.762, mean=3.762, max=3.762, sum=3.762 (1)", - "tab": "Efficiency", - "score": 3.762208682537079 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.869, mean=938.869, max=938.869, sum=938.869 (1)", - "tab": "General information", - "score": 938.869 - }, - "GSM8K - # output tokens": { - "description": "min=90.543, mean=90.543, max=90.543, sum=90.543 (1)", - "tab": "General information", - "score": 90.543 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.326, mean=0.58, max=0.916, sum=2.901 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.167, mean=0.223, max=0.403, sum=1.115 (5)", - "tab": "Efficiency", - "score": 0.2229105462585103 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2.053, mean=4.211, max=5, sum=21.053 (5)", - "tab": "General information", - "score": 4.210612244897959 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.632, mean=907.387, max=3225.32, sum=4536.936 (5)", - "tab": "General information", - "score": 907.3872120499769 - }, - "LegalBench - # output tokens": { - 
"description": "min=0.996, mean=1.099, max=1.238, sum=5.496 (5)", - "tab": "General information", - "score": 1.0991972687655298 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525, - "details": { - "description": "min=0.525, mean=0.525, max=0.525, sum=0.525 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.206, mean=0.206, max=0.206, sum=0.206 (1)", - "tab": "Efficiency", - "score": 0.20554606720183052 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1038.861, mean=1038.861, max=1038.861, sum=1038.861 (1)", - "tab": "General information", - "score": 1038.8608349900596 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.174, - "details": { - "description": "min=0.077, mean=0.174, max=0.212, sum=0.872 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.446, mean=0.467, max=0.478, sum=2.336 (5)", - "tab": "Efficiency", - "score": 0.4672719452194591 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=136.93, mean=181.694, max=241.662, sum=908.469 (5)", - "tab": "General information", - "score": 181.69386660804403 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.557, mean=24.862, max=25.636, sum=124.309 (5)", - "tab": "General information", - "score": 24.86174013610644 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json b/data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json deleted file mode 100644 index 93f27df2b..000000000 --- a/data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_text-davinci-003/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-3.5 text-davinci-003", - "id": "openai/text-davinci-003", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5880524344569289 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=0.731 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.813, mean=1.813, max=1.813, sum=1.813 (1)", - "tab": "Efficiency", - "score": 1.812959625351597 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.955, mean=4.955, max=4.955, sum=4.955 (1)", - "tab": "General information", - "score": 4.954929577464789 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3479.563, mean=3479.563, max=3479.563, sum=3479.563 (1)", - "tab": "General information", - "score": 3479.56338028169 - }, - "NarrativeQA - # output tokens": { - "description": "min=9.732, mean=9.732, max=9.732, sum=9.732 (1)", - "tab": "General information", - "score": 9.732394366197184 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.413, - "details": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.413 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.187, mean=1.187, max=1.187, sum=1.187 (1)", - "tab": "Efficiency", - "score": 1.1872664585113526 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.996, mean=0.996, max=0.996, sum=0.996 (1)", - "tab": "Efficiency", - "score": 0.9963206455707551 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.885, mean=4.885, max=4.885, sum=4.885 (1)", - "tab": "General information", - "score": 4.885 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "General information", - "score": 0.02 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1617.729, mean=1617.729, max=1617.729, sum=1617.729 (1)", - "tab": "General information", - "score": 1617.729 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.8, mean=6.8, max=6.8, sum=6.8 (1)", - "tab": "General information", - "score": 6.8 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.254, mean=116.254, max=116.254, sum=116.254 (1)", - "tab": "General information", - "score": 116.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=7.074, mean=7.074, max=7.074, sum=7.074 (1)", - "tab": "General information", - "score": 7.074 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.204 (1)", - "tab": "Efficiency", - "score": 0.20436767482757567 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - 
"description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.21, mean=254.21, max=254.21, sum=254.21 (1)", - "tab": "General information", - "score": 254.21 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555, - "details": { - "description": "min=0.3, mean=0.555, max=0.83, sum=2.774 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.199, mean=0.2, max=0.203, sum=1.0 (5)", - "tab": "Efficiency", - "score": 0.2000334782098469 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=2361.37 (5)", - "tab": "General information", - "score": 472.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449, - "details": { - "description": "min=0.3, mean=0.449, max=0.548, sum=3.146 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.871, mean=4.334, max=5.181, sum=30.338 (7)", - "tab": "Efficiency", - "score": 4.333955165715466 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=906.556, mean=1375.735, max=2449.942, sum=9630.147 
(7)", - "tab": "General information", - "score": 1375.7353092779654 - }, - "MATH - # output tokens": { - "description": "min=61.333, mean=74.938, max=97.115, sum=524.566 (7)", - "tab": "General information", - "score": 74.93793702104595 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615, - "details": { - "description": "min=0.615, mean=0.615, max=0.615, sum=0.615 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.199, mean=5.199, max=5.199, sum=5.199 (1)", - "tab": "Efficiency", - "score": 5.199419307470322 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.869, mean=938.869, max=938.869, sum=938.869 (1)", - "tab": "General information", - "score": 938.869 - }, - "GSM8K - # output tokens": { - "description": "min=93.717, mean=93.717, max=93.717, sum=93.717 (1)", - "tab": "General information", - "score": 93.717 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.622, - "details": { - "description": "min=0.324, mean=0.622, max=0.947, sum=3.11 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.189, mean=0.259, max=0.474, sum=1.297 (5)", - "tab": "Efficiency", - "score": 0.2594051892596125 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2.053, mean=4.211, max=5, sum=21.053 (5)", - "tab": "General information", - "score": 4.210612244897959 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.632, mean=907.387, max=3225.32, sum=4536.936 (5)", - "tab": "General information", - "score": 907.3872120499769 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.168, max=1.443, sum=5.838 (5)", - "tab": "General information", - 
"score": 1.1675708408818857 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531, - "details": { - "description": "min=0.531, mean=0.531, max=0.531, sum=0.531 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.228, mean=0.228, max=0.228, sum=0.228 (1)", - "tab": "Efficiency", - "score": 0.22811962975185388 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1038.861, mean=1038.861, max=1038.861, sum=1038.861 (1)", - "tab": "General information", - "score": 1038.8608349900596 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.191, - "details": { - "description": "min=0.094, mean=0.191, max=0.227, sum=0.956 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.756, mean=0.8, max=0.822, sum=4.0 (5)", - "tab": "Efficiency", - "score": 0.800053899013968 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=136.93, mean=181.694, max=241.662, sum=908.469 (5)", - "tab": "General information", - "score": 181.69386660804403 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.563, mean=25.117, max=25.652, sum=125.587 (5)", - "tab": "General information", - "score": 25.117336366416882 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json 
b/data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json deleted file mode 100644 index 800f57826..000000000 --- a/data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-110b-chat/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 Chat 110B", - "id": "qwen/qwen1.5-110b-chat", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6592634207240948 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.721, - "details": { - "description": "min=0.721, mean=0.721, max=0.721, sum=0.721 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.984, mean=0.984, max=0.984, sum=0.984 (1)", - "tab": "Efficiency", - "score": 0.9843533623386437 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3502.913, mean=3502.913, max=3502.913, sum=3502.913 (1)", - "tab": "General information", - "score": 3502.912676056338 - }, - "NarrativeQA - # output tokens": { - "description": "min=10.29, mean=10.29, max=10.29, sum=10.29 (1)", - "tab": "General information", - "score": 10.290140845070422 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions 
(closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35, - "details": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.35 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.647, mean=0.647, max=0.647, sum=0.647 (1)", - "tab": "Efficiency", - "score": 0.6468759918212891 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.465, mean=0.465, max=0.465, sum=0.465 (1)", - "tab": "Efficiency", - "score": 0.46513359355926515 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2017.955, mean=2017.955, max=2017.955, sum=2017.955 (1)", - "tab": "General information", - "score": 2017.955 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.509, mean=8.509, max=8.509, sum=8.509 (1)", - "tab": "General information", - "score": 8.509 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=146.262, mean=146.262, max=146.262, sum=146.262 (1)", - "tab": "General information", - "score": 146.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=8.99, mean=8.99, max=8.99, sum=8.99 (1)", - "tab": "General information", - "score": 8.99 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=0.922 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.244, mean=0.244, max=0.244, sum=0.244 (1)", - "tab": "Efficiency", - "score": 0.24445231294631958 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - 
"OpenbookQA - # prompt tokens": { - "description": "min=249.846, mean=249.846, max=249.846, sum=249.846 (1)", - "tab": "General information", - "score": 249.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.57, mean=0.704, max=0.87, sum=3.52 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.229, mean=0.248, max=0.277, sum=1.241 (5)", - "tab": "Efficiency", - "score": 0.2482092388136345 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=378.19, mean=477.836, max=627.939, sum=2389.179 (5)", - "tab": "General information", - "score": 477.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.568, - "details": { - "description": "min=0.211, mean=0.568, max=0.769, sum=3.974 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.984, mean=3.989, max=5.0, sum=27.92 (7)", - "tab": "Efficiency", - "score": 3.9885726889236994 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=104.174, mean=156.855, max=202.368, sum=1097.984 (7)", - "tab": "General information", - "score": 
156.85484968134907 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.537, mean=4.537, max=4.537, sum=4.537 (1)", - "tab": "Efficiency", - "score": 4.537143226146698 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=175.784, mean=175.784, max=175.784, sum=175.784 (1)", - "tab": "General information", - "score": 175.784 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624, - "details": { - "description": "min=0.387, mean=0.624, max=0.958, sum=3.121 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.271, mean=0.499, max=1.328, sum=2.493 (5)", - "tab": "Efficiency", - "score": 0.4986402694478536 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=207.453, mean=1557.088, max=6445.714, sum=7785.442 (5)", - "tab": "General information", - "score": 1557.0883229968654 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.314, max=2.958, sum=11.571 (5)", - "tab": "General information", - "score": 2.3142312634447153 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - 
"international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=0.64 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.288 (1)", - "tab": "Efficiency", - "score": 0.2881786700034473 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1052.485, mean=1052.485, max=1052.485, sum=1052.485 (1)", - "tab": "General information", - "score": 1052.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.192, - "details": { - "description": "min=0.133, mean=0.192, max=0.232, sum=0.962 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.839, mean=0.882, max=0.896, sum=4.411 (5)", - "tab": "Efficiency", - "score": 0.882270189100544 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=124.855, mean=142.657, max=158.373, sum=713.283 (5)", - "tab": "General information", - "score": 142.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=25.499, mean=26.949, max=27.529, sum=134.744 (5)", - "tab": "General information", - "score": 26.94872734745374 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json b/data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json deleted file mode 100644 index c8749e5f5..000000000 --- a/data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json +++ /dev/null @@ -1,641 +0,0 @@ 
-{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-14b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 14B", - "id": "qwen/qwen1.5-14b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6941198501872659 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=0.711 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.987, mean=0.987, max=0.987, sum=0.987 (1)", - "tab": "Efficiency", - "score": 0.986717187183004 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3468.913, mean=3468.913, max=3468.913, sum=3468.913 (1)", - "tab": "General information", - "score": 3468.912676056338 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3, - "details": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time 
(s)": { - "description": "min=0.679, mean=0.679, max=0.679, sum=0.679 (1)", - "tab": "Efficiency", - "score": 0.6790921592712402 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.373 (1)", - "tab": "Efficiency", - "score": 0.3734231026172638 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1990.955, mean=1990.955, max=1990.955, sum=1990.955 (1)", - "tab": "General information", - "score": 1990.955 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=119.262, mean=119.262, max=119.262, sum=119.262 (1)", - "tab": "General information", - "score": 119.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)", - "tab": "Efficiency", - "score": 0.2849515151977539 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.846, mean=242.846, max=242.846, sum=242.846 (1)", - "tab": "General information", - "score": 242.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.626, - "details": { - "description": "min=0.4, mean=0.626, max=0.87, sum=3.131 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.285, mean=0.31, max=0.335, sum=1.549 (5)", - "tab": "Efficiency", - "score": 0.30986739750075765 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.19, mean=470.836, max=620.939, sum=2354.179 (5)", - "tab": "General information", - "score": 470.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.6, mean=0.686, max=0.8, sum=4.8 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=4.789, mean=4.932, max=5.055, sum=34.522 (7)", - "tab": "Efficiency", - "score": 4.931704092498438 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - 
"dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=0.693 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.966, mean=1.966, max=1.966, sum=1.966 (1)", - "tab": "Efficiency", - "score": 1.965628466129303 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593, - "details": { - "description": "min=0.358, mean=0.593, max=0.853, sum=2.966 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.332, mean=0.544, max=1.352, sum=2.722 (5)", - "tab": "Efficiency", - "score": 0.5443530451858324 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=192.453, mean=1542.088, max=6430.714, sum=7710.442 (5)", - "tab": "General information", - "score": 1542.0883229968654 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.515, - "details": { - "description": "min=0.515, mean=0.515, max=0.515, sum=0.515 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.326 (1)", - "tab": "Efficiency", - "score": 0.3256318408025662 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1045.485, mean=1045.485, max=1045.485, sum=1045.485 (1)", - "tab": "General information", - "score": 1045.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.178, - "details": { - "description": "min=0.101, mean=0.178, max=0.23, sum=0.89 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.59, mean=0.606, max=0.617, sum=3.032 (5)", - "tab": "Efficiency", - "score": 0.606455911532908 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=108.855, mean=126.657, max=142.373, sum=633.283 (5)", - "tab": "General information", - "score": 126.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json b/data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json deleted file mode 100644 index 699c1515b..000000000 --- a/data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-32b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 32B", - "id": "qwen/qwen1.5-32b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - 
"source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.546, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.47831460674157306 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589, - "details": { - "description": "min=0.589, mean=0.589, max=0.589, sum=0.589 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.848, mean=1.848, max=1.848, sum=1.848 (1)", - "tab": "Efficiency", - "score": 1.847580643774758 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3468.913, mean=3468.913, max=3468.913, sum=3468.913 (1)", - "tab": "General information", - "score": 3468.912676056338 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.353, - "details": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.139, mean=1.139, max=1.139, sum=1.139 (1)", - "tab": "Efficiency", - "score": 1.1394575798511506 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)", - "tab": "Efficiency", - "score": 0.457463458776474 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions 
(open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1990.955, mean=1990.955, max=1990.955, sum=1990.955 (1)", - "tab": "General information", - "score": 1990.955 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=119.262, mean=119.262, max=119.262, sum=119.262 (1)", - "tab": "General information", - "score": 119.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=0.932 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.352 (1)", - "tab": "Efficiency", - "score": 0.3515647969245911 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.846, mean=242.846, max=242.846, sum=242.846 (1)", - "tab": "General information", - "score": 242.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.628, - "details": { - "description": "min=0.4, mean=0.628, max=0.91, sum=3.141 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.337, mean=0.345, max=0.367, sum=1.724 (5)", - "tab": "Efficiency", - "score": 0.34482146733267266 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.19, mean=470.836, max=620.939, sum=2354.179 (5)", - "tab": "General information", - "score": 470.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.5, mean=0.733, max=0.859, sum=5.132 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=8.668, mean=9.437, max=10.496, sum=66.058 (7)", - "tab": "Efficiency", - "score": 9.436887120006455 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=0.773 (1)", - "tab": "Accuracy", - "GSM8K 
- Observed inference time (s)": { - "description": "min=3.406, mean=3.406, max=3.406, sum=3.406 (1)", - "tab": "Efficiency", - "score": 3.405816124200821 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.636, - "details": { - "description": "min=0.417, mean=0.636, max=0.926, sum=3.179 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.371, mean=0.789, max=2.33, sum=3.947 (5)", - "tab": "Efficiency", - "score": 0.7894946821991368 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=192.453, mean=1542.088, max=6430.714, sum=7710.442 (5)", - "tab": "General information", - "score": 1542.0883229968654 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.452 (1)", - "tab": "Efficiency", - "score": 0.4515474046437925 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - 
"tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1045.485, mean=1045.485, max=1045.485, sum=1045.485 (1)", - "tab": "General information", - "score": 1045.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.193, - "details": { - "description": "min=0.129, mean=0.193, max=0.242, sum=0.967 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.902, mean=0.92, max=0.952, sum=4.6 (5)", - "tab": "Efficiency", - "score": 0.9200148107330449 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=108.855, mean=126.657, max=142.373, sum=633.283 (5)", - "tab": "General information", - "score": 126.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json b/data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json deleted file mode 100644 index 8b347b68d..000000000 --- a/data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-72b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 72B", - "id": "qwen/qwen1.5-72b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.608, - "details": { - "tab": "Accuracy", - 
"Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.3881398252184769 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.601, - "details": { - "description": "min=0.601, mean=0.601, max=0.601, sum=0.601 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.437, mean=2.437, max=2.437, sum=2.437 (1)", - "tab": "Efficiency", - "score": 2.4371175302586083 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.994, mean=4.994, max=4.994, sum=4.994 (1)", - "tab": "General information", - "score": 4.994366197183099 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3465.859, mean=3465.859, max=3465.859, sum=3465.859 (1)", - "tab": "General information", - "score": 3465.8591549295775 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.417, - "details": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.417 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.421, mean=1.421, max=1.421, sum=1.421 (1)", - "tab": "Efficiency", - "score": 1.4208379020690918 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.577, mean=0.577, max=0.577, sum=0.577 (1)", - "tab": "Efficiency", - "score": 0.5770996954441071 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.863, mean=4.863, max=4.863, sum=4.863 (1)", - "tab": "General information", - "score": 4.863 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.022, mean=0.022, max=0.022, sum=0.022 (1)", - "tab": "General information", - "score": 0.022 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1846.221, mean=1846.221, max=1846.221, sum=1846.221 (1)", - "tab": "General 
information", - "score": 1846.221 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=119.262, mean=119.262, max=119.262, sum=119.262 (1)", - "tab": "General information", - "score": 119.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=0.93 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.338 (1)", - "tab": "Efficiency", - "score": 0.3381467695236206 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.846, mean=242.846, max=242.846, sum=242.846 (1)", - "tab": "General information", - "score": 242.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.647, - "details": { - "description": "min=0.44, mean=0.647, max=0.94, sum=3.234 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.338, mean=0.364, max=0.396, sum=1.819 (5)", - "tab": "Efficiency", - "score": 0.3638015921659637 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - 
"MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.19, mean=470.836, max=620.939, sum=2354.179 (5)", - "tab": "General information", - "score": 470.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.6, mean=0.683, max=0.763, sum=4.784 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=10.776, mean=11.813, max=12.91, sum=82.688 (7)", - "tab": "Efficiency", - "score": 11.812623854443027 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.799, - "details": { - "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.587, mean=4.587, max=4.587, sum=4.587 (1)", - "tab": "Efficiency", - "score": 4.5866835827827455 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.425, mean=0.694, max=0.958, sum=3.469 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.426, mean=0.878, max=1.58, sum=4.392 (5)", - "tab": "Efficiency", - "score": 0.8783966223148776 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2.253, mean=4.251, max=5, sum=21.253 (5)", - "tab": "General information", - "score": 4.25061224489796 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=192.453, mean=940.377, max=3422.157, sum=4701.884 (5)", - "tab": "General information", - "score": 940.3768944254368 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.543, mean=0.543, max=0.543, sum=0.543 (1)", - "tab": "Efficiency", - "score": 0.5430597031329782 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1045.485, mean=1045.485, max=1045.485, sum=1045.485 (1)", - "tab": "General information", - "score": 1045.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.201, - "details": { - "description": "min=0.14, mean=0.201, max=0.255, sum=1.006 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.148, mean=1.187, max=1.205, sum=5.933 (5)", - "tab": "Efficiency", - "score": 1.1866255830765444 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=108.855, mean=126.657, max=142.373, sum=633.283 (5)", - "tab": "General information", - "score": 126.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json b/data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json deleted file mode 100644 index b1bc89d92..000000000 --- a/data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-7b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 7B", - "id": "qwen/qwen1.5-7b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.8087765293383271 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": 
[ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.448, - "details": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.448 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.855, mean=0.855, max=0.855, sum=0.855 (1)", - "tab": "Efficiency", - "score": 0.8547548650016248 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3468.913, mean=3468.913, max=3468.913, sum=3468.913 (1)", - "tab": "General information", - "score": 3468.912676056338 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.27 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.479, mean=0.479, max=0.479, sum=0.479 (1)", - "tab": "Efficiency", - "score": 0.4786673946380615 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.354 (1)", - "tab": "Efficiency", - "score": 0.354404949426651 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1990.955, mean=1990.955, max=1990.955, sum=1990.955 (1)", - "tab": "General information", - "score": 1990.955 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 
5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=119.262, mean=119.262, max=119.262, sum=119.262 (1)", - "tab": "General information", - "score": 119.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=0.806 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 (1)", - "tab": "Efficiency", - "score": 0.2806105532646179 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.846, mean=242.846, max=242.846, sum=242.846 (1)", - "tab": "General information", - "score": 242.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.569, - "details": { - "description": "min=0.39, mean=0.569, max=0.84, sum=2.847 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.281, mean=0.289, max=0.298, sum=1.447 (5)", - "tab": "Efficiency", - "score": 0.28946571837810053 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.19, mean=470.836, max=620.939, sum=2354.179 (5)", - "tab": "General information", - "score": 470.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 
(5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.462, mean=0.561, max=0.726, sum=3.928 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.593, mean=2.933, max=3.209, sum=20.53 (7)", - "tab": "Efficiency", - "score": 2.9328109453469335 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=0.6 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.381, mean=1.381, max=1.381, sum=1.381 (1)", - "tab": "Efficiency", - "score": 1.380831289768219 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523, - "details": { - "description": "min=0.253, mean=0.523, max=0.716, sum=2.614 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.298, mean=0.44, max=0.946, sum=2.2 (5)", - "tab": "Efficiency", - "score": 0.4400657887452306 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=192.453, mean=1542.088, max=6430.714, sum=7710.442 (5)", - "tab": "General information", - "score": 1542.0883229968654 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.479, - "details": { - "description": "min=0.479, mean=0.479, max=0.479, sum=0.479 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.298 (1)", - "tab": "Efficiency", - "score": 0.2983713296962306 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1045.485, mean=1045.485, max=1045.485, sum=1045.485 (1)", - "tab": "General information", - "score": 1045.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.153, - "details": { - "description": "min=0.082, mean=0.153, max=0.19, sum=0.767 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.461, mean=0.484, max=0.517, sum=2.421 (5)", - "tab": "Efficiency", - "score": 0.4841760334465496 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=108.855, mean=126.657, max=142.373, sum=633.283 (5)", - "tab": "General information", - "score": 126.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json b/data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json deleted file mode 100644 index 58edcde03..000000000 --- a/data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen2-72b-instruct/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2 Instruct 72B", - "id": "qwen/qwen2-72b-instruct", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.592421972534332 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=0.727 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { 
- "description": "min=1.19, mean=1.19, max=1.19, sum=1.19 (1)", - "tab": "Efficiency", - "score": 1.1896146727279877 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3502.913, mean=3502.913, max=3502.913, sum=3502.913 (1)", - "tab": "General information", - "score": 3502.912676056338 - }, - "NarrativeQA - # output tokens": { - "description": "min=11.642, mean=11.642, max=11.642, sum=11.642 (1)", - "tab": "General information", - "score": 11.64225352112676 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.868, mean=0.868, max=0.868, sum=0.868 (1)", - "tab": "Efficiency", - "score": 0.8683992192745209 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.356 (1)", - "tab": "Efficiency", - "score": 0.35628414297103883 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2017.955, mean=2017.955, max=2017.955, sum=2017.955 (1)", - "tab": "General information", - "score": 2017.955 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=9.044, mean=9.044, max=9.044, sum=9.044 (1)", - "tab": "General information", - "score": 9.044 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=146.262, mean=146.262, max=146.262, sum=146.262 (1)", - "tab": "General information", - "score": 146.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.433, 
mean=6.433, max=6.433, sum=6.433 (1)", - "tab": "General information", - "score": 6.433 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.954, - "details": { - "description": "min=0.954, mean=0.954, max=0.954, sum=0.954 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.218, mean=0.218, max=0.218, sum=0.218 (1)", - "tab": "Efficiency", - "score": 0.21781798839569091 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.846, mean=249.846, max=249.846, sum=249.846 (1)", - "tab": "General information", - "score": 249.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.65, mean=0.769, max=0.94, sum=3.847 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.195, mean=0.277, max=0.395, sum=1.385 (5)", - "tab": "Efficiency", - "score": 0.2769099538284435 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=378.19, mean=477.836, max=627.939, sum=2389.179 (5)", - "tab": "General information", - "score": 477.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.605, mean=0.79, max=0.93, sum=5.533 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.599, mean=4.461, max=5.828, sum=31.228 (7)", - "tab": "Efficiency", - "score": 4.461141077844028 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=145.36, mean=173.894, max=202.346, sum=1217.257 (7)", - "tab": "General information", - "score": 173.89384019579856 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=0.92 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=6.592, mean=6.592, max=6.592, sum=6.592 (1)", - "tab": "Efficiency", - "score": 6.592170278310776 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=166.4, mean=166.4, max=166.4, sum=166.4 (1)", - "tab": "General information", - "score": 166.4 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.712, - "details": { - "description": "min=0.411, mean=0.712, max=0.947, sum=3.559 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.233, mean=0.521, max=1.575, sum=2.605 (5)", - "tab": "Efficiency", - "score": 0.5210018908984072 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=207.453, mean=1557.088, max=6445.714, sum=7785.442 (5)", - "tab": "General information", - "score": 1557.0883229968654 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.299, max=3.042, sum=11.494 (5)", - "tab": "General information", - "score": 2.2988842678904344 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=0.746 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.535, mean=0.535, max=0.535, sum=0.535 (1)", - "tab": "Efficiency", - "score": 0.5349795590812122 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1052.485, mean=1052.485, max=1052.485, sum=1052.485 (1)", - "tab": "General information", - "score": 1052.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.207, - "details": { - "description": "min=0.156, mean=0.207, max=0.255, sum=1.033 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.802, mean=0.827, max=0.86, sum=4.135 (5)", - "tab": 
"Efficiency", - "score": 0.8269615642193179 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=124.855, mean=142.657, max=158.373, sum=713.283 (5)", - "tab": "General information", - "score": 142.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=25.368, mean=27.029, max=27.714, sum=135.143 (5)", - "tab": "General information", - "score": 27.028530260743235 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json b/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json deleted file mode 100644 index 3e08a0cdf..000000000 --- a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json +++ /dev/null @@ -1,644 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen2.5-72b-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5 Instruct Turbo 72B", - "id": "qwen/qwen2.5-72b-instruct-turbo", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5851310861423221 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=0.745 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.853, mean=0.853, max=0.853, sum=0.853 (1)", - "tab": "Efficiency", - "score": 0.8528219290182624 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, 
sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3492.913, mean=3492.913, max=3492.913, sum=3492.913 (1)", - "tab": "General information", - "score": 3492.912676056338 - }, - "NarrativeQA - # output tokens": { - "description": "min=8.718, mean=8.718, max=8.718, sum=8.718 (1)", - "tab": "General information", - "score": 8.71830985915493 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.359, - "details": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.359 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.974, mean=0.974, max=0.974, sum=0.974 (1)", - "tab": "Efficiency", - "score": 0.9738211624622345 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=0.506 (1)", - "tab": "Efficiency", - "score": 0.5063141629695892 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2007.955, mean=2007.955, max=2007.955, sum=2007.955 (1)", - "tab": "General information", - "score": 2007.955 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=17.681, mean=17.681, max=17.681, sum=17.681 (1)", - "tab": "General information", - "score": 17.681 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=136.262, mean=136.262, max=136.262, sum=136.262 (1)", - "tab": "General information", - "score": 136.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=15.132, mean=15.132, max=15.132, sum=15.132 (1)", - "tab": "General information", - "score": 15.132 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - 
"evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=0.962 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.372 (1)", - "tab": "Efficiency", - "score": 0.3723496675491333 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.846, mean=249.846, max=249.846, sum=249.846 (1)", - "tab": "General information", - "score": 249.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.62, mean=0.77, max=0.96, sum=3.848 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.438, mean=0.585, max=0.815, sum=2.924 (5)", - "tab": "Efficiency", - "score": 0.5848997679509614 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=378.19, mean=477.836, max=627.939, sum=2389.179 (5)", - "tab": "General information", - "score": 477.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.763, mean=0.884, max=0.97, sum=6.187 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.874, mean=6.367, max=11.192, sum=44.569 (7)", - "tab": "Efficiency", - "score": 6.366941373965945 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=147.558, mean=186.764, max=230.288, sum=1307.351 (7)", - "tab": "General information", - "score": 186.76438709076407 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=0.9 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.558, mean=2.558, max=2.558, sum=2.558 (1)", - "tab": "Efficiency", - "score": 2.5583292784690856 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=198.303, mean=198.303, max=198.303, sum=198.303 (1)", - "tab": "General information", - "score": 198.303 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.46, mean=0.74, max=0.979, sum=3.7 (5)", - "tab": 
"Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.306, mean=0.445, max=0.944, sum=2.224 (5)", - "tab": "Efficiency", - "score": 0.44489043568091446 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=216.453, mean=1558.888, max=6440.714, sum=7794.442 (5)", - "tab": "General information", - "score": 1558.8883229968653 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.453, max=3.021, sum=12.263 (5)", - "tab": "General information", - "score": 2.452587326627195 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753, - "details": { - "description": "min=0.753, mean=0.753, max=0.753, sum=0.753 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)", - "tab": "Efficiency", - "score": 0.33223102912751157 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1052.485, mean=1052.485, max=1052.485, sum=1052.485 (1)", - "tab": "General information", - "score": 1052.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.207, - "details": { - "description": "min=0.153, mean=0.207, max=0.257, sum=1.033 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.635, mean=0.67, max=0.752, sum=3.351 (5)", - "tab": "Efficiency", - "score": 0.6702916101891663 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, 
- "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=114.855, mean=132.657, max=148.373, sum=663.283 (5)", - "tab": "General information", - "score": 132.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=25.517, mean=27.126, max=27.755, sum=135.631 (5)", - "tab": "General information", - "score": 27.126178505887747 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json b/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json deleted file mode 100644 index 3f844c281..000000000 --- a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json +++ /dev/null @@ -1,644 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen2.5-7b-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5 Instruct Turbo 7B", - "id": "qwen/qwen2.5-7b-instruct-turbo", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.8808988764044944 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=0.742 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.516, mean=0.516, max=0.516, sum=0.516 (1)", - "tab": "Efficiency", - "score": 0.5156192410160119 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - 
}, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3492.913, mean=3492.913, max=3492.913, sum=3492.913 (1)", - "tab": "General information", - "score": 3492.912676056338 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.549, mean=5.549, max=5.549, sum=5.549 (1)", - "tab": "General information", - "score": 5.549295774647887 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.205, - "details": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.205 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.301 (1)", - "tab": "Efficiency", - "score": 0.30121764993667605 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.217, mean=0.217, max=0.217, sum=0.217 (1)", - "tab": "Efficiency", - "score": 0.21686342740058898 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2007.955, mean=2007.955, max=2007.955, sum=2007.955 (1)", - "tab": "General information", - "score": 2007.955 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.698, mean=8.698, max=8.698, sum=8.698 (1)", - "tab": "General information", - "score": 8.698 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=136.262, mean=136.262, max=136.262, sum=136.262 (1)", - "tab": "General information", - "score": 136.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=7.041, mean=7.041, max=7.041, sum=7.041 (1)", - "tab": "General information", - "score": 7.041 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.186, mean=0.186, max=0.186, sum=0.186 (1)", - "tab": "Efficiency", - "score": 0.1863201789855957 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.846, mean=249.846, max=249.846, sum=249.846 (1)", - "tab": "General information", - "score": 249.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.658, - "details": { - "description": "min=0.49, mean=0.658, max=0.86, sum=3.29 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.285, mean=0.35, max=0.431, sum=1.751 (5)", - "tab": "Efficiency", - "score": 0.35013260537699653 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=378.19, mean=477.836, max=627.939, sum=2389.179 (5)", - "tab": "General information", - "score": 477.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.835, - "details": { - "description": "min=0.684, mean=0.835, max=0.963, sum=5.846 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.449, mean=1.825, max=2.345, sum=12.778 (7)", - "tab": "Efficiency", - "score": 1.8253796190803115 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=156.674, mean=196.898, max=240.288, sum=1378.285 (7)", - "tab": "General information", - "score": 196.8978610559394 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=0.83 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.7, mean=1.7, max=1.7, sum=1.7 (1)", - "tab": "Efficiency", - "score": 1.7000067098140716 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=194.776, mean=194.776, max=194.776, sum=194.776 (1)", - "tab": "General information", - "score": 194.776 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.632, - "details": { - "description": "min=0.414, mean=0.632, max=0.916, sum=3.161 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.183, mean=0.261, max=0.489, sum=1.305 
(5)", - "tab": "Efficiency", - "score": 0.2609495958632719 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=216.453, mean=1558.888, max=6440.714, sum=7794.442 (5)", - "tab": "General information", - "score": 1558.8883229968653 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.402, max=3.084, sum=12.008 (5)", - "tab": "General information", - "score": 2.4015832496773273 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=0.6 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.201, mean=0.201, max=0.201, sum=0.201 (1)", - "tab": "Efficiency", - "score": 0.20058301760709546 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1052.485, mean=1052.485, max=1052.485, sum=1052.485 (1)", - "tab": "General information", - "score": 1052.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.155, - "details": { - "description": "min=0.085, mean=0.155, max=0.204, sum=0.777 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.346, mean=0.376, max=0.414, sum=1.88 (5)", - "tab": "Efficiency", - "score": 0.3759268445955365 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - 
"WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=114.855, mean=132.657, max=148.373, sum=663.283 (5)", - "tab": "General information", - "score": 132.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=26.946, mean=27.742, max=28.649, sum=138.709 (5)", - "tab": "General information", - "score": 27.74173612173115 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json b/data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json deleted file mode 100644 index 09f377d89..000000000 --- a/data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arctic Instruct", - "id": "snowflake/snowflake-arctic-instruct", - "developer": "snowflake", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7606242197253433 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.654, - "details": { - "description": "min=0.654, mean=0.654, max=0.654, sum=0.654 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)", - "tab": "Efficiency", - "score": 0.6239793220036466 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.262, mean=4.262, max=4.262, sum=4.262 (1)", - "tab": "General information", - "score": 4.261971830985916 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3603.217, mean=3603.217, max=3603.217, sum=3603.217 (1)", - "tab": "General information", - "score": 3603.2169014084507 - }, - "NarrativeQA - # output tokens": { - "description": "min=11.907, mean=11.907, max=11.907, sum=11.907 (1)", - "tab": "General information", - "score": 11.907042253521126 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.636, mean=0.636, max=0.636, sum=0.636 (1)", - "tab": "Efficiency", - "score": 0.6355201268196106 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.469, mean=0.469, max=0.469, sum=0.469 (1)", - "tab": "Efficiency", - "score": 0.4687326259613037 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.825, mean=4.825, max=4.825, sum=4.825 (1)", - "tab": "General information", - "score": 4.825 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.028, mean=0.028, max=0.028, sum=0.028 (1)", - "tab": "General information", - "score": 0.028 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2311.514, mean=2311.514, max=2311.514, sum=2311.514 (1)", - "tab": "General information", - "score": 2311.514 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=18.701, mean=18.701, max=18.701, sum=18.701 (1)", - "tab": "General information", - "score": 18.701 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=166.383, mean=166.383, max=166.383, sum=166.383 (1)", - "tab": "General information", - "score": 166.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=14.473, mean=14.473, max=14.473, sum=14.473 (1)", - "tab": "General information", - "score": 14.473 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.284 (1)", - "tab": "Efficiency", - "score": 0.2840936713218689 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=291.574, mean=291.574, max=291.574, sum=291.574 (1)", - "tab": "General information", - "score": 291.574 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.575, - "details": { - "description": "min=0.31, mean=0.575, max=0.88, sum=2.876 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.293, mean=0.303, max=0.317, sum=1.516 (5)", - "tab": "Efficiency", - "score": 0.30325288054817606 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=406.65, mean=531.547, max=693.675, sum=2657.735 (5)", - "tab": "General information", - "score": 531.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.316, mean=0.519, max=0.785, sum=3.636 (7)", - "tab": "Accuracy", - 
"MATH - Observed inference time (s)": { - "description": "min=1.482, mean=1.724, max=1.995, sum=12.068 (7)", - "tab": "Efficiency", - "score": 1.723981539653867 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)", - "tab": "General information", - "score": 1438.6362030100095 - }, - "MATH - # output tokens": { - "description": "min=82.872, mean=98.802, max=122.233, sum=691.615 (7)", - "tab": "General information", - "score": 98.80208187931566 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.961, mean=2.961, max=2.961, sum=2.961 (1)", - "tab": "Efficiency", - "score": 2.9610197002887726 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)", - "tab": "General information", - "score": 1207.746 - }, - "GSM8K - # output tokens": { - "description": "min=189.305, mean=189.305, max=189.305, sum=189.305 (1)", - "tab": "General information", - "score": 189.305 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.588, - "details": { - "description": "min=0.351, mean=0.588, max=0.874, sum=2.94 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.292, mean=0.346, max=0.462, sum=1.729 (5)", - "tab": "Efficiency", - "score": 0.34576316386866485 - }, - "LegalBench - # eval": { - "description": 
"min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=1.81, mean=4.162, max=5, sum=20.81 (5)", - "tab": "General information", - "score": 4.162040816326531 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.002, max=0.008, sum=0.008 (5)", - "tab": "General information", - "score": 0.0016326530612244899 - }, - "LegalBench - # prompt tokens": { - "description": "min=239.137, mean=1024.722, max=3561.237, sum=5123.61 (5)", - "tab": "General information", - "score": 1024.7220443430492 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.438, max=3.421, sum=12.188 (5)", - "tab": "General information", - "score": 2.4375592890361366 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581, - "details": { - "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.313 (1)", - "tab": "Efficiency", - "score": 0.31300480038697864 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1243.901, mean=1243.901, max=1243.901, sum=1243.901 (1)", - "tab": "General information", - "score": 1243.9005964214712 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172, - "details": { - "description": "min=0.09, mean=0.172, max=0.217, sum=0.86 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.65, mean=0.681, max=0.702, sum=3.405 (5)", - "tab": "Efficiency", - "score": 0.681007040066764 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=145.523, mean=160.288, max=182.972, sum=801.438 (5)", - "tab": "General information", - "score": 160.28751290334915 - }, - "WMT 2014 - # output tokens": { - "description": "min=28.596, mean=30.59, max=31.485, sum=152.951 (5)", - "tab": "General information", - "score": 30.59012702630372 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json b/data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json deleted file mode 100644 index 2bf240f96..000000000 --- a/data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/tiiuae_falcon-40b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon 40B", - "id": "tiiuae/falcon-40b", - "developer": "tiiuae", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.217, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.086729088639201 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671, - "details": { - "description": "min=0.671, mean=0.671, max=0.671, sum=0.671 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=4.985, mean=4.985, max=4.985, sum=4.985 (1)", - "tab": "Efficiency", - "score": 4.985411514362819 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.023, mean=2.023, max=2.023, sum=2.023 (1)", - "tab": "General information", - "score": 2.0225352112676056 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1692.33, mean=1692.33, max=1692.33, sum=1692.33 
(1)", - "tab": "General information", - "score": 1692.3295774647888 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392, - "details": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=3.184, mean=3.184, max=3.184, sum=3.184 (1)", - "tab": "Efficiency", - "score": 3.184468511581421 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=2.849, mean=2.849, max=2.849, sum=2.849 (1)", - "tab": "Efficiency", - "score": 2.848947753429413 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.598, mean=4.598, max=4.598, sum=4.598 (1)", - "tab": "General information", - "score": 4.598 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1586.717, mean=1586.717, max=1586.717, sum=1586.717 (1)", - "tab": "General information", - "score": 1586.717 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.991, mean=0.991, max=0.991, sum=0.991 (1)", - "tab": "General information", - "score": 0.991 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=124.246, mean=124.246, max=124.246, sum=124.246 (1)", - "tab": "General information", - "score": 124.246 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { 
- "description": "min=0.662, mean=0.662, max=0.662, sum=0.662 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=1.268, mean=1.268, max=1.268, sum=1.268 (1)", - "tab": "Efficiency", - "score": 1.268236391544342 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=251.174, mean=251.174, max=251.174, sum=251.174 (1)", - "tab": "General information", - "score": 251.174 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0.31, mean=0.507, max=0.79, sum=2.535 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.176, mean=1.431, max=1.805, sum=7.154 (5)", - "tab": "Efficiency", - "score": 1.4308063889804639 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)", - "tab": "General information", - "score": 500.12014035087725 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.128, - "details": { - "description": "min=0.019, mean=0.128, max=0.228, sum=0.893 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=7.555, mean=11.414, max=18.723, sum=79.896 (7)", - "tab": "Efficiency", - "score": 11.413689562224084 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, 
max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.385, mean=6.818, max=8, sum=47.727 (7)", - "tab": "General information", - "score": 6.818102949681896 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=965.096, mean=1150.049, max=1495.447, sum=8050.346 (7)", - "tab": "General information", - "score": 1150.0493709178531 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.267, - "details": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.267 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=12.967, mean=12.967, max=12.967, sum=12.967 (1)", - "tab": "Efficiency", - "score": 12.967224577903748 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1056.967, mean=1056.967, max=1056.967, sum=1056.967 (1)", - "tab": "General information", - "score": 1056.967 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.442, - "details": { - "description": "min=0.204, mean=0.442, max=0.737, sum=2.209 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=1.333, mean=1.731, max=3.174, sum=8.654 (5)", - "tab": "Efficiency", - "score": 1.730808089747147 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.265, mean=3.853, max=5, sum=19.265 (5)", - "tab": "General information", - "score": 3.853061224489796 - }, - "LegalBench - truncated": 
{ - "description": "min=0, mean=0.003, max=0.016, sum=0.016 (5)", - "tab": "General information", - "score": 0.0032653061224489797 - }, - "LegalBench - # prompt tokens": { - "description": "min=211.284, mean=566.694, max=1486.482, sum=2833.468 (5)", - "tab": "General information", - "score": 566.6935553560819 - }, - "LegalBench - # output tokens": { - "description": "min=0.876, mean=0.975, max=1, sum=4.876 (5)", - "tab": "General information", - "score": 0.9751020408163266 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.419, - "details": { - "description": "min=0.419, mean=0.419, max=0.419, sum=0.419 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=2.203, mean=2.203, max=2.203, sum=2.203 (1)", - "tab": "Efficiency", - "score": 2.202825612149703 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1048.624, mean=1048.624, max=1048.624, sum=1048.624 (1)", - "tab": "General information", - "score": 1048.624254473161 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.162, - "details": { - "description": "min=0.017, mean=0.162, max=0.208, sum=0.809 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=2.468, mean=3.098, max=4.642, sum=15.491 (5)", - "tab": "Efficiency", - "score": 3.0981059579736714 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=115.642, mean=162.454, max=224.817, sum=812.272 (5)", - "tab": "General information", - "score": 162.45444400902278 - }, - "WMT 2014 - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json b/data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json deleted file mode 100644 index 9a704269c..000000000 --- a/data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/tiiuae_falcon-7b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon 7B", - "id": "tiiuae/falcon-7b", - "developer": "tiiuae", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.064, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.36905118601747816 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621, - "details": { - "description": "min=0.621, mean=0.621, max=0.621, sum=0.621 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.141, mean=1.141, max=1.141, sum=1.141 (1)", - "tab": "Efficiency", - "score": 1.1411562691272144 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.023, mean=2.023, max=2.023, sum=2.023 (1)", - "tab": "General information", - "score": 2.0225352112676056 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1692.33, mean=1692.33, max=1692.33, sum=1692.33 (1)", - "tab": "General information", - "score": 1692.3295774647888 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.285, - "details": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.009, mean=1.009, max=1.009, sum=1.009 (1)", - "tab": "Efficiency", - "score": 1.0090243232250213 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.876, mean=0.876, max=0.876, sum=0.876 (1)", - "tab": "Efficiency", - "score": 0.8758702797889709 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.598, mean=4.598, max=4.598, sum=4.598 (1)", - "tab": "General information", - "score": 4.598 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1586.717, mean=1586.717, max=1586.717, sum=1586.717 (1)", - "tab": "General information", - "score": 1586.717 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.99, mean=0.99, max=0.99, sum=0.99 (1)", - "tab": "General information", - "score": 0.99 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=124.246, mean=124.246, max=124.246, sum=124.246 (1)", - "tab": "General information", - "score": 124.246 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.26 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)", - "tab": "Efficiency", - "score": 0.4118037748336792 - }, - "OpenbookQA - # eval": { - "description": "min=500, 
mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=251.174, mean=251.174, max=251.174, sum=251.174 (1)", - "tab": "General information", - "score": 251.174 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288, - "details": { - "description": "min=0.17, mean=0.288, max=0.39, sum=1.441 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.434, mean=0.475, max=0.497, sum=2.373 (5)", - "tab": "Efficiency", - "score": 0.47453500427279555 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)", - "tab": "General information", - "score": 500.12014035087725 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.044, - "details": { - "description": "min=0, mean=0.044, max=0.105, sum=0.307 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=5.445, mean=6.987, max=10.873, sum=48.91 (7)", - "tab": "Efficiency", - "score": 6.987098801445013 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.385, mean=6.818, max=8, sum=47.727 (7)", - "tab": "General information", - "score": 6.818102949681896 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - 
"tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=965.096, mean=1150.049, max=1495.447, sum=8050.346 (7)", - "tab": "General information", - "score": 1150.0493709178531 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.055, - "details": { - "description": "min=0.055, mean=0.055, max=0.055, sum=0.055 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=6.94, mean=6.94, max=6.94, sum=6.94 (1)", - "tab": "Efficiency", - "score": 6.940216990470886 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1056.967, mean=1056.967, max=1056.967, sum=1056.967 (1)", - "tab": "General information", - "score": 1056.967 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346, - "details": { - "description": "min=0.12, mean=0.346, max=0.558, sum=1.731 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.453, mean=0.628, max=1.041, sum=3.139 (5)", - "tab": "Efficiency", - "score": 0.6278266410596228 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.265, mean=3.853, max=5, sum=19.265 (5)", - "tab": "General information", - "score": 3.853061224489796 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.003, max=0.016, sum=0.016 (5)", - "tab": "General information", - "score": 0.0032653061224489797 - }, - "LegalBench - # prompt tokens": { - "description": "min=211.284, mean=566.694, max=1486.482, sum=2833.468 (5)", - "tab": "General information", - "score": 566.6935553560819 - }, - "LegalBench 
- # output tokens": { - "description": "min=0.982, mean=0.996, max=1, sum=4.982 (5)", - "tab": "General information", - "score": 0.9963265306122449 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.254, - "details": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.254 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.735, mean=0.735, max=0.735, sum=0.735 (1)", - "tab": "Efficiency", - "score": 0.7352914724861889 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1048.624, mean=1048.624, max=1048.624, sum=1048.624 (1)", - "tab": "General information", - "score": 1048.624254473161 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.094, - "details": { - "description": "min=0.0, mean=0.094, max=0.186, sum=0.471 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.05, mean=1.604, max=3.055, sum=8.019 (5)", - "tab": "Efficiency", - "score": 1.6038075838932468 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=115.642, mean=162.454, max=224.817, sum=812.272 (5)", - "tab": "General information", - "score": 162.45444400902278 - }, - "WMT 2014 - # output tokens": { - "description": "min=0.999, mean=1.0, max=1, sum=4.999 (5)", - "tab": "General information", - "score": 0.9997596153846153 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json b/data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json deleted file mode 100644 index 1f111d01c..000000000 --- a/data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/upstage_solar-pro-241126/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Solar Pro", - "id": "upstage/solar-pro-241126", - "developer": "upstage", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.4817852684144819 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753, - "details": { - "description": "min=0.753, mean=0.753, max=0.753, sum=0.753 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.29, mean=2.29, max=2.29, sum=2.29 (1)", - "tab": "Efficiency", - "score": 2.2897866705773584 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=4063.606, mean=4063.606, max=4063.606, sum=4063.606 (1)", - "tab": "General information", - "score": 4063.605633802817 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.972, mean=5.972, max=5.972, sum=5.972 (1)", - "tab": "General information", - "score": 5.971830985915493 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - 
"metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297, - "details": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.297 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.102, mean=1.102, max=1.102, sum=1.102 (1)", - "tab": "Efficiency", - "score": 1.1022112455368043 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.588, mean=0.588, max=0.588, sum=0.588 (1)", - "tab": "Efficiency", - "score": 0.5883909621238709 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2513.406, mean=2513.406, max=2513.406, sum=2513.406 (1)", - "tab": "General information", - "score": 2513.406 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.252, mean=7.252, max=7.252, sum=7.252 (1)", - "tab": "General information", - "score": 7.252 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=156.383, mean=156.383, max=156.383, sum=156.383 (1)", - "tab": "General information", - "score": 156.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=9.034, mean=9.034, max=9.034, sum=9.034 (1)", - "tab": "General information", - "score": 9.034 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=0.922 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.431 (1)", - "tab": "Efficiency", - "score": 0.43103125095367434 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=291.574, mean=291.574, max=291.574, sum=291.574 (1)", - "tab": "General information", - "score": 291.574 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "description": "min=0.46, mean=0.679, max=0.97, sum=3.395 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.429, mean=0.529, max=0.765, sum=2.644 (5)", - "tab": "Efficiency", - "score": 0.5287977041361624 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=406.65, mean=531.547, max=693.675, sum=2657.735 (5)", - "tab": "General information", - "score": 531.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "description": "min=0.421, mean=0.567, max=0.741, sum=3.968 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.926, mean=2.29, max=2.87, sum=16.027 (7)", - "tab": "Efficiency", - "score": 2.289581796117552 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)", - "tab": "General information", - "score": 1438.6362030100095 - }, - "MATH - # output tokens": { - "description": "min=94.269, mean=124.053, 
max=183.018, sum=868.373 (7)", - "tab": "General information", - "score": 124.05328023895956 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=0.871 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.666, mean=2.666, max=2.666, sum=2.666 (1)", - "tab": "Efficiency", - "score": 2.6663423478603363 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)", - "tab": "General information", - "score": 1207.746 - }, - "GSM8K - # output tokens": { - "description": "min=143.978, mean=143.978, max=143.978, sum=143.978 (1)", - "tab": "General information", - "score": 143.978 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.384, mean=0.67, max=0.905, sum=3.348 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.438, mean=0.654, max=1.454, sum=3.271 (5)", - "tab": "Efficiency", - "score": 0.6542452756040519 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=229.137, mean=1839.512, max=7675.188, sum=9197.561 (5)", - "tab": "General information", - "score": 1839.5122484246817 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.395, max=2.011, sum=6.977 (5)", - "tab": "General information", - "score": 1.3953837372723363 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - 
"corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.698, mean=0.698, max=0.698, sum=0.698 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.596, mean=0.596, max=0.596, sum=0.596 (1)", - "tab": "Efficiency", - "score": 0.5956100185159187 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1243.901, mean=1243.901, max=1243.901, sum=1243.901 (1)", - "tab": "General information", - "score": 1243.9005964214712 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.169, - "details": { - "description": "min=0.085, mean=0.169, max=0.229, sum=0.844 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.839, mean=0.871, max=0.895, sum=4.357 (5)", - "tab": "Efficiency", - "score": 0.8713457104322841 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=135.523, mean=150.288, max=172.972, sum=751.438 (5)", - "tab": "General information", - "score": 150.28751290334915 - }, - "WMT 2014 - # output tokens": { - "description": "min=27.539, mean=30.28, max=31.635, sum=151.4 (5)", - "tab": "General information", - "score": 30.280004587857473 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json b/data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json deleted file mode 100644 index 8026be475..000000000 --- 
a/data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json +++ /dev/null @@ -1,649 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/writer_palmyra-x-004/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra-X-004", - "id": "writer/palmyra-x-004", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.4045318352059925 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=0.773 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.634, mean=1.634, max=1.634, sum=1.634 (1)", - "tab": "Efficiency", - "score": 1.634409177135414 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)", - "tab": "General information", - "score": 3484.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.338, mean=6.338, max=6.338, sum=6.338 (1)", - "tab": "General information", - "score": 6.338028169014085 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.457, - "details": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.221, mean=1.221, max=1.221, sum=1.221 (1)", - "tab": "Efficiency", - "score": 1.22119681596756 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.213, mean=1.213, max=1.213, sum=1.213 (1)", - "tab": "Efficiency", - "score": 1.2129934797286988 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.965, mean=4.965, max=4.965, sum=4.965 (1)", - "tab": "General information", - "score": 4.965 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1675.231, mean=1675.231, max=1675.231, sum=1675.231 (1)", - "tab": "General information", - "score": 1675.231 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=10.295, mean=10.295, max=10.295, sum=10.295 (1)", - "tab": "General information", - "score": 10.295 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)", - "tab": "General information", - "score": 129.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=12.549, mean=12.549, max=12.549, sum=12.549 (1)", - "tab": "General information", - "score": 12.549 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook", - "stop": "none" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=0.926 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.271 (1)", - "tab": "Efficiency", - "score": 0.2705215420722961 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.776, 
mean=249.776, max=249.776, sum=249.776 (1)", - "tab": "General information", - "score": 249.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=0.992, mean=0.992, max=0.992, sum=0.992 (1)", - "tab": "General information", - "score": 0.992 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.52, mean=0.739, max=0.92, sum=3.694 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.309, mean=0.396, max=0.722, sum=1.982 (5)", - "tab": "Efficiency", - "score": 0.39635124337045774 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)", - "tab": "General information", - "score": 467.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=0.97, mean=0.99, max=1, sum=4.951 (5)", - "tab": "General information", - "score": 0.9902456140350877 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.767, - "details": { - "description": "min=0.553, mean=0.767, max=0.948, sum=5.371 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=5.13, mean=14.827, max=45.729, sum=103.786 (7)", - "tab": "Efficiency", - "score": 14.82662017363065 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=174.547, mean=209.333, max=238.692, sum=1465.33 (7)", - "tab": "General information", - "score": 209.3327932233685 - } - } 
- }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True", - "stop": "none" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=0.905 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=11.45, mean=11.45, max=11.45, sum=11.45 (1)", - "tab": "Efficiency", - "score": 11.449529441833496 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=174.327, mean=174.327, max=174.327, sum=174.327 (1)", - "tab": "General information", - "score": 174.327 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.433, mean=0.73, max=0.989, sum=3.648 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.478, mean=0.504, max=0.522, sum=2.519 (5)", - "tab": "Efficiency", - "score": 0.5037181089898329 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=216.442, mean=1524.207, max=6297.633, sum=7621.033 (5)", - "tab": "General information", - "score": 1524.206501356544 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.416, max=2.021, sum=7.082 (5)", - "tab": "General information", - "score": 1.4163162483866343 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - 
"international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.775, mean=0.775, max=0.775, sum=0.775 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.399, mean=0.399, max=0.399, sum=0.399 (1)", - "tab": "Efficiency", - "score": 0.39942375139498093 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)", - "tab": "General information", - "score": 1025.2743538767395 - }, - "MedQA - # output tokens": { - "description": "min=0.992, mean=0.992, max=0.992, sum=0.992 (1)", - "tab": "General information", - "score": 0.9920477137176938 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.203, - "details": { - "description": "min=0.144, mean=0.203, max=0.249, sum=1.016 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.801, mean=2.046, max=2.515, sum=10.228 (5)", - "tab": "Efficiency", - "score": 2.045695114985284 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=96.139, mean=115.712, max=136.117, sum=578.559 (5)", - "tab": "General information", - "score": 115.71178123566294 - }, - "WMT 2014 - # output tokens": { - "description": "min=26.191, mean=29.362, max=37.718, sum=146.808 (5)", - "tab": "General information", - "score": 29.36160106667686 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ], - "stop": "none" - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json b/data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json deleted file mode 100644 index 5e5faf9fb..000000000 --- 
a/data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/writer_palmyra-x-v2/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra X V2 33B", - "id": "writer/palmyra-x-v2", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5062546816479401 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.202, mean=1.202, max=1.202, sum=1.202 (1)", - "tab": "Efficiency", - "score": 1.2016644296511798 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3504.577, mean=3504.577, max=3504.577, sum=3504.577 (1)", - "tab": "General information", - "score": 3504.5774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=8.208, mean=8.208, max=8.208, sum=8.208 (1)", - "tab": "General information", - "score": 8.208450704225353 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428, 
- "details": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.428 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.969, mean=0.969, max=0.969, sum=0.969 (1)", - "tab": "Efficiency", - "score": 0.9688332653045655 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.62, mean=0.62, max=0.62, sum=0.62 (1)", - "tab": "Efficiency", - "score": 0.6202523970603943 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.926, mean=4.926, max=4.926, sum=4.926 (1)", - "tab": "General information", - "score": 4.926 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.013, mean=0.013, max=0.013, sum=0.013 (1)", - "tab": "General information", - "score": 0.013 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1662.782, mean=1662.782, max=1662.782, sum=1662.782 (1)", - "tab": "General information", - "score": 1662.782 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.809, mean=7.809, max=7.809, sum=7.809 (1)", - "tab": "General information", - "score": 7.809 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.254, mean=116.254, max=116.254, sum=116.254 (1)", - "tab": "General information", - "score": 116.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=7.067, mean=7.067, max=7.067, sum=7.067 (1)", - "tab": "General information", - "score": 7.067 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=0.878 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.42 (1)", - "tab": "Efficiency", - "score": 0.4200127201080322 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.21, mean=254.21, max=254.21, sum=254.21 (1)", - "tab": 
"General information", - "score": 254.21 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621, - "details": { - "description": "min=0.37, mean=0.621, max=0.91, sum=3.106 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.462, mean=0.532, max=0.577, sum=2.661 (5)", - "tab": "Efficiency", - "score": 0.5321985618859008 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=2361.37 (5)", - "tab": "General information", - "score": 472.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.395, mean=0.58, max=0.8, sum=4.059 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.722, mean=2.088, max=2.676, sum=14.619 (7)", - "tab": "Efficiency", - "score": 2.0883775065675723 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=906.556, mean=1375.735, max=2449.942, sum=9630.147 (7)", - "tab": "General information", - "score": 1375.7353092779654 - }, - "MATH - # output tokens": { - "description": "min=64, mean=87.032, max=107.385, sum=609.221 (7)", - "tab": "General information", - "score": 87.03154467364993 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - 
"counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=0.735 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.543, mean=2.543, max=2.543, sum=2.543 (1)", - "tab": "Efficiency", - "score": 2.543274956703186 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.869, mean=938.869, max=938.869, sum=938.869 (1)", - "tab": "General information", - "score": 938.869 - }, - "GSM8K - # output tokens": { - "description": "min=89.718, mean=89.718, max=89.718, sum=89.718 (1)", - "tab": "General information", - "score": 89.718 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { - "description": "min=0.33, mean=0.644, max=0.989, sum=3.221 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.425, mean=0.731, max=1.784, sum=3.657 (5)", - "tab": "Efficiency", - "score": 0.7313747247589137 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=3.984, mean=4.597, max=5, sum=22.984 (5)", - "tab": "General information", - "score": 4.596734693877551 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.632, mean=1355.759, max=5467.178, sum=6778.793 (5)", - "tab": "General information", - "score": 1355.7586406214054 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=2.077, max=5.406, sum=10.386 (5)", - "tab": "General information", - "score": 2.0771673311343752 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598, - "details": { - "description": "min=0.598, mean=0.598, max=0.598, sum=0.598 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)", - "tab": "Efficiency", - "score": 0.6051040529967776 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1038.861, mean=1038.861, max=1038.861, sum=1038.861 (1)", - "tab": "General information", - "score": 1038.8608349900596 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.239, - "details": { - "description": "min=0.2, mean=0.239, max=0.27, sum=1.194 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.83, mean=0.905, max=0.948, sum=4.524 (5)", - "tab": "Efficiency", - "score": 0.904815991352295 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=136.93, mean=181.694, max=241.662, sum=908.469 (5)", - "tab": "General information", - "score": 181.69386660804403 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.829, mean=25.142, max=25.958, sum=125.709 (5)", - "tab": "General information", - "score": 25.14180111637865 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json b/data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json deleted file mode 100644 index c8073d254..000000000 --- a/data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/writer_palmyra-x-v3/1770834614.1822479", - "retrieved_timestamp": 
"1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra X V3 72B", - "id": "writer/palmyra-x-v3", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.25696629213483146 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=0.706 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.849, mean=2.849, max=2.849, sum=2.849 (1)", - "tab": "Efficiency", - "score": 2.848917615245765 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3504.577, mean=3504.577, max=3504.577, sum=3504.577 (1)", - "tab": "General information", - "score": 3504.5774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=11.149, mean=11.149, max=11.149, sum=11.149 (1)", - "tab": "General information", - "score": 11.149295774647888 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407, - "details": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.407 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=2.319, mean=2.319, max=2.319, sum=2.319 
(1)", - "tab": "Efficiency", - "score": 2.31904000210762 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=2.373, mean=2.373, max=2.373, sum=2.373 (1)", - "tab": "Efficiency", - "score": 2.3729000978469847 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.885, mean=4.885, max=4.885, sum=4.885 (1)", - "tab": "General information", - "score": 4.885 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "General information", - "score": 0.02 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1617.709, mean=1617.709, max=1617.709, sum=1617.709 (1)", - "tab": "General information", - "score": 1617.709 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=12.864, mean=12.864, max=12.864, sum=12.864 (1)", - "tab": "General information", - "score": 12.864 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.254, mean=116.254, max=116.254, sum=116.254 (1)", - "tab": "General information", - "score": 116.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=19.113, mean=19.113, max=19.113, sum=19.113 (1)", - "tab": "General information", - "score": 19.113 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.607, mean=0.607, max=0.607, sum=0.607 (1)", - "tab": "Efficiency", - "score": 0.6074039902687073 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.21, mean=254.21, max=254.21, sum=254.21 (1)", - "tab": "General information", - "score": 254.21 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.53, mean=0.702, max=0.96, sum=3.509 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.604, mean=0.657, max=0.783, sum=3.283 (5)", - "tab": "Efficiency", - "score": 0.656667516515966 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=2361.37 (5)", - "tab": "General information", - "score": 472.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.579, mean=0.723, max=0.896, sum=5.06 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.23, mean=4.259, max=6.331, sum=29.811 (7)", - "tab": "Efficiency", - "score": 4.258683228698372 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=906.556, mean=1375.735, max=2449.942, sum=9630.147 (7)", - "tab": "General information", - "score": 1375.7353092779654 - }, - "MATH - # output tokens": { - "description": "min=60.012, mean=83.135, max=128.942, sum=581.943 (7)", - "tab": "General information", - "score": 83.13468064416656 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - 
"evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=0.831 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.07, mean=5.07, max=5.07, sum=5.07 (1)", - "tab": "Efficiency", - "score": 5.069576686620712 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.869, mean=938.869, max=938.869, sum=938.869 (1)", - "tab": "General information", - "score": 938.869 - }, - "GSM8K - # output tokens": { - "description": "min=89.919, mean=89.919, max=89.919, sum=89.919 (1)", - "tab": "General information", - "score": 89.919 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.439, mean=0.709, max=0.926, sum=3.544 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.668, mean=1.16, max=3.0, sum=5.798 (5)", - "tab": "Efficiency", - "score": 1.1595191393847304 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=3.984, mean=4.597, max=5, sum=22.984 (5)", - "tab": "General information", - "score": 4.596734693877551 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.632, mean=1355.759, max=5467.178, sum=6778.793 (5)", - "tab": "General information", - "score": 1355.7586406214054 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.078, max=1.2, sum=5.388 (5)", - "tab": "General information", - "score": 1.0776021798365123 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.927, mean=0.927, max=0.927, sum=0.927 (1)", - "tab": "Efficiency", - "score": 0.9268994279220611 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1038.861, mean=1038.861, max=1038.861, sum=1038.861 (1)", - "tab": "General information", - "score": 1038.8608349900596 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.262, - "details": { - "description": "min=0.235, mean=0.262, max=0.284, sum=1.309 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.32, mean=1.406, max=1.477, sum=7.032 (5)", - "tab": "Efficiency", - "score": 1.4063038200537652 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=136.93, mean=181.694, max=241.662, sum=908.469 (5)", - "tab": "General information", - "score": 181.69386660804403 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.356, mean=24.983, max=25.829, sum=124.915 (5)", - "tab": "General information", - "score": 24.983090877810064 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json b/data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json deleted file mode 100644 index a5d4de71f..000000000 --- a/data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/01-ai_yi-34b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi 34B", - "id": "01-ai/yi-34b", - 
"developer": "01-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.4, mean=0.762, max=0.974, sum=86.905 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.407, mean=0.823, max=2.683, sum=93.841 (114)", - "tab": "Efficiency", - "score": 0.8231679963633336 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=289.971, mean=661.842, max=2957.412, sum=75449.942 (114)", - "tab": "General information", - "score": 661.8416008681387 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - 
"mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.658, mean=0.658, max=0.658, sum=1.315 (2)", - "tab": "Efficiency", - "score": 0.6577284264564515 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=383.67, mean=383.67, max=383.67, sum=767.34 (2)", - "tab": "General information", - "score": 383.67 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.748, mean=0.748, max=0.748, sum=1.496 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.601, mean=0.601, 
max=0.601, sum=1.202 (2)", - "tab": "Efficiency", - "score": 0.6009190011907507 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=375.77, mean=375.77, max=375.77, sum=751.541 (2)", - "tab": "General information", - "score": 375.77037037037036 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.531, mean=0.531, max=0.531, sum=1.061 (2)", - "tab": "Efficiency", - "score": 0.5305842399597168 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.004 (2)", - "tab": "Efficiency", - "score": 0.5021488202942742 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.708, mean=0.708, max=0.708, sum=1.415 (2)", - "tab": "Efficiency", - "score": 0.7075318503379822 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.569, mean=0.569, max=0.569, sum=1.138 (2)", - "tab": "Efficiency", - "score": 0.5689087891578675 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.575, mean=0.575, max=0.575, sum=1.15 (2)", - "tab": "Efficiency", - "score": 0.5747669638925894 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.604, mean=0.604, max=0.604, sum=1.207 (2)", - "tab": "Efficiency", - "score": 0.603668584543116 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=597.54, mean=597.54, max=597.54, sum=1195.08 (2)", - "tab": "General information", - "score": 597.54 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { 
- "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=514.819, mean=514.819, max=514.819, sum=1029.639 (2)", - "tab": "General information", - "score": 514.8194444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=883.06, mean=883.06, max=883.06, sum=1766.12 (2)", - "tab": "General information", - "score": 883.06 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=635.3, mean=635.3, max=635.3, sum=1270.6 (2)", - "tab": "General information", - "score": 635.3 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=549.688, mean=549.688, max=549.688, sum=1099.376 (2)", - "tab": "General information", - "score": 549.6878612716763 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=512.912, mean=512.912, max=512.912, sum=1025.824 (2)", - "tab": "General information", - "score": 512.9117647058823 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.943 (2)", - "tab": "Efficiency", - "score": 0.47160084009170533 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=405.74, mean=405.74, max=405.74, sum=811.48 (2)", - "tab": "General information", - "score": 405.74 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.588, - "details": { - "description": "min=0.588, mean=0.588, max=0.588, sum=1.175 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.61, mean=0.61, max=0.61, sum=1.219 (2)", - "tab": "Efficiency", - "score": 0.6095903463530958 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=667.789, mean=667.789, max=667.789, sum=1335.579 (2)", - "tab": "General information", - "score": 667.7894736842105 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.537, mean=0.537, max=0.537, sum=1.074 (2)", - "tab": "Efficiency", - "score": 0.5369880175590516 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=462.32, mean=462.32, max=462.32, sum=924.64 (2)", - "tab": "General information", - "score": 462.32 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.668, mean=0.668, max=0.668, sum=1.336 (2)", - "tab": "Efficiency", - "score": 0.668224381075965 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=431.898, mean=431.898, max=431.898, sum=863.796 (2)", - "tab": "General information", - "score": 431.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": 
[ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.848 (2)", - "tab": "Efficiency", - "score": 0.42395149779856395 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=356.723, mean=356.723, max=356.723, sum=713.447 (2)", - "tab": "General information", - "score": 356.7234726688103 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=2.222, mean=2.222, max=2.222, sum=4.444 (2)", - "tab": "Efficiency", - "score": 2.222188143169179 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Efficiency", - "score": 0.6598629156748453 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.839, mean=1.839, max=1.839, sum=3.678 (2)", - "tab": "Efficiency", - "score": 1.839003596032303 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=2.178, mean=2.178, max=2.178, sum=4.356 (2)", - "tab": "Efficiency", - "score": 2.1780028343200684 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1202.533, mean=1202.533, max=1202.533, sum=2405.066 (2)", - "tab": "General information", - "score": 1202.5330882352941 - }, - "Professional Medicine - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=771.16, mean=771.16, max=771.16, sum=1542.319 (2)", - "tab": "General information", - "score": 771.1595744680851 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1759.098, mean=1759.098, max=1759.098, sum=3518.197 (2)", - "tab": "General information", - "score": 1759.0984354628422 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=608.201, mean=608.201, max=608.201, sum=1216.402 (2)", - "tab": "General information", - "score": 608.2009803921569 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.607, mean=0.607, max=0.607, sum=1.214 (2)", - "tab": "Efficiency", - "score": 0.6068471717834473 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - 
"Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=458.53, mean=458.53, max=458.53, sum=917.06 (2)", - "tab": "General information", - "score": 458.53 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.803 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.559, mean=0.559, max=0.559, sum=1.117 (2)", - "tab": "Efficiency", - "score": 0.5586237562330145 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=626.895, mean=626.895, max=626.895, sum=1253.789 (2)", - "tab": "General information", - "score": 626.8947368421053 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Efficiency", - "score": 0.5663742089271545 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=616.97, mean=616.97, max=616.97, sum=1233.94 (2)", - "tab": "General information", - "score": 616.97 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.975 (2)", - "tab": "Efficiency", - "score": 0.4874912774787759 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=446.966, mean=446.966, max=446.966, sum=893.932 (2)", - "tab": "General information", - "score": 446.96603773584906 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.4390637499220828 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": 
"min=311.94, mean=311.94, max=311.94, sum=623.881 (2)", - "tab": "General information", - "score": 311.9404255319149 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.559 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.531, mean=0.531, max=0.531, sum=1.063 (2)", - "tab": "Efficiency", - "score": 0.531287300175634 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=491.993, mean=491.993, max=491.993, sum=983.986 (2)", - "tab": "General information", - "score": 491.99310344827586 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.312 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.123 (2)", - "tab": "Efficiency", - "score": 0.5613514084033865 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=601.344, 
mean=601.344, max=601.344, sum=1202.688 (2)", - "tab": "General information", - "score": 601.3439153439153 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.095 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.626, mean=0.626, max=0.626, sum=1.253 (2)", - "tab": "Efficiency", - "score": 0.6264226947511945 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=675.579, mean=675.579, max=675.579, sum=1351.159 (2)", - "tab": "General information", - "score": 675.5793650793651 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=1.814 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.179 (2)", - "tab": "Efficiency", - "score": 0.5895279146009876 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.124 (2)", - "tab": "Efficiency", - "score": 0.5618457112993512 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Efficiency", - "score": 0.8510373497009277 - }, - "High School European History - Observed inference time (s)": { - "description": "min=2.359, mean=2.359, max=2.359, sum=4.717 (2)", - "tab": "Efficiency", - "score": 2.358732930096713 - }, - "High School Geography - Observed inference time (s)": { 
- "description": "min=1.215, mean=1.215, max=1.215, sum=2.43 (2)", - "tab": "Efficiency", - "score": 1.21489392266129 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.677, mean=0.677, max=0.677, sum=1.354 (2)", - "tab": "Efficiency", - "score": 0.6768323757487875 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)", - "tab": "Efficiency", - "score": 0.5697616595488328 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.541, mean=0.541, max=0.541, sum=1.082 (2)", - "tab": "Efficiency", - "score": 0.5409333193743671 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.657, mean=0.657, max=0.657, sum=1.314 (2)", - "tab": "Efficiency", - "score": 0.6570467107436236 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.738, mean=0.738, max=0.738, sum=1.476 (2)", - "tab": "Efficiency", - "score": 0.7378138311651369 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.524, mean=0.524, max=0.524, sum=1.049 (2)", - "tab": "Efficiency", - "score": 0.5244918534515101 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Efficiency", - "score": 0.7453252838717567 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.821, mean=1.821, max=1.821, sum=3.642 (2)", - "tab": "Efficiency", - "score": 1.8211165923698276 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.27, mean=1.27, max=1.27, sum=2.541 (2)", - "tab": "Efficiency", - "score": 1.2703520537428714 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=546.394, mean=546.394, max=546.394, sum=1092.787 (2)", - "tab": "General information", - "score": 546.3935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=537.015, mean=537.015, max=537.015, sum=1074.03 (2)", - "tab": "General information", - "score": 537.0147783251232 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer 
Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=962.1, mean=962.1, max=962.1, sum=1924.2 (2)", - "tab": "General information", - "score": 962.1 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2957.412, mean=2957.412, max=2957.412, sum=5914.824 (2)", - "tab": "General information", - "score": 2957.4121212121213 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=404.035, mean=404.035, max=404.035, sum=808.071 (2)", - "tab": "General information", - "score": 404.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=484.725, mean=484.725, max=484.725, sum=969.451 (2)", - "tab": "General information", - "score": 484.7253886010363 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - 
# prompt tokens": { - "description": "min=398.892, mean=398.892, max=398.892, sum=797.785 (2)", - "tab": "General information", - "score": 398.89230769230767 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=575.622, mean=575.622, max=575.622, sum=1151.244 (2)", - "tab": "General information", - "score": 575.6222222222223 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=420.739, mean=420.739, max=420.739, sum=841.479 (2)", - "tab": "General information", - "score": 420.73949579831935 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=599.411, mean=599.411, max=599.411, sum=1198.821 (2)", - "tab": "General information", - "score": 599.4105960264901 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=526.826, mean=526.826, max=526.826, sum=1053.651 (2)", - "tab": "General information", - "score": 526.8256880733945 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, 
max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=869.778, mean=869.778, max=869.778, sum=1739.556 (2)", - "tab": "General information", - "score": 869.7777777777778 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2369.132, mean=2369.132, max=2369.132, sum=4738.265 (2)", - "tab": "General information", - "score": 2369.1323529411766 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1541.371, mean=1541.371, max=1541.371, sum=3082.743 (2)", - "tab": "General information", - "score": 1541.3713080168777 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.768, mean=0.768, max=0.768, sum=1.535 (2)", - "tab": "Efficiency", - "score": 0.76751750146327 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.4077764613027791 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General 
information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=332.013, mean=332.013, max=332.013, sum=664.027 (2)", - "tab": "General information", - "score": 332.0134529147982 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=367.855, mean=367.855, max=367.855, sum=735.71 (2)", - "tab": "General information", - "score": 367.85496183206106 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.909, - "details": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.818 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.588, mean=0.588, max=0.588, sum=1.175 (2)", - "tab": "Efficiency", - "score": 0.5876634554429487 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=663.289, mean=663.289, max=663.289, sum=1326.579 (2)", - "tab": "General information", - "score": 663.2892561983471 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.767 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.622, mean=0.622, max=0.622, sum=1.245 (2)", - "tab": "Efficiency", - "score": 0.6223941814680041 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=466.595, mean=466.595, max=466.595, sum=933.19 (2)", - "tab": "General information", - "score": 466.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.161 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.638, mean=0.638, max=0.638, sum=1.277 (2)", - "tab": "Efficiency", - "score": 0.6384105682373047 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=720.161, mean=720.161, max=720.161, sum=1440.321 (2)", - "tab": "General information", - "score": 720.1607142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.4204523748564489 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=300.544, mean=300.544, max=300.544, sum=601.087 (2)", - "tab": "General information", - "score": 300.54368932038835 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.936, - "details": { - "description": "min=0.936, mean=0.936, max=0.936, sum=1.872 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.926 (2)", - "tab": "Efficiency", - "score": 0.463064443351876 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=442.825, mean=442.825, max=442.825, sum=885.65 (2)", - "tab": "General information", - "score": 442.8247863247863 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": 
{ - "description": "min=0.428, mean=0.428, max=0.428, sum=0.857 (2)", - "tab": "Efficiency", - "score": 0.42836678981781007 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=362, mean=362, max=362, sum=724 (2)", - "tab": "General information", - "score": 362.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.537, mean=0.537, max=0.537, sum=1.075 (2)", - "tab": "Efficiency", - "score": 0.5372742845333095 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=331.441, mean=331.441, max=331.441, sum=662.881 (2)", - "tab": "General information", - "score": 331.4406130268199 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.606, - "details": { - "description": "min=0.606, mean=0.606, max=0.606, sum=1.211 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.341 (2)", - "tab": "Efficiency", - "score": 0.6705957754498961 - }, - "Moral Scenarios - Observed inference time (s)": { - 
"description": "min=0.764, mean=0.764, max=0.764, sum=1.528 (2)", - "tab": "Efficiency", - "score": 0.7642385613318928 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=507.913, mean=507.913, max=507.913, sum=1015.827 (2)", - "tab": "General information", - "score": 507.91329479768785 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=703.334, mean=703.334, max=703.334, sum=1406.668 (2)", - "tab": "General information", - "score": 703.3340782122905 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=1.739 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=1.038, mean=1.038, max=1.038, sum=2.077 (2)", - "tab": "Efficiency", - "score": 1.0384757246067322 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=643.317, mean=643.317, max=643.317, sum=1286.634 (2)", - "tab": "General information", - "score": 643.3169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.753 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.121 (2)", - "tab": "Efficiency", - "score": 0.560588002204895 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=565.096, mean=565.096, max=565.096, sum=1130.191 (2)", - "tab": "General information", - "score": 565.0956790123457 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=2.107, mean=2.107, max=2.107, sum=4.213 (2)", - "tab": "Efficiency", - "score": 2.1067019375887783 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=432.436, mean=432.436, max=432.436, sum=864.873 (2)", - "tab": "General information", - "score": 432.43636363636364 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.665 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=2.683, mean=2.683, max=2.683, sum=5.366 (2)", - "tab": "Efficiency", - "score": 2.682755525744691 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1227.196, mean=1227.196, max=1227.196, sum=2454.392 (2)", - "tab": "General information", - "score": 1227.1959183673468 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=1.401, mean=1.401, max=1.401, sum=2.803 (2)", - "tab": "Efficiency", - "score": 1.4013089469416224 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=463.99, mean=463.99, max=463.99, sum=927.98 (2)", - "tab": "General information", - "score": 463.99004975124376 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - 
"details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.563, mean=0.563, max=0.563, sum=1.127 (2)", - "tab": "Efficiency", - "score": 0.5633984617440098 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=363.102, mean=363.102, max=363.102, sum=726.205 (2)", - "tab": "General information", - "score": 363.1024096385542 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.814 (2)", - "tab": "Efficiency", - "score": 0.4067504726655302 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=289.971, mean=289.971, max=289.971, sum=579.942 (2)", - "tab": "General information", - "score": 289.97076023391816 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json b/data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json deleted file mode 100644 index 1f0a7e20f..000000000 --- a/data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/01-ai_yi-6b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi 6B", - "id": "01-ai/yi-6b", - "developer": "01-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.3, mean=0.64, max=0.907, sum=72.967 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.315, mean=0.388, max=0.912, sum=44.195 (114)", - "tab": "Efficiency", - "score": 0.3876731134304364 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=289.971, mean=661.842, max=2957.412, sum=75449.942 (114)", - "tab": "General information", - "score": 661.8416008681387 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - 
"professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3, - "details": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.34289863109588625 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=383.67, mean=383.67, max=383.67, sum=767.34 (2)", - "tab": "General information", - "score": 383.67 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - 
} - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.668 (2)", - "tab": "Efficiency", - "score": 0.3338937794720685 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=375.77, mean=375.77, max=375.77, sum=751.541 (2)", - "tab": "General information", - "score": 375.77037037037036 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422, - "details": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.843 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Efficiency", - "score": 0.3400930452346802 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.661 (2)", - "tab": "Efficiency", - "score": 0.3306954221593009 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.793 (2)", - "tab": "Efficiency", - "score": 0.39658718585968017 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.744 (2)", - "tab": "Efficiency", - "score": 0.3718992257118225 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.360349433270493 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.363, mean=0.363, max=0.363, sum=0.726 (2)", - "tab": "Efficiency", - "score": 0.36309780092800364 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=597.54, mean=597.54, max=597.54, sum=1195.08 (2)", - "tab": "General information", - "score": 597.54 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=514.819, mean=514.819, max=514.819, sum=1029.639 (2)", - "tab": "General information", - "score": 514.8194444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=883.06, mean=883.06, max=883.06, sum=1766.12 (2)", - "tab": "General information", - "score": 883.06 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=635.3, mean=635.3, max=635.3, sum=1270.6 (2)", - "tab": "General information", - "score": 635.3 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=549.688, mean=549.688, max=549.688, sum=1099.376 (2)", - "tab": "General information", - "score": 549.6878612716763 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - 
"description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=512.912, mean=512.912, max=512.912, sum=1025.824 (2)", - "tab": "General information", - "score": 512.9117647058823 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.673 (2)", - "tab": "Efficiency", - "score": 0.3364018177986145 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=405.74, mean=405.74, max=405.74, sum=811.48 (2)", - "tab": "General information", - "score": 405.74 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.351, - "details": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.796 (2)", - "tab": "Efficiency", - "score": 0.397992962285092 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=667.789, mean=667.789, max=667.789, sum=1335.579 (2)", - "tab": "General information", - "score": 667.7894736842105 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43, - "details": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.86 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.3273779916763306 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=462.32, mean=462.32, max=462.32, sum=924.64 (2)", - "tab": "General information", - "score": 462.32 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.593 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.361, mean=0.361, max=0.361, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.3607365202020716 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 
- }, - "Jurisprudence - # prompt tokens": { - "description": "min=431.898, mean=431.898, max=431.898, sum=863.796 (2)", - "tab": "General information", - "score": 431.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.678, - "details": { - "description": "min=0.678, mean=0.678, max=0.678, sum=1.357 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.34667477807048047 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=356.723, mean=356.723, max=356.723, sum=713.447 (2)", - "tab": "General information", - "score": 356.7234726688103 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.668, - "details": { - "description": "min=0.668, mean=0.668, max=0.668, sum=1.337 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.538 (2)", - "tab": "Efficiency", - "score": 0.7688747907386106 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.74 (2)", - "tab": "Efficiency", - "score": 0.37016247857546974 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.131 (2)", - "tab": "Efficiency", - "score": 0.5655125939084467 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Efficiency", - "score": 0.33476316071803275 - }, - "Professional Medicine - # eval": { - 
"description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1202.533, mean=1202.533, max=1202.533, sum=2405.066 (2)", - "tab": "General information", - "score": 1202.5330882352941 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=771.16, mean=771.16, max=771.16, sum=1542.319 (2)", - "tab": "General information", - "score": 771.1595744680851 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1759.098, mean=1759.098, max=1759.098, sum=3518.197 (2)", - "tab": "General information", - "score": 1759.0984354628422 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=608.201, mean=608.201, max=608.201, sum=1216.402 (2)", - "tab": "General information", - "score": 608.2009803921569 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.38381587505340575 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=458.53, mean=458.53, max=458.53, sum=917.06 (2)", - "tab": "General information", - "score": 458.53 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.3511188610603935 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=626.895, mean=626.895, max=626.895, sum=1253.789 (2)", - "tab": "General information", - "score": 626.8947368421053 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.34 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.33533199548721315 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=616.97, mean=616.97, max=616.97, sum=1233.94 (2)", - "tab": "General information", - "score": 616.97 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.321 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.694 (2)", - "tab": "Efficiency", - "score": 0.34722964808625995 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=446.966, mean=446.966, max=446.966, sum=893.932 (2)", - "tab": "General information", - "score": 446.96603773584906 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.621, - "details": { - "description": "min=0.621, mean=0.621, max=0.621, sum=1.243 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3323540139705577 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=311.94, mean=311.94, max=311.94, sum=623.881 (2)", - "tab": "General information", - "score": 311.9404255319149 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - "description": "min=0.662, mean=0.662, max=0.662, sum=1.324 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.661 (2)", - "tab": "Efficiency", - "score": 0.33032174274839204 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=491.993, mean=491.993, max=491.993, sum=983.986 (2)", - "tab": "General information", - "score": 491.99310344827586 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - 
"description": "min=0.452, mean=0.452, max=0.452, sum=0.905 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.34218634310222806 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=601.344, mean=601.344, max=601.344, sum=1202.688 (2)", - "tab": "General information", - "score": 601.3439153439153 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.905 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.713 (2)", - "tab": "Efficiency", - "score": 0.3562947171075003 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=675.579, mean=675.579, max=675.579, sum=1351.159 (2)", - "tab": "General information", - "score": 675.5793650793651 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.57 (2)", - "tab": "Accuracy", - "High 
School Biology - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.685 (2)", - "tab": "Efficiency", - "score": 0.3425526588193832 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.667 (2)", - "tab": "Efficiency", - "score": 0.3337097426353417 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.822 (2)", - "tab": "Efficiency", - "score": 0.4111129188537598 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.912, mean=0.912, max=0.912, sum=1.824 (2)", - "tab": "Efficiency", - "score": 0.9120050358049797 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.781, mean=0.781, max=0.781, sum=1.563 (2)", - "tab": "Efficiency", - "score": 0.7814190243229722 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.688 (2)", - "tab": "Efficiency", - "score": 0.3440394698029355 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.3361299728735899 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.36511756932293926 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Efficiency", - "score": 0.3350923071388437 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.363, mean=0.363, max=0.363, sum=0.727 (2)", - "tab": "Efficiency", - "score": 0.3634012266500107 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.678 (2)", - "tab": "Efficiency", - "score": 0.3389187379714546 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.767 (2)", - "tab": "Efficiency", - "score": 0.38363339724364104 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.661, mean=0.661, max=0.661, sum=1.322 (2)", - "tab": "Efficiency", - "score": 0.6610236086097419 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.004 (2)", - "tab": "Efficiency", - "score": 0.5019015682397513 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=546.394, mean=546.394, max=546.394, sum=1092.787 (2)", - "tab": "General information", - "score": 546.3935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - 
"High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=537.015, mean=537.015, max=537.015, sum=1074.03 (2)", - "tab": "General information", - "score": 537.0147783251232 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=962.1, mean=962.1, max=962.1, sum=1924.2 (2)", - "tab": "General information", - "score": 962.1 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2957.412, mean=2957.412, max=2957.412, sum=5914.824 (2)", - "tab": "General information", - "score": 2957.4121212121213 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=404.035, mean=404.035, max=404.035, sum=808.071 (2)", - "tab": "General information", - "score": 404.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # 
prompt tokens": { - "description": "min=484.725, mean=484.725, max=484.725, sum=969.451 (2)", - "tab": "General information", - "score": 484.7253886010363 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=398.892, mean=398.892, max=398.892, sum=797.785 (2)", - "tab": "General information", - "score": 398.89230769230767 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=575.622, mean=575.622, max=575.622, sum=1151.244 (2)", - "tab": "General information", - "score": 575.6222222222223 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=420.739, mean=420.739, max=420.739, sum=841.479 (2)", - "tab": "General information", - "score": 420.73949579831935 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=599.411, mean=599.411, max=599.411, sum=1198.821 (2)", - "tab": "General information", - "score": 599.4105960264901 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - 
"description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=526.826, mean=526.826, max=526.826, sum=1053.651 (2)", - "tab": "General information", - "score": 526.8256880733945 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=869.778, mean=869.778, max=869.778, sum=1739.556 (2)", - "tab": "General information", - "score": 869.7777777777778 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2369.132, mean=2369.132, max=2369.132, sum=4738.265 (2)", - "tab": "General information", - "score": 2369.1323529411766 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1541.371, mean=1541.371, max=1541.371, sum=3082.743 (2)", - "tab": "General information", - "score": 1541.3713080168777 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763, - "details": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.527 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.691 (2)", - "tab": "Efficiency", - "score": 0.3457356803620343 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.3222540717088539 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=332.013, mean=332.013, max=332.013, sum=664.027 (2)", - "tab": "General information", - "score": 332.0134529147982 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=367.855, mean=367.855, max=367.855, sum=735.71 (2)", - "tab": "General information", - "score": 367.85496183206106 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.537 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.711 (2)", - "tab": "Efficiency", - "score": 0.35565017274588595 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=663.289, mean=663.289, max=663.289, sum=1326.579 (2)", - "tab": "General information", - "score": 663.2892561983471 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.703 (2)", - "tab": "Efficiency", - "score": 0.3515900117487995 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=466.595, mean=466.595, max=466.595, sum=933.19 (2)", - "tab": "General information", - "score": 466.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411, - "details": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.821 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.355, mean=0.355, max=0.355, sum=0.71 (2)", - "tab": "Efficiency", - "score": 0.35482590326241087 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 
- }, - "Machine Learning - # prompt tokens": { - "description": "min=720.161, mean=720.161, max=720.161, sum=1440.321 (2)", - "tab": "General information", - "score": 720.1607142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.612 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.33675998622931325 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=300.544, mean=300.544, max=300.544, sum=601.087 (2)", - "tab": "General information", - "score": 300.54368932038835 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.656 (2)", - "tab": "Efficiency", - "score": 0.3279143999784421 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=442.825, mean=442.825, max=442.825, sum=885.65 (2)", - "tab": "General information", - "score": 442.8247863247863 - }, - "Marketing - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.744 (2)", - "tab": "Efficiency", - "score": 0.3717941379547119 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=362, mean=362, max=362, sum=724 (2)", - "tab": "General information", - "score": 362.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.591 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.634 (2)", - "tab": "Efficiency", - "score": 0.31703713509619313 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=331.441, mean=331.441, max=331.441, sum=662.881 (2)", - "tab": "General information", - "score": 331.4406130268199 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - 
"method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.335, - "details": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.3214432848671268 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.3421009585844072 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=507.913, mean=507.913, max=507.913, sum=1015.827 (2)", - "tab": "General information", - "score": 507.91329479768785 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=703.334, mean=703.334, max=703.334, sum=1406.668 (2)", - "tab": "General information", - "score": 703.3340782122905 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.739, mean=0.739, max=0.739, sum=1.477 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.708 (2)", - "tab": "Efficiency", - "score": 0.35382014474058465 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 
(2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=643.317, mean=643.317, max=643.317, sum=1286.634 (2)", - "tab": "General information", - "score": 643.3169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.426 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.715 (2)", - "tab": "Efficiency", - "score": 0.3577412587625009 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=565.096, mean=565.096, max=565.096, sum=1130.191 (2)", - "tab": "General information", - "score": 565.0956790123457 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.436 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.704 (2)", - "tab": "Efficiency", - "score": 0.35222616412422875 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=432.436, mean=432.436, max=432.436, sum=864.873 (2)", - "tab": "General information", - "score": 432.43636363636364 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=1.469 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.877 (2)", - "tab": "Efficiency", - "score": 0.4387260553788166 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1227.196, mean=1227.196, max=1227.196, sum=2454.392 (2)", - "tab": "General information", - "score": 1227.1959183673468 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=1.662 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.31509182820865766 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - 
"description": "min=463.99, mean=463.99, max=463.99, sum=927.98 (2)", - "tab": "General information", - "score": 463.99004975124376 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.904 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.705 (2)", - "tab": "Efficiency", - "score": 0.3524869034089238 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=363.102, mean=363.102, max=363.102, sum=726.205 (2)", - "tab": "General information", - "score": 363.1024096385542 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.673 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.34344731576261467 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=289.971, mean=289.971, max=289.971, sum=579.942 (2)", - "tab": "General information", - "score": 289.97076023391816 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.651, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json b/data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json deleted file mode 100644 index 4838cda1c..000000000 --- a/data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/01-ai_yi-large-preview/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi Large Preview", - "id": "01-ai/yi-large-preview", - "developer": "01-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.36, mean=0.793, max=0.969, sum=90.428 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.621, mean=0.764, max=1.689, sum=87.08 (114)", - "tab": "Efficiency", - "score": 0.7638553584278898 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=302.971, mean=674.842, max=2970.412, sum=76931.942 (114)", - "tab": "General information", - "score": 674.8416008681387 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - 
"college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.436 (2)", - "tab": 
"Efficiency", - "score": 0.718058660030365 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=396.67, mean=396.67, max=396.67, sum=793.34 (2)", - "tab": "General information", - "score": 396.67 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.659 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.343 (2)", - "tab": "Efficiency", - "score": 0.6716545846727159 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=388.77, mean=388.77, max=388.77, sum=777.541 (2)", - "tab": "General information", - "score": 388.77037037037036 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.569, - "details": { - "description": "min=0.569, mean=0.569, max=0.569, sum=1.137 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.722, mean=0.722, max=0.722, sum=1.443 (2)", - "tab": "Efficiency", - "score": 0.721672637462616 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.439 (2)", - "tab": "Efficiency", - "score": 0.7195867978864245 - }, - "College 
Computer Science - Observed inference time (s)": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.657 (2)", - "tab": "Efficiency", - "score": 0.8283914875984192 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.734, mean=0.734, max=0.734, sum=1.468 (2)", - "tab": "Efficiency", - "score": 0.734215636253357 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.704, mean=0.704, max=0.704, sum=1.407 (2)", - "tab": "Efficiency", - "score": 0.7037480470073016 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.742, mean=0.742, max=0.742, sum=1.484 (2)", - "tab": "Efficiency", - "score": 0.7418750898510802 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=610.54, mean=610.54, max=610.54, sum=1221.08 (2)", - "tab": "General information", - "score": 610.54 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=527.819, mean=527.819, max=527.819, sum=1055.639 (2)", - "tab": "General information", - "score": 527.8194444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=896.06, mean=896.06, max=896.06, sum=1792.12 (2)", - "tab": "General information", - "score": 896.06 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=648.3, mean=648.3, max=648.3, sum=1296.6 (2)", - 
"tab": "General information", - "score": 648.3 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=562.688, mean=562.688, max=562.688, sum=1125.376 (2)", - "tab": "General information", - "score": 562.6878612716763 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=525.912, mean=525.912, max=525.912, sum=1051.824 (2)", - "tab": "General information", - "score": 525.9117647058823 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.679, mean=0.679, max=0.679, sum=1.358 (2)", - "tab": "Efficiency", - "score": 0.6791670727729797 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=418.74, mean=418.74, max=418.74, sum=837.48 (2)", - "tab": "General information", - "score": 418.74 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.728, mean=0.728, max=0.728, sum=1.456 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.504 (2)", - "tab": "Efficiency", - "score": 0.7519724473618624 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=680.789, mean=680.789, max=680.789, sum=1361.579 (2)", - "tab": "General information", - "score": 680.7894736842105 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.401 (2)", - "tab": "Efficiency", - "score": 0.7004458856582642 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=475.32, mean=475.32, max=475.32, sum=950.64 (2)", - "tab": "General information", - "score": 475.32 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.417 (2)", - "tab": "Efficiency", - "score": 0.7087078028255038 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=444.898, mean=444.898, max=444.898, sum=889.796 (2)", - "tab": "General information", - "score": 444.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.685 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.665, mean=0.665, max=0.665, sum=1.33 (2)", - "tab": "Efficiency", - "score": 0.6652177269435772 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=369.723, mean=369.723, max=369.723, sum=739.447 (2)", - "tab": "General information", - "score": 369.7234726688103 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.813 (2)", - "tab": "Efficiency", - "score": 0.9064707010984421 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.774, mean=0.774, max=0.774, sum=1.549 (2)", - "tab": "Efficiency", - "score": 0.7743352516323116 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.112, mean=1.112, max=1.112, sum=2.224 (2)", - "tab": "Efficiency", - "score": 1.1117667775732287 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.729, mean=0.729, max=0.729, sum=1.458 (2)", - "tab": "Efficiency", - "score": 0.7289925248794307 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1215.533, mean=1215.533, max=1215.533, sum=2431.066 (2)", - "tab": "General information", - "score": 1215.5330882352941 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=784.16, mean=784.16, max=784.16, sum=1568.319 (2)", - "tab": "General information", - "score": 784.1595744680851 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1772.098, mean=1772.098, max=1772.098, sum=3544.197 (2)", - "tab": "General information", - "score": 1772.0984354628422 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, 
max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=621.201, mean=621.201, max=621.201, sum=1242.402 (2)", - "tab": "General information", - "score": 621.2009803921569 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Efficiency", - "score": 0.6958462524414063 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=471.53, mean=471.53, max=471.53, sum=943.06 (2)", - "tab": "General information", - "score": 471.53 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.829 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.521 (2)", - "tab": "Efficiency", - "score": 0.7604575784582841 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=639.895, mean=639.895, max=639.895, sum=1279.789 (2)", - "tab": "General information", - "score": 639.8947368421053 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.463 (2)", - "tab": "Efficiency", - "score": 0.7314971995353698 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=629.97, mean=629.97, max=629.97, sum=1259.94 (2)", - "tab": "General information", - "score": 629.97 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.713 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.376 (2)", - "tab": "Efficiency", - "score": 0.6877818728392979 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=459.966, mean=459.966, max=459.966, 
sum=919.932 (2)", - "tab": "General information", - "score": 459.96603773584906 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.319 (2)", - "tab": "Efficiency", - "score": 0.6594150309867047 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=324.94, mean=324.94, max=324.94, sum=649.881 (2)", - "tab": "General information", - "score": 324.9404255319149 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.559 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.394 (2)", - "tab": "Efficiency", - "score": 0.6971425631950642 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=504.993, mean=504.993, max=504.993, sum=1009.986 (2)", - "tab": "General information", - 
"score": 504.99310344827586 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.685, - "details": { - "description": "min=0.685, mean=0.685, max=0.685, sum=1.37 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.715, mean=0.715, max=0.715, sum=1.43 (2)", - "tab": "Efficiency", - "score": 0.7149287146866006 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=614.344, mean=614.344, max=614.344, sum=1228.688 (2)", - "tab": "General information", - "score": 614.3439153439153 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603, - "details": { - "description": "min=0.603, mean=0.603, max=0.603, sum=1.206 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.761, mean=0.761, max=0.761, sum=1.522 (2)", - "tab": "Efficiency", - "score": 0.7611211935679117 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=688.579, mean=688.579, max=688.579, sum=1377.159 (2)", - "tab": "General information", - "score": 688.5793650793651 - }, - "Formal Logic - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.857 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.739, mean=0.739, max=0.739, sum=1.478 (2)", - "tab": "Efficiency", - "score": 0.7389615043517082 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.454 (2)", - "tab": "Efficiency", - "score": 0.7272039317145136 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Efficiency", - "score": 0.8772388291358948 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.689, mean=1.689, max=1.689, sum=3.378 (2)", - "tab": "Efficiency", - "score": 1.6891969362894694 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.725, mean=0.725, max=0.725, sum=1.451 (2)", - "tab": "Efficiency", - "score": 0.7252739162156077 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.387 (2)", - "tab": "Efficiency", - "score": 0.6934328054517044 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.367 (2)", - "tab": "Efficiency", - "score": 0.6835794656704633 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.716, mean=0.716, max=0.716, sum=1.432 (2)", - "tab": "Efficiency", - "score": 0.7162466013873064 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.422 (2)", - "tab": "Efficiency", - "score": 0.7111842982909259 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.481 (2)", - "tab": "Efficiency", - "score": 0.7403108505223761 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Efficiency", - "score": 0.7000295271567248 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.666 (2)", - "tab": "Efficiency", - "score": 0.8330503514519444 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.349, mean=1.349, max=1.349, sum=2.698 (2)", - "tab": "Efficiency", - "score": 1.3490371108055115 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.047, mean=1.047, max=1.047, sum=2.093 (2)", - "tab": "Efficiency", - "score": 1.046591958919155 - }, - "High School Biology - # eval": { - 
"description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=559.394, mean=559.394, max=559.394, sum=1118.787 (2)", - "tab": "General information", - "score": 559.3935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=550.015, mean=550.015, max=550.015, sum=1100.03 (2)", - "tab": "General information", - "score": 550.0147783251232 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=975.1, mean=975.1, max=975.1, sum=1950.2 (2)", - "tab": "General information", - "score": 975.1 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2970.412, mean=2970.412, max=2970.412, sum=5940.824 (2)", - "tab": "General information", - "score": 2970.4121212121213 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", 
- "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=417.035, mean=417.035, max=417.035, sum=834.071 (2)", - "tab": "General information", - "score": 417.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=497.725, mean=497.725, max=497.725, sum=995.451 (2)", - "tab": "General information", - "score": 497.7253886010363 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=411.892, mean=411.892, max=411.892, sum=823.785 (2)", - "tab": "General information", - "score": 411.89230769230767 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=588.622, mean=588.622, max=588.622, sum=1177.244 (2)", - "tab": "General information", - "score": 588.6222222222223 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=433.739, mean=433.739, max=433.739, sum=867.479 (2)", - "tab": "General information", - "score": 433.73949579831935 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=612.411, mean=612.411, max=612.411, sum=1224.821 (2)", - "tab": "General information", - "score": 612.4105960264901 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=539.826, mean=539.826, max=539.826, sum=1079.651 (2)", - "tab": "General information", - "score": 539.8256880733945 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=882.778, mean=882.778, max=882.778, sum=1765.556 (2)", - "tab": "General information", - "score": 882.7777777777778 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2382.132, mean=2382.132, max=2382.132, sum=4764.265 (2)", - "tab": "General information", - "score": 2382.1323529411766 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1554.371, mean=1554.371, max=1554.371, sum=3108.743 (2)", - "tab": "General information", - "score": 1554.3713080168777 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Efficiency", - "score": 0.6601343742935112 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.704, mean=0.704, max=0.704, sum=1.409 (2)", - "tab": "Efficiency", - "score": 0.7043184669873187 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=345.013, mean=345.013, max=345.013, sum=690.027 (2)", - "tab": "General information", - "score": 345.0134529147982 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=380.855, mean=380.855, max=380.855, sum=761.71 (2)", - "tab": "General information", - "score": 380.85496183206106 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.538 (2)", - "tab": "Efficiency", - "score": 0.7691502098209602 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=676.289, mean=676.289, max=676.289, sum=1352.579 (2)", - "tab": "General information", - "score": 676.2892561983471 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.367 (2)", - "tab": "Efficiency", - "score": 0.6835026492370418 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=479.595, mean=479.595, max=479.595, sum=959.19 (2)", - "tab": "General information", - "score": 479.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - "details": { - "description": "min=0.616, mean=0.616, max=0.616, sum=1.232 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.489 (2)", - "tab": "Efficiency", - "score": 0.7447149263960975 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=733.161, mean=733.161, max=733.161, sum=1466.321 (2)", - "tab": "General information", - "score": 733.1607142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.621, mean=0.621, max=0.621, sum=1.243 (2)", - "tab": "Efficiency", - "score": 0.6213390433672562 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=313.544, mean=313.544, max=313.544, sum=627.087 (2)", - "tab": "General information", - "score": 313.54368932038835 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.927, mean=0.927, max=0.927, sum=1.855 (2)", - 
"tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.679, mean=0.679, max=0.679, sum=1.357 (2)", - "tab": "Efficiency", - "score": 0.6785362948719252 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=455.825, mean=455.825, max=455.825, sum=911.65 (2)", - "tab": "General information", - "score": 455.8247863247863 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.379 (2)", - "tab": "Efficiency", - "score": 0.6893473124504089 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=375, mean=375, max=375, sum=750 (2)", - "tab": "General information", - "score": 375.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.916, - "details": { - "description": "min=0.916, mean=0.916, max=0.916, sum=1.831 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.633, mean=0.633, max=0.633, sum=1.266 (2)", - "tab": "Efficiency", - "score": 0.6329697509073815 - }, - "Miscellaneous - # 
eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=344.441, mean=344.441, max=344.441, sum=688.881 (2)", - "tab": "General information", - "score": 344.4406130268199 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=1.663 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.703, mean=0.703, max=0.703, sum=1.406 (2)", - "tab": "Efficiency", - "score": 0.7028186107646524 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.754, mean=0.754, max=0.754, sum=1.509 (2)", - "tab": "Efficiency", - "score": 0.7543408100831442 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=520.913, mean=520.913, max=520.913, sum=1041.827 (2)", - "tab": "General information", - "score": 520.9132947976879 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=716.334, mean=716.334, max=716.334, sum=1432.668 (2)", - "tab": "General information", - "score": 716.3340782122905 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - 
"evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.846, - "details": { - "description": "min=0.846, mean=0.846, max=0.846, sum=1.693 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.721, mean=0.721, max=0.721, sum=1.442 (2)", - "tab": "Efficiency", - "score": 0.7212473138485079 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=656.317, mean=656.317, max=656.317, sum=1312.634 (2)", - "tab": "General information", - "score": 656.3169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.892, - "details": { - "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.423 (2)", - "tab": "Efficiency", - "score": 0.7115242841802998 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=578.096, mean=578.096, max=578.096, sum=1156.191 (2)", - "tab": "General information", - "score": 578.0956790123457 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827, - "details": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.655 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.708, mean=0.708, max=0.708, sum=1.417 (2)", - "tab": "Efficiency", - "score": 0.708361968127164 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=445.436, mean=445.436, max=445.436, sum=890.873 (2)", - "tab": "General information", - "score": 445.43636363636364 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.641 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Efficiency", - "score": 0.9198286231683225 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1240.196, mean=1240.196, max=1240.196, sum=2480.392 (2)", - "tab": "General information", - "score": 1240.1959183673468 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.881, - "details": { - "description": "min=0.881, mean=0.881, max=0.881, sum=1.761 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.421 (2)", - "tab": "Efficiency", - "score": 0.7103830344641386 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=476.99, mean=476.99, max=476.99, sum=953.98 (2)", - "tab": "General information", - "score": 476.99004975124376 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.181 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.677, mean=0.677, max=0.677, sum=1.354 (2)", - "tab": "Efficiency", - "score": 0.6768132835985666 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=376.102, mean=376.102, max=376.102, sum=752.205 (2)", - "tab": "General information", - "score": 376.1024096385542 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.645, 
mean=0.645, max=0.645, sum=1.289 (2)", - "tab": "Efficiency", - "score": 0.644616849241201 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=302.971, mean=302.971, max=302.971, sum=605.942 (2)", - "tab": "General information", - "score": 302.97076023391816 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.258, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -}
\ No newline at end of file
diff --git a/data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json b/data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json
deleted file mode 100644
index 45536e1a1..000000000
--- a/data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/ai21_jamba-1.5-large/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jamba 1.5 Large", - "id": "ai21/jamba-1.5-large", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.46, mean=0.782, max=0.969, sum=89.128 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.889, mean=1.01, max=1.394, sum=115.088 (114)", - "tab": "Efficiency", - "score": 1.0095401397461812 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)",
- "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=293.649, mean=658.432, max=2900.673, sum=75061.271 (114)", - "tab": "General information", - "score": 658.4322049384847 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - 
"mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.933, mean=0.933, max=0.933, sum=1.865 (2)", - "tab": "Efficiency", - "score": 0.9326767182350159 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.58, mean=397.58, max=397.58, sum=795.16 (2)", - "tab": "General information", - "score": 397.58 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.585 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.777 (2)", - "tab": "Efficiency", - "score": 0.8885634528266059 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=376.741, mean=376.741, max=376.741, sum=753.481 (2)", - "tab": "General information", - "score": 376.74074074074076 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.971, mean=0.971, max=0.971, sum=1.942 (2)", - "tab": "Efficiency", - "score": 0.9710254788398742 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.968, mean=0.968, max=0.968, sum=1.936 (2)", - "tab": "Efficiency", - "score": 0.968123722407553 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.986, mean=0.986, max=0.986, sum=1.973 (2)", - "tab": "Efficiency", - "score": 0.9862666988372802 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Efficiency", - "score": 0.9599522399902344 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.978, mean=0.978, max=0.978, sum=1.957 (2)", - "tab": "Efficiency", - "score": 0.9782800839815525 - }, - "College Physics - Observed inference time (s)": { - "description": "min=1.01, mean=1.01, max=1.01, sum=2.019 (2)", - "tab": "Efficiency", - "score": 1.0095638387343462 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=598.67, mean=598.67, max=598.67, sum=1197.34 (2)", - "tab": "General information", - "score": 598.67 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=507.306, mean=507.306, max=507.306, sum=1014.611 (2)", - "tab": "General information", - "score": 507.30555555555554 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=883.21, 
mean=883.21, max=883.21, sum=1766.42 (2)", - "tab": "General information", - "score": 883.21 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=643.97, mean=643.97, max=643.97, sum=1287.94 (2)", - "tab": "General information", - "score": 643.97 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=543.347, mean=543.347, max=543.347, sum=1086.694 (2)", - "tab": "General information", - "score": 543.3468208092486 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=533.402, mean=533.402, max=533.402, sum=1066.804 (2)", - "tab": "General information", - "score": 533.4019607843137 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.0, mean=1.0, max=1.0, sum=2.0 (2)", - "tab": "Efficiency", - "score": 1.000160608291626 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - 
"tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=404.27, mean=404.27, max=404.27, sum=808.54 (2)", - "tab": "General information", - "score": 404.27 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.228 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.971, mean=0.971, max=0.971, sum=1.942 (2)", - "tab": "Efficiency", - "score": 0.9712212587657728 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=678.64, mean=678.64, max=678.64, sum=1357.281 (2)", - "tab": "General information", - "score": 678.640350877193 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54, - "details": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.08 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.951, mean=0.951, max=0.951, sum=1.901 (2)", - "tab": "Efficiency", - "score": 0.9506172919273377 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global 
Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=466.9, mean=466.9, max=466.9, sum=933.8 (2)", - "tab": "General information", - "score": 466.9 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.929, mean=0.929, max=0.929, sum=1.858 (2)", - "tab": "Efficiency", - "score": 0.9292316171858046 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=427.185, mean=427.185, max=427.185, sum=854.37 (2)", - "tab": "General information", - "score": 427.18518518518516 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.848 (2)", - "tab": "Efficiency", - "score": 0.9240530403480652 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=359.441, mean=359.441, max=359.441, sum=718.881 
(2)", - "tab": "General information", - "score": 359.4405144694534 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.683 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.025, mean=1.025, max=1.025, sum=2.05 (2)", - "tab": "Efficiency", - "score": 1.0251652388011707 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=1.907 (2)", - "tab": "Efficiency", - "score": 0.9537228667144234 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.039, mean=1.039, max=1.039, sum=2.078 (2)", - "tab": "Efficiency", - "score": 1.0390360032097767 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.959, mean=0.959, max=0.959, sum=1.918 (2)", - "tab": "Efficiency", - "score": 0.9592212933340883 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1170.393, mean=1170.393, max=1170.393, sum=2340.787 (2)", - "tab": "General information", - "score": 1170.3933823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=770.316, mean=770.316, max=770.316, sum=1540.631 (2)", - "tab": "General information", - "score": 770.3156028368794 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1725.955, mean=1725.955, max=1725.955, sum=3451.91 (2)", - "tab": "General information", - "score": 1725.9550195567144 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=611.645, mean=611.645, max=611.645, sum=1223.291 (2)", - "tab": "General information", - "score": 611.6454248366013 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.991, mean=0.991, max=0.991, sum=1.982 (2)", - "tab": "Efficiency", - "score": 0.9911877512931824 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=461.53, mean=461.53, max=461.53, sum=923.06 (2)", - "tab": "General information", - "score": 461.53 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.763 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.975, mean=0.975, max=0.975, sum=1.95 (2)", - "tab": "Efficiency", - "score": 0.9748745105768505 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=632.947, mean=632.947, max=632.947, sum=1265.895 (2)", - "tab": "General information", - "score": 632.9473684210526 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.963, mean=0.963, max=0.963, sum=1.926 (2)", - "tab": "Efficiency", - "score": 0.9630230093002319 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=591.96, mean=591.96, max=591.96, sum=1183.92 (2)", - "tab": "General information", - "score": 591.96 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, 
sum=1.698 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.937, mean=0.937, max=0.937, sum=1.874 (2)", - "tab": "Efficiency", - "score": 0.9370616642933971 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=437.34, mean=437.34, max=437.34, sum=874.679 (2)", - "tab": "General information", - "score": 437.33962264150944 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.557 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.795 (2)", - "tab": "Efficiency", - "score": 0.8976521999277967 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=322.962, mean=322.962, max=322.962, sum=645.923 (2)", - "tab": "General information", - "score": 322.9617021276596 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.586 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed 
inference time (s)": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Efficiency", - "score": 0.9001944936555007 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=494.662, mean=494.662, max=494.662, sum=989.324 (2)", - "tab": "General information", - "score": 494.6620689655172 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.312 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.976, mean=0.976, max=0.976, sum=1.951 (2)", - "tab": "Efficiency", - "score": 0.9756249517360062 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=607.042, mean=607.042, max=607.042, sum=1214.085 (2)", - "tab": "General information", - "score": 607.042328042328 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.619, - "details": { - "description": "min=0.619, mean=0.619, max=0.619, sum=1.238 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": 
"min=0.973, mean=0.973, max=0.973, sum=1.947 (2)", - "tab": "Efficiency", - "score": 0.9733156949754745 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=656.468, mean=656.468, max=656.468, sum=1312.937 (2)", - "tab": "General information", - "score": 656.468253968254 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.823 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.953, mean=0.953, max=0.953, sum=1.906 (2)", - "tab": "Efficiency", - "score": 0.9529511121011549 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.955, mean=0.955, max=0.955, sum=1.911 (2)", - "tab": "Efficiency", - "score": 0.955410502814307 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.978, mean=0.978, max=0.978, sum=1.957 (2)", - "tab": "Efficiency", - "score": 0.9784861493110657 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.394, mean=1.394, max=1.394, sum=2.789 (2)", - "tab": "Efficiency", - "score": 1.394392929655133 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=1.119, mean=1.119, max=1.119, sum=2.238 (2)", - "tab": "Efficiency", - "score": 1.1188469896412858 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=1.151, mean=1.151, max=1.151, sum=2.302 (2)", - "tab": "Efficiency", - "score": 1.1508279983243794 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.015, mean=1.015, max=1.015, sum=2.03 (2)", - "tab": "Efficiency", - "score": 1.014756965637207 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=1.115, mean=1.115, max=1.115, sum=2.229 (2)", - "tab": "Efficiency", - "score": 1.1145719607671103 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=1.094, mean=1.094, max=1.094, sum=2.189 (2)", - "tab": "Efficiency", - "score": 1.094437322696718 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=1.117, mean=1.117, max=1.117, sum=2.235 (2)", - "tab": "Efficiency", - "score": 1.1174537361852381 - }, - "High School Psychology - Observed 
inference time (s)": { - "description": "min=1.026, mean=1.026, max=1.026, sum=2.051 (2)", - "tab": "Efficiency", - "score": 1.025726358606181 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=1.119, mean=1.119, max=1.119, sum=2.238 (2)", - "tab": "Efficiency", - "score": 1.1191309756702847 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.362, mean=1.362, max=1.362, sum=2.724 (2)", - "tab": "Efficiency", - "score": 1.3617976483176737 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.138, mean=1.138, max=1.138, sum=2.275 (2)", - "tab": "Efficiency", - "score": 1.1377391141175217 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=532.455, mean=532.455, max=532.455, sum=1064.91 (2)", - "tab": "General information", - "score": 532.4548387096775 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=537.089, mean=537.089, max=537.089, sum=1074.177 (2)", - "tab": "General information", - "score": 537.0886699507389 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=958.39, mean=958.39, max=958.39, sum=1916.78 (2)", - "tab": "General information", - "score": 958.39 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European 
History - # prompt tokens": { - "description": "min=2900.673, mean=2900.673, max=2900.673, sum=5801.345 (2)", - "tab": "General information", - "score": 2900.672727272727 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=406.146, mean=406.146, max=406.146, sum=812.293 (2)", - "tab": "General information", - "score": 406.14646464646466 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=492.788, mean=492.788, max=492.788, sum=985.575 (2)", - "tab": "General information", - "score": 492.78756476683935 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=406.1, mean=406.1, max=406.1, sum=812.2 (2)", - "tab": "General information", - "score": 406.1 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=583.248, mean=583.248, max=583.248, sum=1166.496 (2)", - "tab": "General information", - "score": 583.2481481481482 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High 
School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=426.265, mean=426.265, max=426.265, sum=852.529 (2)", - "tab": "General information", - "score": 426.2647058823529 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=603.272, mean=603.272, max=603.272, sum=1206.543 (2)", - "tab": "General information", - "score": 603.2715231788079 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=525.635, mean=525.635, max=525.635, sum=1051.27 (2)", - "tab": "General information", - "score": 525.6348623853211 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=876.032, mean=876.032, max=876.032, sum=1752.065 (2)", - "tab": "General information", - "score": 876.0324074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2310.931, mean=2310.931, max=2310.931, sum=4621.863 (2)", - "tab": "General information", - "score": 2310.9313725490197 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1501.477, mean=1501.477, max=1501.477, sum=3002.954 (2)", - "tab": "General information", - "score": 1501.4767932489451 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.832, - "details": { - "description": "min=0.832, mean=0.832, max=0.832, sum=1.664 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=1.018, mean=1.018, max=1.018, sum=2.036 (2)", - "tab": "Efficiency", - "score": 1.0177636157236827 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=1.059, mean=1.059, max=1.059, sum=2.118 (2)", - "tab": "Efficiency", - "score": 1.0589779351503794 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=333.036, mean=333.036, max=333.036, sum=666.072 (2)", - "tab": "General information", - "score": 333.0358744394619 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": 
"min=362.466, mean=362.466, max=362.466, sum=724.931 (2)", - "tab": "General information", - "score": 362.46564885496184 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.769 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=1.098, mean=1.098, max=1.098, sum=2.197 (2)", - "tab": "Efficiency", - "score": 1.098483010757068 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=662.628, mean=662.628, max=662.628, sum=1325.256 (2)", - "tab": "General information", - "score": 662.6280991735537 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.718 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=1.023, mean=1.023, max=1.023, sum=2.046 (2)", - "tab": "Efficiency", - "score": 1.0228094908357397 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=466.227, mean=466.227, max=466.227, sum=932.454 (2)", - "tab": "General information", - "score": 
466.2269938650307 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688, - "details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.375 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.124, mean=1.124, max=1.124, sum=2.247 (2)", - "tab": "Efficiency", - "score": 1.123652777501515 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=719.938, mean=719.938, max=719.938, sum=1439.875 (2)", - "tab": "General information", - "score": 719.9375 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=1.033, mean=1.033, max=1.033, sum=2.067 (2)", - "tab": "Efficiency", - "score": 1.0334750402320936 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=299.553, mean=299.553, max=299.553, sum=599.107 (2)", - "tab": "General information", - "score": 299.5533980582524 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=1.097, mean=1.097, max=1.097, sum=2.194 (2)", - "tab": "Efficiency", - "score": 1.0967916657782009 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=446.714, mean=446.714, max=446.714, sum=893.427 (2)", - "tab": "General information", - "score": 446.71367521367523 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=1.101, mean=1.101, max=1.101, sum=2.201 (2)", - "tab": "Efficiency", - "score": 1.1006885027885438 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=361.45, mean=361.45, max=361.45, sum=722.9 (2)", - "tab": "General information", - "score": 361.45 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", 
- "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.931, - "details": { - "description": "min=0.931, mean=0.931, max=0.931, sum=1.862 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.813 (2)", - "tab": "Efficiency", - "score": 0.9063281955085647 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=332.257, mean=332.257, max=332.257, sum=664.513 (2)", - "tab": "General information", - "score": 332.2567049808429 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.372 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.946, mean=0.946, max=0.946, sum=1.892 (2)", - "tab": "Efficiency", - "score": 0.9461793238027937 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Efficiency", - "score": 0.9602039808667572 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=506.514, mean=506.514, max=506.514, sum=1013.029 (2)", - "tab": "General information", - "score": 506.514450867052 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=709.934, mean=709.934, max=709.934, sum=1419.868 (2)", - "tab": "General information", - "score": 709.9340782122905 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=1.739 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.947, mean=0.947, max=0.947, sum=1.894 (2)", - "tab": "Efficiency", - "score": 0.9469306157305349 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=619.683, mean=619.683, max=619.683, sum=1239.366 (2)", - "tab": "General information", - "score": 619.6830065359477 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.892, - "details": { - "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.956, mean=0.956, max=0.956, sum=1.912 (2)", - "tab": "Efficiency", - "score": 0.9560920861032274 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - 
"description": "min=566.244, mean=566.244, max=566.244, sum=1132.488 (2)", - "tab": "General information", - "score": 566.2438271604939 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.944, mean=0.944, max=0.944, sum=1.887 (2)", - "tab": "Efficiency", - "score": 0.9436206535859541 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=440.6, mean=440.6, max=440.6, sum=881.2 (2)", - "tab": "General information", - "score": 440.6 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.771, - "details": { - "description": "min=0.771, mean=0.771, max=0.771, sum=1.543 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.988, mean=0.988, max=0.988, sum=1.976 (2)", - "tab": "Efficiency", - "score": 0.9880037901352863 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1221.388, mean=1221.388, max=1221.388, sum=2442.776 (2)", - "tab": "General information", - "score": 1221.3877551020407 - }, - "Security 
Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.861 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.947, mean=0.947, max=0.947, sum=1.894 (2)", - "tab": "Efficiency", - "score": 0.9468028070914805 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=465.925, mean=465.925, max=465.925, sum=931.851 (2)", - "tab": "General information", - "score": 465.92537313432837 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.108 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.803 (2)", - "tab": "Efficiency", - "score": 0.9013677418950092 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=358.048, mean=358.048, max=358.048, sum=716.096 (2)", - "tab": "General information", - "score": 358.04819277108436 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.799 (2)", - "tab": "Efficiency", - "score": 0.8992712400112933 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=293.649, mean=293.649, max=293.649, sum=587.298 (2)", - "tab": "General information", - "score": 293.64912280701753 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.147, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json b/data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json deleted file mode 100644 index 727c60261..000000000 --- a/data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/ai21_jamba-1.5-mini/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jamba 1.5 Mini", - "id": "ai21/jamba-1.5-mini", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM 
on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.699, - "details": { - "description": "min=0.269, mean=0.699, max=0.943, sum=79.696 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.78, mean=0.859, max=1.024, sum=97.957 (114)", - "tab": "Efficiency", - "score": 0.8592709427634447 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=293.649, mean=658.432, max=2900.673, sum=75061.271 (114)", - "tab": "General information", - "score": 658.4322049384847 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - 
"mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.783, mean=0.783, max=0.783, sum=1.566 (2)", - "tab": "Efficiency", - "score": 0.783083221912384 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.58, mean=397.58, max=397.58, sum=795.16 (2)", - "tab": "General information", - "score": 397.58 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.422 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.832, mean=0.832, max=0.832, sum=1.664 (2)", - "tab": "Efficiency", - "score": 0.8321040700983118 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=376.741, mean=376.741, max=376.741, sum=753.481 (2)", - "tab": "General information", - "score": 376.74074074074076 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.961 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.615 (2)", - "tab": "Efficiency", - "score": 0.8074449944496155 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.643 (2)", - "tab": "Efficiency", - "score": 0.8214208516809676 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Efficiency", - "score": 0.8334288668632507 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Efficiency", - "score": 0.8399906301498413 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.831, mean=0.831, max=0.831, sum=1.662 (2)", - "tab": "Efficiency", - "score": 0.8312392317490771 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.658 (2)", - "tab": "Efficiency", - "score": 0.8287959309185252 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=598.67, mean=598.67, max=598.67, sum=1197.34 (2)", - "tab": "General information", - "score": 598.67 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=507.306, mean=507.306, max=507.306, sum=1014.611 (2)", - "tab": "General information", - 
"score": 507.30555555555554 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=883.21, mean=883.21, max=883.21, sum=1766.42 (2)", - "tab": "General information", - "score": 883.21 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=643.97, mean=643.97, max=643.97, sum=1287.94 (2)", - "tab": "General information", - "score": 643.97 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=543.347, mean=543.347, max=543.347, sum=1086.694 (2)", - "tab": "General information", - "score": 543.3468208092486 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=533.402, mean=533.402, max=533.402, sum=1066.804 (2)", - "tab": "General information", - "score": 533.4019607843137 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": 
[ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.801, mean=0.801, max=0.801, sum=1.602 (2)", - "tab": "Efficiency", - "score": 0.8010901069641113 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=404.27, mean=404.27, max=404.27, sum=808.54 (2)", - "tab": "General information", - "score": 404.27 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.491, - "details": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.982 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.661 (2)", - "tab": "Efficiency", - "score": 0.8303811194603903 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=678.64, mean=678.64, max=678.64, sum=1357.281 (2)", - "tab": "General information", - "score": 678.640350877193 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43, - "details": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.86 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.694 (2)", - "tab": "Efficiency", - "score": 0.8467721128463745 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=466.9, mean=466.9, max=466.9, sum=933.8 (2)", - "tab": "General information", - "score": 466.9 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.619 (2)", - "tab": "Efficiency", - "score": 0.8092732672338132 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=427.185, mean=427.185, max=427.185, sum=854.37 (2)", - "tab": "General information", - "score": 427.18518518518516 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, 
sum=1.505 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.815, mean=0.815, max=0.815, sum=1.629 (2)", - "tab": "Efficiency", - "score": 0.8147224314343124 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=359.441, mean=359.441, max=359.441, sum=718.881 (2)", - "tab": "General information", - "score": 359.4405144694534 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.832, mean=0.832, max=0.832, sum=1.663 (2)", - "tab": "Efficiency", - "score": 0.8315524055677301 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.606 (2)", - "tab": "Efficiency", - "score": 0.8028552659014438 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.671 (2)", - "tab": "Efficiency", - "score": 0.8356168884031154 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.624 (2)", - "tab": "Efficiency", - "score": 0.811913901684331 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1170.393, mean=1170.393, max=1170.393, sum=2340.787 (2)", - "tab": "General information", - "score": 1170.3933823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional 
Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=770.316, mean=770.316, max=770.316, sum=1540.631 (2)", - "tab": "General information", - "score": 770.3156028368794 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1725.955, mean=1725.955, max=1725.955, sum=3451.91 (2)", - "tab": "General information", - "score": 1725.9550195567144 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=611.645, mean=611.645, max=611.645, sum=1223.291 (2)", - "tab": "General information", - "score": 611.6454248366013 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.654 (2)", - "tab": "Efficiency", - "score": 0.8269450402259827 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=461.53, mean=461.53, max=461.53, sum=923.06 (2)", - "tab": 
"General information", - "score": 461.53 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.645 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.811, mean=0.811, max=0.811, sum=1.622 (2)", - "tab": "Efficiency", - "score": 0.8109481099404787 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=632.947, mean=632.947, max=632.947, sum=1265.895 (2)", - "tab": "General information", - "score": 632.9473684210526 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.607 (2)", - "tab": "Efficiency", - "score": 0.8034474205970764 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=591.96, mean=591.96, max=591.96, sum=1183.92 (2)", - "tab": "General information", - "score": 591.96 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.479 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.641 (2)", - "tab": "Efficiency", - "score": 0.8206060139638073 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=437.34, mean=437.34, max=437.34, sum=874.679 (2)", - "tab": "General information", - "score": 437.33962264150944 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "description": "min=0.677, mean=0.677, max=0.677, sum=1.353 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.577 (2)", - "tab": "Efficiency", - "score": 0.7882616854728537 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=322.962, mean=322.962, max=322.962, sum=645.923 (2)", - "tab": "General information", - "score": 322.9617021276596 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.683, mean=0.683, max=0.683, sum=1.366 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Efficiency", - "score": 0.800032663345337 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=494.662, mean=494.662, max=494.662, sum=989.324 (2)", - "tab": "General information", - "score": 494.6620689655172 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553, - "details": { - "description": "min=0.553, mean=0.553, max=0.553, sum=1.106 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.619 (2)", - "tab": "Efficiency", - "score": 0.8097125253980122 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=607.042, mean=607.042, max=607.042, sum=1214.085 (2)", - "tab": "General information", - "score": 607.042328042328 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.905 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.641 (2)", - "tab": "Efficiency", - "score": 0.8205922804181538 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=656.468, mean=656.468, max=656.468, sum=1312.937 (2)", - "tab": "General information", - "score": 656.468253968254 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.604 (2)", - "tab": "Efficiency", - "score": 0.8022162606639247 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.572 (2)", - "tab": "Efficiency", - "score": 0.7860349763203137 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Efficiency", - "score": 0.7999507975578308 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.782 (2)", - "tab": "Efficiency", - "score": 0.8912014065366802 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.943, mean=0.943, max=0.943, sum=1.887 (2)", - "tab": "Efficiency", - "score": 0.9434030766438957 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.989, mean=0.989, max=0.989, sum=1.977 (2)", - "tab": "Efficiency", - "score": 0.9887206962071552 - }, - "High School Macroeconomics - Observed inference time 
(s)": { - "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", - "tab": "Efficiency", - "score": 0.9210334313221467 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.977, mean=0.977, max=0.977, sum=1.953 (2)", - "tab": "Efficiency", - "score": 0.976661871097706 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.828 (2)", - "tab": "Efficiency", - "score": 0.9139112444484935 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.933, mean=0.933, max=0.933, sum=1.866 (2)", - "tab": "Efficiency", - "score": 0.9328556392366523 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.915, mean=0.915, max=0.915, sum=1.83 (2)", - "tab": "Efficiency", - "score": 0.9148573503581756 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.956, mean=0.956, max=0.956, sum=1.912 (2)", - "tab": "Efficiency", - "score": 0.95619613704858 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.98, mean=0.98, max=0.98, sum=1.959 (2)", - "tab": "Efficiency", - "score": 0.9797390874694375 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.996, mean=0.996, max=0.996, sum=1.991 (2)", - "tab": "Efficiency", - "score": 0.9955862363179525 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=532.455, mean=532.455, max=532.455, sum=1064.91 (2)", - "tab": "General information", - "score": 532.4548387096775 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=537.089, mean=537.089, max=537.089, sum=1074.177 (2)", - "tab": "General information", - "score": 537.0886699507389 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=958.39, mean=958.39, 
max=958.39, sum=1916.78 (2)", - "tab": "General information", - "score": 958.39 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2900.673, mean=2900.673, max=2900.673, sum=5801.345 (2)", - "tab": "General information", - "score": 2900.672727272727 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=406.146, mean=406.146, max=406.146, sum=812.293 (2)", - "tab": "General information", - "score": 406.14646464646466 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=492.788, mean=492.788, max=492.788, sum=985.575 (2)", - "tab": "General information", - "score": 492.78756476683935 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=406.1, mean=406.1, max=406.1, sum=812.2 (2)", - "tab": "General information", - "score": 406.1 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, 
mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=583.248, mean=583.248, max=583.248, sum=1166.496 (2)", - "tab": "General information", - "score": 583.2481481481482 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=426.265, mean=426.265, max=426.265, sum=852.529 (2)", - "tab": "General information", - "score": 426.2647058823529 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=603.272, mean=603.272, max=603.272, sum=1206.543 (2)", - "tab": "General information", - "score": 603.2715231788079 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=525.635, mean=525.635, max=525.635, sum=1051.27 (2)", - "tab": "General information", - "score": 525.6348623853211 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School 
Statistics - # prompt tokens": { - "description": "min=876.032, mean=876.032, max=876.032, sum=1752.065 (2)", - "tab": "General information", - "score": 876.0324074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2310.931, mean=2310.931, max=2310.931, sum=4621.863 (2)", - "tab": "General information", - "score": 2310.9313725490197 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1501.477, mean=1501.477, max=1501.477, sum=3002.954 (2)", - "tab": "General information", - "score": 1501.4767932489451 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Efficiency", - "score": 0.889766787199696 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.856 (2)", - "tab": "Efficiency", - "score": 0.9282377730799085 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=333.036, 
mean=333.036, max=333.036, sum=666.072 (2)", - "tab": "General information", - "score": 333.0358744394619 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=362.466, mean=362.466, max=362.466, sum=724.931 (2)", - "tab": "General information", - "score": 362.46564885496184 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.785 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.938, mean=0.938, max=0.938, sum=1.875 (2)", - "tab": "Efficiency", - "score": 0.9376649265446939 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=662.628, mean=662.628, max=662.628, sum=1325.256 (2)", - "tab": "General information", - "score": 662.6280991735537 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": 
{ - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Efficiency", - "score": 0.9101676209572634 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=466.227, mean=466.227, max=466.227, sum=932.454 (2)", - "tab": "General information", - "score": 466.2269938650307 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.018 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.936, mean=0.936, max=0.936, sum=1.873 (2)", - "tab": "Efficiency", - "score": 0.9363672009536198 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=719.938, mean=719.938, max=719.938, sum=1439.875 (2)", - "tab": "General information", - "score": 719.9375 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.65 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=1.024, mean=1.024, max=1.024, sum=2.049 (2)", - "tab": "Efficiency", - "score": 1.0244285111288423 - }, - "Management - # 
eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=299.553, mean=299.553, max=299.553, sum=599.107 (2)", - "tab": "General information", - "score": 299.5533980582524 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "details": { - "description": "min=0.915, mean=0.915, max=0.915, sum=1.829 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.967, mean=0.967, max=0.967, sum=1.934 (2)", - "tab": "Efficiency", - "score": 0.9670558464832795 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=446.714, mean=446.714, max=446.714, sum=893.427 (2)", - "tab": "General information", - "score": 446.71367521367523 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=1.001, mean=1.001, max=1.001, sum=2.002 (2)", - "tab": "Efficiency", - "score": 1.0011137557029723 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=361.45, mean=361.45, max=361.45, sum=722.9 (2)", - "tab": "General information", - "score": 361.45 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.907, mean=0.907, max=0.907, sum=1.813 (2)", - "tab": "Efficiency", - "score": 0.9065530522420793 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=332.257, mean=332.257, max=332.257, sum=664.513 (2)", - "tab": "General information", - "score": 332.2567049808429 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.269, - "details": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.539 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.799, mean=0.799, max=0.799, sum=1.599 (2)", - "tab": "Efficiency", - "score": 0.7992533741658823 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.581 (2)", - "tab": "Efficiency", - "score": 0.7903663371528327 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=506.514, mean=506.514, max=506.514, sum=1013.029 (2)", - "tab": "General information", - "score": 506.514450867052 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=709.934, mean=709.934, max=709.934, sum=1419.868 (2)", - "tab": "General information", - "score": 709.9340782122905 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "details": { - "description": "min=0.801, mean=0.801, max=0.801, sum=1.601 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.799, mean=0.799, max=0.799, sum=1.599 (2)", - "tab": "Efficiency", - "score": 0.7992852076985477 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=619.683, mean=619.683, max=619.683, sum=1239.366 (2)", - "tab": "General information", - "score": 619.6830065359477 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.607 (2)", - "tab": "Efficiency", - "score": 0.8036901479885902 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=566.244, mean=566.244, max=566.244, sum=1132.488 (2)", - "tab": "General information", - "score": 566.2438271604939 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.455 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.819, mean=0.819, max=0.819, sum=1.638 (2)", - "tab": "Efficiency", - "score": 0.8189079783179544 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=440.6, mean=440.6, max=440.6, sum=881.2 (2)", - "tab": "General information", - "score": 440.6 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.51 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.828, 
mean=0.828, max=0.828, sum=1.655 (2)", - "tab": "Efficiency", - "score": 0.8276801226090412 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1221.388, mean=1221.388, max=1221.388, sum=2442.776 (2)", - "tab": "General information", - "score": 1221.3877551020407 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.751 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.792, mean=0.792, max=0.792, sum=1.583 (2)", - "tab": "Efficiency", - "score": 0.7917492271062747 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=465.925, mean=465.925, max=465.925, sum=931.851 (2)", - "tab": "General information", - "score": 465.92537313432837 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.559 (2)", - "tab": "Efficiency", - "score": 0.7796976523227003 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - 
"score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=358.048, mean=358.048, max=358.048, sum=716.096 (2)", - "tab": "General information", - "score": 358.04819277108436 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)", - "tab": "Efficiency", - "score": 0.8218589679539552 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=293.649, mean=293.649, max=293.649, sum=587.298 (2)", - "tab": "General information", - "score": 293.64912280701753 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.206, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json b/data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json deleted file mode 100644 index 3a25316d9..000000000 --- a/data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"helm_mmlu/ai21_jamba-instruct/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jamba Instruct", - "id": "ai21/jamba-instruct", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.341, mean=0.659, max=0.91, sum=75.114 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.233, mean=0.277, max=0.519, sum=31.585 (114)", - "tab": "Efficiency", - "score": 0.2770578114829593 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=223.731, mean=490.686, max=2081.679, sum=55938.26 (114)", - "tab": "General information", - "score": 490.6864895752317 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - 
"mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36, - "details": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.55 (2)", - "tab": "Efficiency", - "score": 0.27479029655456544 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.44, mean=373.44, max=373.44, sum=746.88 (2)", - "tab": "General information", - "score": 373.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615, - "details": { - "description": "min=0.615, mean=0.615, max=0.615, sum=1.23 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.236, mean=0.236, max=0.236, sum=0.473 (2)", - "tab": "Efficiency", - "score": 0.2363892325648555 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=270.2, mean=270.2, max=270.2, sum=540.4 (2)", - "tab": "General information", - "score": 270.2 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422, - "details": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.843 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.55 (2)", - "tab": "Efficiency", - "score": 0.2747657370567322 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.519 (2)", - "tab": "Efficiency", - "score": 0.2595776534742779 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.2938127589225769 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.538 (2)", - "tab": "Efficiency", - "score": 0.26912292957305906 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.30890216579327007 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.749 (2)", - "tab": "Efficiency", - "score": 0.374276315464693 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.4, mean=549.4, max=549.4, sum=1098.8 (2)", - "tab": "General information", - "score": 549.4 - 
}, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=363.431, mean=363.431, max=363.431, sum=726.861 (2)", - "tab": "General information", - "score": 363.43055555555554 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=720.67, mean=720.67, max=720.67, sum=1441.34 (2)", - "tab": "General information", - "score": 720.67 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=535.22, mean=535.22, max=535.22, sum=1070.44 (2)", - "tab": "General information", - "score": 535.22 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=397.855, mean=397.855, max=397.855, sum=795.711 (2)", - "tab": "General information", - "score": 397.8554913294798 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=392.598, mean=392.598, max=392.598, sum=785.196 (2)", - "tab": "General information", - "score": 392.5980392156863 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.506 (2)", - "tab": "Efficiency", - "score": 0.2529018998146057 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.54, mean=378.54, max=378.54, sum=757.08 (2)", - "tab": "General information", - "score": 378.54 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439, - "details": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.877 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.507 (2)", - "tab": "Efficiency", - "score": 0.25371592086658146 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.43, mean=614.43, max=614.43, 
sum=1228.86 (2)", - "tab": "General information", - "score": 614.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.514 (2)", - "tab": "Efficiency", - "score": 0.25686686754226684 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=329.71, mean=329.71, max=329.71, sum=659.42 (2)", - "tab": "General information", - "score": 329.71 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.593 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.521 (2)", - "tab": "Efficiency", - "score": 0.260397990544637 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=312.287, mean=312.287, max=312.287, sum=624.574 (2)", - "tab": "General information", - "score": 312.287037037037 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - 
} - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.749, - "details": { - "description": "min=0.749, mean=0.749, max=0.749, sum=1.498 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.252, mean=0.252, max=0.252, sum=0.504 (2)", - "tab": "Efficiency", - "score": 0.25189057270430293 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=267.441, mean=267.441, max=267.441, sum=534.881 (2)", - "tab": "General information", - "score": 267.4405144694534 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.716, - "details": { - "description": "min=0.716, mean=0.716, max=0.716, sum=1.431 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.30818068542901206 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.532 (2)", - "tab": "Efficiency", - "score": 0.26598995881723175 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.36489380229716195 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.255, mean=0.255, max=0.255, sum=0.511 (2)", - "tab": "Efficiency", - "score": 0.25544750768374774 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=813.651, mean=813.651, max=813.651, sum=1627.301 (2)", - "tab": "General information", - "score": 813.6507352941177 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=555.461, mean=555.461, max=555.461, sum=1110.922 (2)", - "tab": "General information", - "score": 555.4609929078014 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1151.508, mean=1151.508, max=1151.508, sum=2303.016 (2)", - "tab": "General information", - "score": 1151.5078226857888 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=422.158, mean=422.158, max=422.158, sum=844.317 (2)", - "tab": "General information", - "score": 422.15849673202615 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - 
"Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.542 (2)", - "tab": "Efficiency", - "score": 0.27118161678314207 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.461 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.553 (2)", - "tab": "Efficiency", - "score": 0.27634719171022115 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=440.612, mean=440.612, max=440.612, sum=881.224 (2)", - "tab": "General information", - "score": 440.6118421052632 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.533 (2)", - "tab": "Efficiency", - "score": 0.2665403389930725 - }, - "Business Ethics - # eval": { - 
"description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=521.13, mean=521.13, max=521.13, sum=1042.26 (2)", - "tab": "General information", - "score": 521.13 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.517 (2)", - "tab": "Efficiency", - "score": 0.25872870661177727 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=317.268, mean=317.268, max=317.268, sum=634.536 (2)", - "tab": "General information", - "score": 317.2679245283019 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "description": "min=0.677, mean=0.677, max=0.677, sum=1.353 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.613 (2)", - "tab": "Efficiency", - "score": 0.30636518965376186 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - 
"Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=241.511, mean=241.511, max=241.511, sum=483.021 (2)", - "tab": "General information", - "score": 241.51063829787233 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621, - "details": { - "description": "min=0.621, mean=0.621, max=0.621, sum=1.241 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.825 (2)", - "tab": "Efficiency", - "score": 0.41247522255470015 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=382.393, mean=382.393, max=382.393, sum=764.786 (2)", - "tab": "General information", - "score": 382.39310344827584 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497, - "details": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.995 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.517 (2)", - "tab": "Efficiency", - "score": 0.2586819948973479 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=467.987, mean=467.987, max=467.987, sum=935.974 (2)", - "tab": "General information", - "score": 467.9867724867725 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.263, mean=0.263, max=0.263, sum=0.526 (2)", - "tab": "Efficiency", - "score": 0.2629187542294699 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=559.865, mean=559.865, max=559.865, sum=1119.73 (2)", - "tab": "General information", - "score": 559.8650793650794 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.797, - "details": { - "description": "min=0.797, mean=0.797, max=0.797, sum=1.595 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.513 (2)", - "tab": "Efficiency", - "score": 0.25630061088069794 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.519 (2)", - "tab": "Efficiency", - "score": 0.2594739521665526 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.294, 
mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.29399110078811647 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.039 (2)", - "tab": "Efficiency", - "score": 0.5194540543989702 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Efficiency", - "score": 0.24992815051415954 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.484 (2)", - "tab": "Efficiency", - "score": 0.242088835474123 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.24, mean=0.24, max=0.24, sum=0.481 (2)", - "tab": "Efficiency", - "score": 0.240464658003587 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.252, mean=0.252, max=0.252, sum=0.503 (2)", - "tab": "Efficiency", - "score": 0.25154934459262424 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.501 (2)", - "tab": "Efficiency", - "score": 0.25046268931957855 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.511 (2)", - "tab": "Efficiency", - "score": 0.25560809444907484 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.251, mean=0.251, max=0.251, sum=0.501 (2)", - "tab": "Efficiency", - "score": 0.250657169971991 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.564 (2)", - "tab": "Efficiency", - "score": 0.2818450938772272 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.9 (2)", - "tab": "Efficiency", - "score": 0.44991188072690774 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.3466388042466047 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=380.871, mean=380.871, max=380.871, sum=761.742 (2)", - "tab": "General information", - "score": 380.8709677419355 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=401.734, mean=401.734, max=401.734, sum=803.468 (2)", - "tab": "General information", - "score": 401.73399014778323 
- }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=793.8, mean=793.8, max=793.8, sum=1587.6 (2)", - "tab": "General information", - "score": 793.8 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2081.679, mean=2081.679, max=2081.679, sum=4163.358 (2)", - "tab": "General information", - "score": 2081.6787878787877 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=299.717, mean=299.717, max=299.717, sum=599.434 (2)", - "tab": "General information", - "score": 299.7171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=333.601, mean=333.601, max=333.601, sum=667.202 (2)", - "tab": "General information", - "score": 333.60103626943004 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 
390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=286.562, mean=286.562, max=286.562, sum=573.123 (2)", - "tab": "General information", - "score": 286.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=421.889, mean=421.889, max=421.889, sum=843.778 (2)", - "tab": "General information", - "score": 421.8888888888889 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=301.231, mean=301.231, max=301.231, sum=602.462 (2)", - "tab": "General information", - "score": 301.2310924369748 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=453.51, mean=453.51, max=453.51, sum=907.02 (2)", - "tab": "General information", - "score": 453.50993377483445 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=355.059, 
mean=355.059, max=355.059, sum=710.117 (2)", - "tab": "General information", - "score": 355.0587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=648.037, mean=648.037, max=648.037, sum=1296.074 (2)", - "tab": "General information", - "score": 648.0370370370371 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=1628.495, mean=1628.495, max=1628.495, sum=3256.99 (2)", - "tab": "General information", - "score": 1628.4950980392157 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1025.097, mean=1025.097, max=1025.097, sum=2050.194 (2)", - "tab": "General information", - "score": 1025.097046413502 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.794, - "details": { - "description": "min=0.794, mean=0.794, max=0.794, sum=1.588 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.466 
(2)", - "tab": "Efficiency", - "score": 0.2328128023532474 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.251, mean=0.251, max=0.251, sum=0.501 (2)", - "tab": "Efficiency", - "score": 0.2506928462108583 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=250.915, mean=250.915, max=250.915, sum=501.83 (2)", - "tab": "General information", - "score": 250.91479820627802 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=263.183, mean=263.183, max=263.183, sum=526.366 (2)", - "tab": "General information", - "score": 263.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.669 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.542 (2)", - "tab": "Efficiency", - "score": 0.27110107082965945 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=477.843, mean=477.843, max=477.843, sum=955.686 (2)", - "tab": "General information", - "score": 477.8429752066116 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.411 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.499 (2)", - "tab": "Efficiency", - "score": 0.24970631804202964 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=337.718, mean=337.718, max=337.718, sum=675.436 (2)", - "tab": "General information", - "score": 337.7177914110429 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536, - "details": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.071 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.533 (2)", - "tab": "Efficiency", - "score": 0.2665597881589617 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=559.277, mean=559.277, max=559.277, sum=1118.554 (2)", - "tab": "General information", - "score": 559.2767857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - 
"evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.573 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.241, mean=0.241, max=0.241, sum=0.481 (2)", - "tab": "Efficiency", - "score": 0.24073980386974742 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=225.262, mean=225.262, max=225.262, sum=450.524 (2)", - "tab": "General information", - "score": 225.2621359223301 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.769 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.258, mean=0.258, max=0.258, sum=0.517 (2)", - "tab": "Efficiency", - "score": 0.25835410753885907 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=351.573, mean=351.573, max=351.573, sum=703.145 (2)", - "tab": "General information", - "score": 351.5726495726496 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.34 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.251, mean=0.251, max=0.251, sum=0.502 (2)", - "tab": "Efficiency", - "score": 0.2510761094093323 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=274.75, mean=274.75, max=274.75, sum=549.5 (2)", - "tab": "General information", - "score": 274.75 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.729 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.466 (2)", - "tab": "Efficiency", - "score": 0.23304342005596915 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=254.525, mean=254.525, max=254.525, sum=509.05 (2)", - "tab": "General information", - "score": 254.5249042145594 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.465, - "details": { - "description": "min=0.465, mean=0.465, max=0.465, sum=0.93 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.512 (2)", - "tab": "Efficiency", - "score": 0.2561916905331474 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.262, mean=0.262, max=0.262, sum=0.525 (2)", - "tab": "Efficiency", - "score": 0.2624055065922231 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=357.165, mean=357.165, max=357.165, sum=714.329 (2)", - "tab": "General information", - "score": 357.16473988439304 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=546.793, mean=546.793, max=546.793, sum=1093.587 (2)", - "tab": "General information", - "score": 546.7932960893854 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.49 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.496 (2)", - "tab": "Efficiency", - "score": 0.2479639964945176 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=454.758, mean=454.758, max=454.758, sum=909.516 (2)", - "tab": "General information", - "score": 454.75816993464053 - }, - "Nutrition - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.593 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.508 (2)", - "tab": "Efficiency", - "score": 0.2538878917694092 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=410.315, mean=410.315, max=410.315, sum=820.63 (2)", - "tab": "General information", - "score": 410.31481481481484 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.364 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.252, mean=0.252, max=0.252, sum=0.505 (2)", - "tab": "Efficiency", - "score": 0.25225248553536156 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=316.591, mean=316.591, max=316.591, sum=633.182 (2)", - "tab": "General information", - "score": 316.59090909090907 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - 
"method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.486 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.62 (2)", - "tab": "Efficiency", - "score": 0.30983400539476047 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=856.637, mean=856.637, max=856.637, sum=1713.273 (2)", - "tab": "General information", - "score": 856.6367346938775 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.891, - "details": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.258, mean=0.258, max=0.258, sum=0.515 (2)", - "tab": "Efficiency", - "score": 0.25752189384764107 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=327.801, mean=327.801, max=327.801, sum=655.602 (2)", - "tab": "General information", - "score": 327.80099502487565 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.477 (2)", - "tab": "Efficiency", - "score": 0.23830672200903835 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=267.458, mean=267.458, max=267.458, sum=534.916 (2)", - "tab": "General information", - "score": 267.4578313253012 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "description": "min=0.813, mean=0.813, max=0.813, sum=1.626 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.236, mean=0.236, max=0.236, sum=0.473 (2)", - "tab": "Efficiency", - "score": 0.23630904593662908 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=223.731, mean=223.731, max=223.731, sum=447.462 (2)", - "tab": "General information", - "score": 223.73099415204678 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How 
many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.887, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json b/data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json deleted file mode 100644 index 8bf036c64..000000000 --- a/data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/allenai_olmo-1.7-7b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo 1.7 7B", - "id": "allenai/olmo-1.7-7b", - "developer": "allenai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538, - "details": { - "description": "min=0.307, mean=0.538, max=0.769, sum=61.295 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.518, mean=1.024, max=2.978, sum=116.777 (114)", - "tab": "Efficiency", - "score": 1.024362741022275 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=2.909, mean=4.946, max=5, sum=563.813 (114)", - "tab": "General information", - "score": 4.945727778020373 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=285.766, mean=597.916, max=1816.758, sum=68162.415 (114)", - "tab": "General information", - "score": 597.9159199418197 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - 
"high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.664, mean=0.664, max=0.664, sum=1.328 (2)", - "tab": "Efficiency", - "score": 0.664234619140625 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - 
"description": "min=358.76, mean=358.76, max=358.76, sum=717.52 (2)", - "tab": "General information", - "score": 358.76 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.496, - "details": { - "description": "min=0.496, mean=0.496, max=0.496, sum=0.993 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.619, mean=0.619, max=0.619, sum=1.237 (2)", - "tab": "Efficiency", - "score": 0.618622675648442 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=352.03, mean=352.03, max=352.03, sum=704.059 (2)", - "tab": "General information", - "score": 352.02962962962965 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333, - "details": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.667 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=1.908 (2)", - "tab": "Efficiency", - "score": 0.9539380264282227 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Efficiency", - "score": 0.7911433676878611 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.44, mean=1.44, max=1.44, sum=2.88 (2)", - "tab": "Efficiency", - "score": 1.4402443194389343 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=1.005, mean=1.005, max=1.005, sum=2.01 (2)", - "tab": "Efficiency", - "score": 1.0049437880516052 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.933, mean=0.933, max=0.933, sum=1.866 (2)", - "tab": "Efficiency", - "score": 
0.9331957646188019 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.707 (2)", - "tab": "Efficiency", - "score": 0.8537454745348763 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=535.85, mean=535.85, max=535.85, sum=1071.7 (2)", - "tab": "General information", - "score": 535.85 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=470.319, mean=470.319, max=470.319, sum=940.639 (2)", - "tab": "General information", - "score": 470.31944444444446 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=842.89, mean=842.89, max=842.89, sum=1685.78 (2)", - "tab": "General information", - "score": 842.89 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=592.82, mean=592.82, max=592.82, sum=1185.64 (2)", - "tab": "General information", - "score": 592.82 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=519.376, mean=519.376, max=519.376, sum=1038.751 (2)", - "tab": "General information", - "score": 519.3757225433526 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=476.657, mean=476.657, max=476.657, sum=953.314 (2)", - "tab": "General information", - "score": 476.65686274509807 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.343 (2)", - "tab": "Efficiency", - "score": 0.6713726472854614 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=388.19, mean=388.19, max=388.19, sum=776.38 (2)", - "tab": "General information", - "score": 388.19 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.404, - "details": { - "description": "min=0.404, mean=0.404, max=0.404, sum=0.807 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=1.05, mean=1.05, max=1.05, sum=2.099 (2)", - "tab": "Efficiency", - "score": 1.0495816971126355 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=612.798, mean=612.798, max=612.798, sum=1225.596 (2)", - "tab": "General information", - "score": 612.7982456140351 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34, - "details": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.739, mean=0.739, max=0.739, sum=1.477 (2)", - "tab": "Efficiency", - "score": 0.7387202930450439 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=400.58, mean=400.58, max=400.58, sum=801.16 (2)", - "tab": "General information", - "score": 400.58 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "details": { - "description": "min=0.565, mean=0.565, max=0.565, sum=1.13 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.755, mean=0.755, max=0.755, 
sum=1.51 (2)", - "tab": "Efficiency", - "score": 0.7549951495947661 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=420.861, mean=420.861, max=420.861, sum=841.722 (2)", - "tab": "General information", - "score": 420.8611111111111 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.592, - "details": { - "description": "min=0.592, mean=0.592, max=0.592, sum=1.183 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.622, mean=0.622, max=0.622, sum=1.244 (2)", - "tab": "Efficiency", - "score": 0.6219598725677686 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=345.277, mean=345.277, max=345.277, sum=690.553 (2)", - "tab": "General information", - "score": 345.2765273311897 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.526, - "details": { - "description": "min=0.526, mean=0.526, max=0.526, sum=1.052 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.88, mean=1.88, max=1.88, sum=3.759 (2)", - "tab": "Efficiency", - "score": 1.8796235156409882 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=1.156, mean=1.156, max=1.156, 
sum=2.312 (2)", - "tab": "Efficiency", - "score": 1.1558757741400536 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=2.735, mean=2.735, max=2.735, sum=5.47 (2)", - "tab": "Efficiency", - "score": 2.734811251757198 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=1.006, mean=1.006, max=1.006, sum=2.012 (2)", - "tab": "Efficiency", - "score": 1.0057547404096017 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1080.882, mean=1080.882, max=1080.882, sum=2161.765 (2)", - "tab": "General information", - "score": 1080.8823529411766 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=660.922, mean=660.922, max=660.922, sum=1321.844 (2)", - "tab": "General information", - "score": 660.9219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=4.997, mean=4.997, max=4.997, sum=9.995 (2)", - "tab": "General information", - "score": 4.9973924380704045 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1654.433, mean=1654.433, max=1654.433, sum=3308.866 (2)", - "tab": "General information", - "score": 1654.4328552803129 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=590.873, mean=590.873, max=590.873, sum=1181.745 (2)", - "tab": "General information", - "score": 590.8725490196078 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", 
- "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.604 (2)", - "tab": "Efficiency", - "score": 0.8018933439254761 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=444.08, mean=444.08, max=444.08, sum=888.16 (2)", - "tab": "General information", - "score": 444.08 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.526, - "details": { - "description": "min=0.526, mean=0.526, max=0.526, sum=1.053 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=1.012, mean=1.012, max=1.012, sum=2.023 (2)", - "tab": "Efficiency", - "score": 1.0116610966230695 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=598.487, mean=598.487, max=598.487, sum=1196.974 (2)", - "tab": "General information", - "score": 598.4868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.18 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.964, mean=0.964, max=0.964, sum=1.929 (2)", - "tab": "Efficiency", - "score": 0.9642905473709107 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=585.05, mean=585.05, max=585.05, sum=1170.1 (2)", - "tab": "General information", - "score": 585.05 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.139 (2)", - "tab": "Efficiency", - "score": 0.5697462513761701 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=401.917, mean=401.917, max=401.917, sum=803.834 (2)", - "tab": "General information", - "score": 401.9169811320755 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.434, - "details": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.524, mean=0.524, max=0.524, sum=1.049 (2)", - "tab": "Efficiency", - "score": 0.5244635977643601 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=311.311, mean=311.311, max=311.311, sum=622.621 (2)", - "tab": "General information", - "score": 311.31063829787234 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517, - "details": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.034 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.764, mean=0.764, max=0.764, sum=1.528 (2)", - "tab": "Efficiency", - "score": 0.7642407762593236 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=424.848, mean=424.848, max=424.848, sum=849.697 (2)", - "tab": "General information", - "score": 424.848275862069 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307, - "details": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.817 (2)", - "tab": "Efficiency", - "score": 0.9087190634359128 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=505.071, mean=505.071, max=505.071, sum=1010.143 (2)", - "tab": "General information", - "score": 505.07142857142856 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325, - "details": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.651 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.12, mean=1.12, max=1.12, sum=2.24 (2)", - "tab": "Efficiency", - "score": 1.1198924439293998 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=653.595, mean=653.595, max=653.595, sum=1307.19 (2)", - "tab": "General information", - "score": 653.5952380952381 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.426 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)", - "tab": "Efficiency", - "score": 0.9262428129872968 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.421 (2)", - "tab": "Efficiency", - "score": 0.710636249316737 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=1.389, mean=1.389, max=1.389, sum=2.779 (2)", - "tab": "Efficiency", - "score": 1.3893755102157592 - }, - "High School European History - Observed inference time (s)": { - "description": "min=2.978, mean=2.978, max=2.978, sum=5.957 (2)", - "tab": "Efficiency", - "score": 2.9784073266116056 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.471 (2)", - "tab": "Efficiency", - "score": 0.7356561253769229 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.755 (2)", - "tab": "Efficiency", - "score": 0.8775828440572314 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.378 (2)", - "tab": "Efficiency", - "score": 0.6891599153861021 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.834 (2)", - "tab": "Efficiency", - "score": 0.9171109632209495 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.748, mean=0.748, max=0.748, sum=1.496 (2)", - "tab": "Efficiency", - "score": 0.7482213062398574 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.983, mean=0.983, max=0.983, sum=1.965 (2)", - "tab": "Efficiency", - "score": 0.9825576125391272 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Efficiency", - "score": 0.9199631371629348 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=1.151, mean=1.151, max=1.151, sum=2.303 (2)", - "tab": "Efficiency", - "score": 1.1514487498336368 - }, - "High School US History - Observed inference time (s)": { - "description": "min=2.908, mean=2.908, max=2.908, sum=5.816 (2)", - "tab": "Efficiency", - "score": 2.9081676029691508 - }, - "High School World History - Observed inference time (s)": { - "description": "min=2.459, mean=2.459, max=2.459, sum=4.919 (2)", - "tab": "Efficiency", - "score": 2.4593187173207602 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=513.932, mean=513.932, max=513.932, sum=1027.865 (2)", - "tab": "General information", - "score": 513.9322580645161 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=479.842, mean=479.842, max=479.842, sum=959.685 (2)", - "tab": "General information", - "score": 479.8423645320197 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=889.39, mean=889.39, max=889.39, sum=1778.78 (2)", - "tab": "General information", - "score": 889.39 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=2.909, mean=2.909, max=2.909, sum=5.818 (2)", - "tab": "General information", - "score": 2.909090909090909 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=1816.758, mean=1816.758, max=1816.758, sum=3633.515 (2)", - "tab": "General information", - "score": 1816.7575757575758 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=400.091, mean=400.091, max=400.091, sum=800.182 (2)", - "tab": "General information", - "score": 400.09090909090907 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And 
Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=482.762, mean=482.762, max=482.762, sum=965.523 (2)", - "tab": "General information", - "score": 482.7616580310881 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=392.351, mean=392.351, max=392.351, sum=784.703 (2)", - "tab": "General information", - "score": 392.35128205128206 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=506.689, mean=506.689, max=506.689, sum=1013.378 (2)", - "tab": "General information", - "score": 506.68888888888887 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=411.235, mean=411.235, max=411.235, sum=822.471 (2)", - "tab": "General information", - "score": 411.2352941176471 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=548.728, mean=548.728, max=548.728, sum=1097.457 (2)", - "tab": "General information", - "score": 548.7284768211921 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=514.793, mean=514.793, max=514.793, sum=1029.585 (2)", - "tab": "General information", - "score": 514.7926605504587 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=796.606, mean=796.606, max=796.606, sum=1593.213 (2)", - "tab": "General information", - "score": 796.6064814814815 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=4, mean=4, max=4, sum=8 (2)", - "tab": "General information", - "score": 4.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=1788.387, mean=1788.387, max=1788.387, sum=3576.775 (2)", - "tab": "General information", - "score": 1788.387254901961 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1461.443, mean=1461.443, max=1461.443, sum=2922.886 (2)", - "tab": "General information", - "score": 1461.4430379746836 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595, - "details": { - "description": "min=0.595, mean=0.595, max=0.595, sum=1.191 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.568, mean=0.568, max=0.568, sum=1.135 (2)", - "tab": "Efficiency", - "score": 0.5676639603926996 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.254 (2)", - "tab": "Efficiency", - "score": 0.6270790318496354 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=323.691, mean=323.691, max=323.691, sum=647.381 (2)", - "tab": "General information", - "score": 323.69058295964123 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=355.351, mean=355.351, max=355.351, sum=710.702 (2)", - "tab": "General information", - "score": 355.35114503816794 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612, - "details": { - "description": "min=0.612, mean=0.612, max=0.612, sum=1.223 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": 
"min=1.125, mean=1.125, max=1.125, sum=2.25 (2)", - "tab": "Efficiency", - "score": 1.1249816102429855 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=650.372, mean=650.372, max=650.372, sum=1300.744 (2)", - "tab": "General information", - "score": 650.3719008264463 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.607, - "details": { - "description": "min=0.607, mean=0.607, max=0.607, sum=1.215 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Efficiency", - "score": 0.8238252847472582 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=458.828, mean=458.828, max=458.828, sum=917.656 (2)", - "tab": "General information", - "score": 458.8282208588957 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375, - "details": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.75 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.161, mean=1.161, max=1.161, sum=2.321 (2)", - "tab": "Efficiency", - "score": 1.160504766872951 - }, - 
"Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.214, mean=661.214, max=661.214, sum=1322.429 (2)", - "tab": "General information", - "score": 661.2142857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689, - "details": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.379 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.035 (2)", - "tab": "Efficiency", - "score": 0.5176426901400668 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=298.049, mean=298.049, max=298.049, sum=596.097 (2)", - "tab": "General information", - "score": 298.0485436893204 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.538 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.749, mean=0.749, max=0.749, sum=1.499 (2)", - "tab": "Efficiency", - "score": 0.7494234182895758 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, 
sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=440.103, mean=440.103, max=440.103, sum=880.205 (2)", - "tab": "General information", - "score": 440.1025641025641 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.121 (2)", - "tab": "Efficiency", - "score": 0.5603377485275268 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.48, mean=340.48, max=340.48, sum=680.96 (2)", - "tab": "General information", - "score": 340.48 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.734, - "details": { - "description": "min=0.734, mean=0.734, max=0.734, sum=1.469 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.533, mean=0.533, max=0.533, sum=1.066 (2)", - "tab": "Efficiency", - "score": 0.533118042452582 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - 
}, - "Miscellaneous - # prompt tokens": { - "description": "min=320.443, mean=320.443, max=320.443, sum=640.886 (2)", - "tab": "General information", - "score": 320.4431673052363 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.335, - "details": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.69 (2)", - "tab": "Efficiency", - "score": 0.8448189255819155 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=1.193, mean=1.193, max=1.193, sum=2.387 (2)", - "tab": "Efficiency", - "score": 1.1933270441087265 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=502.243, mean=502.243, max=502.243, sum=1004.486 (2)", - "tab": "General information", - "score": 502.242774566474 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=667.861, mean=667.861, max=667.861, sum=1335.723 (2)", - "tab": "General information", - "score": 667.8614525139665 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.608, - "details": { - "description": "min=0.608, mean=0.608, max=0.608, sum=1.216 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.99, mean=0.99, max=0.99, sum=1.979 (2)", - "tab": "Efficiency", - "score": 0.9895777281592874 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.127, mean=579.127, max=579.127, sum=1158.255 (2)", - "tab": "General information", - "score": 579.1274509803922 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593, - "details": { - "description": "min=0.593, mean=0.593, max=0.593, sum=1.185 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.966, mean=0.966, max=0.966, sum=1.932 (2)", - "tab": "Efficiency", - "score": 0.9661886655254128 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=535.151, mean=535.151, max=535.151, sum=1070.302 (2)", - "tab": "General information", - "score": 535.1512345679013 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.763, 
mean=0.763, max=0.763, sum=1.526 (2)", - "tab": "Efficiency", - "score": 0.7631508913907138 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=422.982, mean=422.982, max=422.982, sum=845.964 (2)", - "tab": "General information", - "score": 422.9818181818182 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.522, - "details": { - "description": "min=0.522, mean=0.522, max=0.522, sum=1.045 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=2.064, mean=2.064, max=2.064, sum=4.128 (2)", - "tab": "Efficiency", - "score": 2.0640801809271987 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1207.057, mean=1207.057, max=1207.057, sum=2414.114 (2)", - "tab": "General information", - "score": 1207.057142857143 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.642 (2)", - "tab": "Efficiency", - "score": 0.8210354812109648 - }, - "Sociology - # eval": { - "description": 
"min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=452.02, mean=452.02, max=452.02, sum=904.04 (2)", - "tab": "General information", - "score": 452.0199004975124 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.904 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.62, mean=0.62, max=0.62, sum=1.241 (2)", - "tab": "Efficiency", - "score": 0.6204164372869285 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=349.584, mean=349.584, max=349.584, sum=699.169 (2)", - "tab": "General information", - "score": 349.5843373493976 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.462 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Efficiency", - "score": 0.5299853595376712 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=285.766, mean=285.766, max=285.766, sum=571.532 (2)", - "tab": "General information", - "score": 285.766081871345 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.196, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json b/data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json deleted file mode 100644 index 2b8d4cdfb..000000000 --- a/data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/allenai_olmo-7b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo 7B", - "id": "allenai/olmo-7b", - "developer": "allenai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295, - "details": { - "description": "min=0.22, mean=0.295, max=0.454, sum=33.59 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.258, mean=0.386, max=0.824, sum=44.021 (114)", - "tab": "Efficiency", - "score": 0.38615337806031275 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=2.903, mean=4.946, max=5, sum=563.801 (114)", - "tab": "General information", - "score": 4.9456214515982575 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=285.766, mean=597.867, max=1813.97, sum=68156.839 (114)", - "tab": "General information", - "score": 597.8670097876463 - }, - "MMLU All Subjects - 
# output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract 
Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.309316143989563 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=358.76, mean=358.76, max=358.76, sum=717.52 (2)", - "tab": "General information", - "score": 358.76 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.222, - "details": { - "description": "min=0.222, mean=0.222, max=0.222, sum=0.444 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", - "tab": "Efficiency", - "score": 0.5358577339737504 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=352.03, mean=352.03, max=352.03, sum=704.059 (2)", - "tab": "General information", - "score": 352.02962962962965 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.294, - "details": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - 
"tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.691 (2)", - "tab": "Efficiency", - "score": 0.34570912599563597 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.30927823815080857 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.847 (2)", - "tab": "Efficiency", - "score": 0.42337616443634035 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.34355913400650023 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.32374938237184736 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.3302010788637049 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=535.85, mean=535.85, max=535.85, sum=1071.7 (2)", - "tab": "General information", - "score": 535.85 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=470.319, mean=470.319, max=470.319, sum=940.639 (2)", - "tab": "General information", - "score": 470.31944444444446 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=842.89, mean=842.89, max=842.89, sum=1685.78 (2)", - "tab": "General information", - "score": 842.89 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College 
Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=592.82, mean=592.82, max=592.82, sum=1185.64 (2)", - "tab": "General information", - "score": 592.82 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=519.376, mean=519.376, max=519.376, sum=1038.751 (2)", - "tab": "General information", - "score": 519.3757225433526 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=476.657, mean=476.657, max=476.657, sum=953.314 (2)", - "tab": "General information", - "score": 476.65686274509807 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3, - "details": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.634 (2)", - "tab": "Efficiency", - "score": 0.31721718072891236 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": 
"min=388.19, mean=388.19, max=388.19, sum=776.38 (2)", - "tab": "General information", - "score": 388.19 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325, - "details": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.649 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.34500646591186523 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=612.798, mean=612.798, max=612.798, sum=1225.596 (2)", - "tab": "General information", - "score": 612.7982456140351 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.32, - "details": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.64 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.633 (2)", - "tab": "Efficiency", - "score": 0.3163221001625061 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=400.58, mean=400.58, max=400.58, sum=801.16 (2)", - "tab": "General information", - "score": 400.58 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25, - "details": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.613 (2)", - "tab": "Efficiency", - "score": 0.3064618044429355 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=420.861, mean=420.861, max=420.861, sum=841.722 (2)", - "tab": "General information", - "score": 420.8611111111111 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325, - "details": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.65 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.792 (2)", - "tab": "Efficiency", - "score": 0.39610295280382946 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=345.277, mean=345.277, max=345.277, sum=690.553 (2)", - "tab": "General information", - "score": 345.2765273311897 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.232, - "details": { - "description": "min=0.232, mean=0.232, max=0.232, sum=0.464 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.0 (2)", - "tab": "Efficiency", - "score": 0.4999704089234857 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.692 (2)", - "tab": "Efficiency", - "score": 0.3458050379516385 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.768, mean=0.768, max=0.768, sum=1.537 (2)", - "tab": "Efficiency", - "score": 0.7683826767325868 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.865 (2)", - "tab": "Efficiency", - "score": 0.43272479998519997 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1080.882, mean=1080.882, max=1080.882, sum=2161.765 (2)", - "tab": "General information", - "score": 1080.8823529411766 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=660.922, mean=660.922, max=660.922, sum=1321.844 (2)", - "tab": "General information", - "score": 660.9219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=4.997, mean=4.997, max=4.997, sum=9.995 (2)", - "tab": "General information", - "score": 4.9973924380704045 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1654.433, mean=1654.433, max=1654.433, sum=3308.866 (2)", - "tab": "General information", - 
"score": 1654.4328552803129 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=590.873, mean=590.873, max=590.873, sum=1181.745 (2)", - "tab": "General information", - "score": 590.8725490196078 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.624 (2)", - "tab": "Efficiency", - "score": 0.31185237407684324 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=444.08, mean=444.08, max=444.08, sum=888.16 (2)", - "tab": "General information", - "score": 444.08 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342, - "details": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, 
sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.3300002766282935 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=598.487, mean=598.487, max=598.487, sum=1196.974 (2)", - "tab": "General information", - "score": 598.4868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.24, - "details": { - "description": "min=0.24, mean=0.24, max=0.24, sum=0.48 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.713 (2)", - "tab": "Efficiency", - "score": 0.3563597345352173 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=585.05, mean=585.05, max=585.05, sum=1170.1 (2)", - "tab": "General information", - "score": 585.05 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.521 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.564 (2)", - "tab": "Efficiency", - "score": 0.2817675842429107 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - 
"score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=401.917, mean=401.917, max=401.917, sum=803.834 (2)", - "tab": "General information", - "score": 401.9169811320755 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.319, - "details": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.601 (2)", - "tab": "Efficiency", - "score": 0.3004691002216745 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=311.311, mean=311.311, max=311.311, sum=622.621 (2)", - "tab": "General information", - "score": 311.31063829787234 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.579 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.542 (2)", - "tab": "Efficiency", - "score": 0.27095125954726645 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=424.848, mean=424.848, max=424.848, sum=849.697 (2)", - "tab": "General information", - "score": 424.848275862069 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.254, - "details": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.508 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.62 (2)", - "tab": "Efficiency", - "score": 0.3099196644687148 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=505.071, mean=505.071, max=505.071, sum=1010.143 (2)", - "tab": "General information", - "score": 505.07142857142856 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.278, - "details": { - "description": "min=0.278, mean=0.278, max=0.278, sum=0.556 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.582, mean=0.582, max=0.582, sum=1.165 (2)", - "tab": "Efficiency", - "score": 0.5824837514332363 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=653.595, mean=653.595, max=653.595, sum=1307.19 (2)", - "tab": "General information", - "score": 653.5952380952381 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253, - "details": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.506 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Efficiency", - "score": 0.28990614798761183 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.29780743039887525 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.854 (2)", - "tab": "Efficiency", - "score": 0.4271339774131775 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Efficiency", - "score": 0.8240610585068211 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.603 (2)", - "tab": "Efficiency", - "score": 0.30138304980114256 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.653 (2)", - "tab": "Efficiency", - "score": 0.32666249472860226 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.30416087615184295 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.666 (2)", - "tab": "Efficiency", - "score": 0.3329446854414763 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.555 (2)", - "tab": "Efficiency", - "score": 0.27732292243412565 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.3369376612025381 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.589 (2)", - "tab": "Efficiency", - "score": 0.294664117830609 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.817 (2)", - "tab": "Efficiency", - "score": 0.40864299955191435 - }, - "High School 
US History - Observed inference time (s)": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.632 (2)", - "tab": "Efficiency", - "score": 0.8157591445773256 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.343 (2)", - "tab": "Efficiency", - "score": 0.6715093554323736 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.932, mean=513.932, max=513.932, sum=1027.865 (2)", - "tab": "General information", - "score": 513.9322580645161 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=479.842, mean=479.842, max=479.842, sum=959.685 (2)", - "tab": "General information", - "score": 479.8423645320197 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=889.39, mean=889.39, max=889.39, sum=1778.78 (2)", - "tab": "General information", - "score": 889.39 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=2.903, mean=2.903, max=2.903, sum=5.806 (2)", - "tab": "General information", - "score": 2.903030303030303 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=1813.97, mean=1813.97, max=1813.97, sum=3627.939 (2)", - "tab": "General information", - "score": 1813.969696969697 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 
1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=400.091, mean=400.091, max=400.091, sum=800.182 (2)", - "tab": "General information", - "score": 400.09090909090907 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=482.762, mean=482.762, max=482.762, sum=965.523 (2)", - "tab": "General information", - "score": 482.7616580310881 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=392.351, mean=392.351, max=392.351, sum=784.703 (2)", - "tab": "General information", - "score": 392.35128205128206 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=506.689, mean=506.689, max=506.689, sum=1013.378 (2)", - "tab": "General information", - "score": 506.68888888888887 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=411.235, mean=411.235, max=411.235, sum=822.471 (2)", - "tab": "General information", - "score": 411.2352941176471 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=548.728, mean=548.728, max=548.728, sum=1097.457 (2)", - "tab": "General information", - "score": 548.7284768211921 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=514.793, mean=514.793, max=514.793, sum=1029.585 (2)", - "tab": "General information", - "score": 514.7926605504587 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=796.606, mean=796.606, max=796.606, sum=1593.213 (2)", - "tab": "General information", - "score": 796.6064814814815 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=4, mean=4, max=4, sum=8 (2)", - "tab": "General information", - "score": 4.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=1788.387, mean=1788.387, max=1788.387, sum=3576.775 (2)", - "tab": "General information", - "score": 1788.387254901961 - }, - "High School US History - # output tokens": { - "description": "min=1, 
mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1461.443, mean=1461.443, max=1461.443, sum=2922.886 (2)", - "tab": "General information", - "score": 1461.4430379746836 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.267, - "details": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.534 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Efficiency", - "score": 0.2699183316508751 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.552, mean=0.552, max=0.552, sum=1.104 (2)", - "tab": "Efficiency", - "score": 0.5521998168857953 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=323.691, mean=323.691, max=323.691, sum=647.381 (2)", - "tab": "General information", - "score": 323.69058295964123 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=355.351, mean=355.351, max=355.351, sum=710.702 (2)", - "tab": "General information", - "score": 355.35114503816794 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306, - "details": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.652 (2)", - "tab": "Efficiency", - "score": 0.3259233679653199 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=650.372, mean=650.372, max=650.372, sum=1300.744 (2)", - "tab": "General information", - "score": 650.3719008264463 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.264, - "details": { - "description": "min=0.264, mean=0.264, max=0.264, sum=0.528 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3324835944029451 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=458.828, mean=458.828, max=458.828, sum=917.656 (2)", - "tab": "General information", - "score": 458.8282208588957 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.286, - "details": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.571 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.704 (2)", - "tab": "Efficiency", - "score": 0.3520317098924092 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.214, mean=661.214, max=661.214, sum=1322.429 (2)", - "tab": "General information", - "score": 661.2142857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.272, - "details": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.544 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.613 (2)", - "tab": "Efficiency", - "score": 0.3064361937995096 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=298.049, mean=298.049, max=298.049, sum=596.097 (2)", - "tab": "General information", - "score": 298.0485436893204 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.269, - "details": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.538 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.3111040826536651 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=440.103, mean=440.103, max=440.103, sum=880.205 (2)", - "tab": "General information", - "score": 440.1025641025641 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.28, - "details": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.56 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.258, mean=0.258, max=0.258, sum=0.516 (2)", - "tab": "Efficiency", - "score": 0.2580227541923523 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.48, mean=340.48, max=340.48, sum=680.96 (2)", - "tab": "General information", - "score": 340.48 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292, - "details": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.585 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.3421932640051324 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=320.443, mean=320.443, max=320.443, sum=640.886 (2)", - "tab": "General information", - "score": 320.4431673052363 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.265, - "details": { - "description": "min=0.265, mean=0.265, max=0.265, sum=0.53 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.791 (2)", - "tab": "Efficiency", - "score": 0.39545129627161635 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Efficiency", - "score": 0.3597933335011232 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=502.243, mean=502.243, max=502.243, sum=1004.486 (2)", - "tab": "General information", - "score": 502.242774566474 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=667.861, mean=667.861, max=667.861, 
sum=1335.723 (2)", - "tab": "General information", - "score": 667.8614525139665 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34, - "details": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.902 (2)", - "tab": "Efficiency", - "score": 0.45079101612365324 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.127, mean=579.127, max=579.127, sum=1158.255 (2)", - "tab": "General information", - "score": 579.1274509803922 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318, - "details": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.636 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.656 (2)", - "tab": "Efficiency", - "score": 0.32820526979587694 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=535.151, mean=535.151, max=535.151, sum=1070.302 (2)", - "tab": "General information", - "score": 535.1512345679013 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.345, - "details": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.691 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.571 (2)", - "tab": "Efficiency", - "score": 0.28533268625086006 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=422.982, mean=422.982, max=422.982, sum=845.964 (2)", - "tab": "General information", - "score": 422.9818181818182 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408, - "details": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.551, mean=0.551, max=0.551, sum=1.102 (2)", - "tab": "Efficiency", - "score": 0.5510748113904681 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1207.057, mean=1207.057, max=1207.057, sum=2414.114 (2)", - "tab": "General information", - "score": 1207.057142857143 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383, - "details": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.766 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.586 (2)", - "tab": "Efficiency", - "score": 0.2929653884166509 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=452.02, mean=452.02, max=452.02, sum=904.04 (2)", - "tab": "General information", - "score": 452.0199004975124 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416, - "details": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.831 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.983 (2)", - "tab": "Efficiency", - "score": 0.4916250992970294 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=349.584, mean=349.584, max=349.584, sum=699.169 (2)", - "tab": "General information", - "score": 349.5843373493976 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.234, - "details": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.468 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.503, mean=0.503, max=0.503, sum=1.007 (2)", - "tab": "Efficiency", - "score": 0.5034504368988394 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=285.766, mean=285.766, max=285.766, sum=571.532 (2)", - "tab": "General information", - "score": 285.766081871345 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json b/data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json deleted file mode 100644 index 1bb99dccc..000000000 --- a/data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/amazon_nova-lite-v1:0/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Lite", - "id": "amazon/nova-lite-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - 
"description": "min=0.509, mean=0.77, max=0.969, sum=87.802 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.113, mean=0.127, max=0.174, sum=14.526 (114)", - "tab": "Efficiency", - "score": 0.12742174922519597 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=305.386, mean=655.489, max=2872.03, sum=74725.746 (114)", - "tab": "General information", - "score": 655.4890026560713 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - 
"mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.136, mean=0.136, max=0.136, sum=0.272 (2)", - "tab": "Efficiency", - "score": 0.13592 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=399.38, mean=399.38, max=399.38, sum=798.76 (2)", - "tab": "General information", - "score": 399.38 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719, - "details": { - "description": "min=0.719, mean=0.719, max=0.719, sum=1.437 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.124, mean=0.124, max=0.124, sum=0.248 (2)", - "tab": "Efficiency", - "score": 0.12411851851851854 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=400.081, mean=400.081, max=400.081, sum=800.163 (2)", - 
"tab": "General information", - "score": 400.0814814814815 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.608, - "details": { - "description": "min=0.608, mean=0.608, max=0.608, sum=1.216 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.133, mean=0.133, max=0.133, sum=0.265 (2)", - "tab": "Efficiency", - "score": 0.13258 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.126, mean=0.126, max=0.126, sum=0.252 (2)", - "tab": "Efficiency", - "score": 0.12590277777777775 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.274 (2)", - "tab": "Efficiency", - "score": 0.13685 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.134, mean=0.134, max=0.134, sum=0.268 (2)", - "tab": "Efficiency", - "score": 0.13410999999999995 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.258 (2)", - "tab": "Efficiency", - "score": 0.12883815028901727 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.258 (2)", - "tab": "Efficiency", - "score": 0.12883333333333336 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=573.4, mean=573.4, max=573.4, sum=1146.8 (2)", - "tab": "General information", - "score": 573.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=510.278, mean=510.278, max=510.278, sum=1020.556 (2)", - "tab": "General information", - "score": 510.27777777777777 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science 
- # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=880.15, mean=880.15, max=880.15, sum=1760.3 (2)", - "tab": "General information", - "score": 880.15 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=639.53, mean=639.53, max=639.53, sum=1279.06 (2)", - "tab": "General information", - "score": 639.53 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=558.301, mean=558.301, max=558.301, sum=1116.601 (2)", - "tab": "General information", - "score": 558.3005780346821 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=517.324, mean=517.324, max=517.324, sum=1034.647 (2)", - "tab": "General information", - "score": 517.3235294117648 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.124, mean=0.124, max=0.124, sum=0.247 (2)", - "tab": "Efficiency", - "score": 0.12359999999999999 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=415.4, mean=415.4, max=415.4, sum=830.8 (2)", - "tab": "General information", - "score": 415.4 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.351 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.263 (2)", - "tab": "Efficiency", - "score": 0.13153508771929825 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=652.07, mean=652.07, max=652.07, sum=1304.14 (2)", - "tab": "General information", - "score": 652.0701754385965 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, 
- "tab": "Accuracy",
- "Global Facts - Observed inference time (s)": {
- "description": "min=0.127, mean=0.127, max=0.127, sum=0.255 (2)",
- "tab": "Efficiency",
- "score": 0.12749
- },
- "Global Facts - # eval": {
- "description": "min=100, mean=100, max=100, sum=200 (2)",
- "tab": "General information",
- "score": 100.0
- },
- "Global Facts - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Global Facts - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Global Facts - # prompt tokens": {
- "description": "min=426.42, mean=426.42, max=426.42, sum=852.84 (2)",
- "tab": "General information",
- "score": 426.42
- },
- "Global Facts - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "global_facts",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_global_facts"
- }
- }
- },
- {
- "evaluation_name": "Jurisprudence",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Jurisprudence",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.852,
- "details": {
- "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)",
- "tab": "Accuracy",
- "Jurisprudence - Observed inference time (s)": {
- "description": "min=0.124, mean=0.124, max=0.124, sum=0.248 (2)",
- "tab": "Efficiency",
- "score": 0.12411111111111109
- },
- "Jurisprudence - # eval": {
- "description": "min=108, mean=108, max=108, sum=216 (2)",
- "tab": "General information",
- "score": 108.0
- },
- "Jurisprudence - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Jurisprudence - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Jurisprudence - # prompt tokens": {
- "description": "min=446.722, mean=446.722, max=446.722, sum=893.444 (2)",
- "tab": "General information",
- "score": 446.72222222222223
- },
- "Jurisprudence - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "jurisprudence",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_jurisprudence"
- }
- }
- },
- {
- "evaluation_name": "Philosophy",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Philosophy",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.817,
- "details": {
- "description": "min=0.817, mean=0.817, max=0.817, sum=1.633 (2)",
- "tab": "Accuracy",
- "Philosophy - Observed inference time (s)": {
- "description": "min=0.121, mean=0.121, max=0.121, sum=0.242 (2)",
- "tab": "Efficiency",
- "score": 0.12122186495176847
- },
"Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=381.704, mean=381.704, max=381.704, sum=763.408 (2)", - "tab": "General information", - "score": 381.7041800643087 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.624 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.139, mean=0.139, max=0.139, sum=0.277 (2)", - "tab": "Efficiency", - "score": 0.13866176470588237 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.126, mean=0.126, max=0.126, sum=0.253 (2)", - "tab": "Efficiency", - "score": 0.1264397163120567 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.143, mean=0.143, max=0.143, sum=0.286 (2)", - "tab": "Efficiency", - "score": 0.14286505867014285 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.124, mean=0.124, max=0.124, sum=0.248 (2)", - "tab": "Efficiency", - "score": 0.12417647058823517 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1146.287, mean=1146.287, max=1146.287, sum=2292.574 (2)", - "tab": "General information", - "score": 1146.2867647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": 
"min=688.72, mean=688.72, max=688.72, sum=1377.44 (2)", - "tab": "General information", - "score": 688.7198581560284 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1686.73, mean=1686.73, max=1686.73, sum=3373.46 (2)", - "tab": "General information", - "score": 1686.7301173402868 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=625.574, mean=625.574, max=625.574, sum=1251.147 (2)", - "tab": "General information", - "score": 625.5735294117648 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.128, mean=0.128, max=0.128, sum=0.256 (2)", - "tab": "Efficiency", - "score": 0.12775000000000003 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=461.12, mean=461.12, max=461.12, sum=922.24 (2)", - "tab": "General information", - "score": 461.12 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, 
- "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=1.724 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.258 (2)", - "tab": "Efficiency", - "score": 0.12905921052631578 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=628.112, mean=628.112, max=628.112, sum=1256.224 (2)", - "tab": "General information", - "score": 628.1118421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.126, mean=0.126, max=0.126, sum=0.252 (2)", - "tab": "Efficiency", - "score": 0.12613000000000005 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=617.46, mean=617.46, max=617.46, sum=1234.92 (2)", - "tab": "General information", - "score": 617.46 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": 
"Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.126, mean=0.126, max=0.126, sum=0.251 (2)", - "tab": "Efficiency", - "score": 0.1255018867924528 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=451.925, mean=451.925, max=451.925, sum=903.849 (2)", - "tab": "General information", - "score": 451.92452830188677 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.591 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.115, mean=0.115, max=0.115, sum=0.23 (2)", - "tab": "Efficiency", - "score": 0.11518723404255315 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=341.723, mean=341.723, max=341.723, sum=683.447 (2)", - "tab": "General information", - "score": 341.72340425531917 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.559 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.232 (2)", - "tab": "Efficiency", - "score": 0.11609655172413792 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=458.345, mean=458.345, max=458.345, sum=916.69 (2)", - "tab": "General information", - "score": 458.3448275862069 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757, - "details": { - "description": "min=0.757, mean=0.757, max=0.757, sum=1.513 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.126, mean=0.126, max=0.126, sum=0.253 (2)", - "tab": "Efficiency", - "score": 0.12626455026455036 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=534.09, mean=534.09, max=534.09, sum=1068.18 (2)", - "tab": "General information", - "score": 534.0899470899471 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ 
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.257 (2)", - "tab": "Efficiency", - "score": 0.12850793650793654 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=669, mean=669, max=669, sum=1338 (2)", - "tab": "General information", - "score": 669.0 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.122, mean=0.122, max=0.122, sum=0.244 (2)", - "tab": "Efficiency", - "score": 0.12203870967741924 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.127, mean=0.127, max=0.127, sum=0.254 (2)", - "tab": "Efficiency", - "score": 0.1271921182266009 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.136, mean=0.136, max=0.136, sum=0.271 (2)", - "tab": "Efficiency", - "score": 0.13555999999999999 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.348 (2)", - "tab": "Efficiency", - "score": 0.1741696969696969 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.123, mean=0.123, max=0.123, sum=0.245 (2)", - "tab": "Efficiency", - "score": 0.1226313131313131 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.125, mean=0.125, max=0.125, sum=0.251 (2)", - "tab": "Efficiency", - "score": 0.12531606217616578 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.121, mean=0.121, max=0.121, sum=0.242 (2)", - "tab": "Efficiency", - "score": 0.12077948717948701 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.126, mean=0.126, 
- "tab": "Efficiency",
- "score": 0.1257444444444444
- },
- "High School Microeconomics - Observed inference time (s)": {
- "description": "min=0.123, mean=0.123, max=0.123, sum=0.247 (2)",
- "tab": "Efficiency",
- "score": 0.12331512605042017
- },
- "High School Physics - Observed inference time (s)": {
- "description": "min=0.128, mean=0.128, max=0.128, sum=0.256 (2)",
- "tab": "Efficiency",
- "score": 0.1282052980132451
- },
- "High School Psychology - Observed inference time (s)": {
- "description": "min=0.123, mean=0.123, max=0.123, sum=0.246 (2)",
- "tab": "Efficiency",
- "score": 0.12288256880733935
- },
- "High School Statistics - Observed inference time (s)": {
- "description": "min=0.13, mean=0.13, max=0.13, sum=0.261 (2)",
- "tab": "Efficiency",
- "score": 0.13030555555555556
- },
- "High School US History - Observed inference time (s)": {
- "description": "min=0.161, mean=0.161, max=0.161, sum=0.322 (2)",
- "tab": "Efficiency",
- "score": 0.16099019607843132
- },
- "High School World History - Observed inference time (s)": {
- "description": "min=0.146, mean=0.146, max=0.146, sum=0.293 (2)",
- "tab": "Efficiency",
- "score": 0.14643881856540092
- },
- "High School Biology - # eval": {
- "description": "min=310, mean=310, max=310, sum=620 (2)",
- "tab": "General information",
- "score": 310.0
- },
- "High School Biology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Biology - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Biology - # prompt tokens": {
- "description": "min=568.748, mean=568.748, max=568.748, sum=1137.497 (2)",
- "tab": "General information",
- "score": 568.7483870967742
- },
- "High School Biology - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Chemistry - # eval": {
- "description": "min=203, mean=203, max=203, sum=406 (2)",
- "tab": "General information",
- "score": 203.0
- },
- "High School Chemistry - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Chemistry - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Chemistry - # prompt tokens": {
- "description": "min=523.65, mean=523.65, max=523.65, sum=1047.3 (2)",
- "tab": "General information",
- "score": 523.6502463054187
- },
- "High School Chemistry - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Computer Science - # eval": {
- "description": "min=100, mean=100, max=100, sum=200 (2)",
- "tab": "General information",
- "score": 100.0
- },
- "High School Computer Science - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Computer Science - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Computer Science - # prompt tokens": {
- "description": "min=932.15, mean=932.15, max=932.15, sum=1864.3 (2)",
- "tab": "General information",
- "score": 932.15
- },
- "High School Computer Science - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School European History - # eval": {
- "description": "min=165, mean=165, max=165, sum=330 (2)",
- "tab": "General information",
- "score": 165.0
- },
- "High School European History - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School European History - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School European History - # prompt tokens": {
- "description": "min=2872.03, mean=2872.03, max=2872.03, sum=5744.061 (2)",
- "tab": "General information",
- "score": 2872.030303030303
- },
- "High School European History - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Geography - # eval": {
- "description": "min=198, mean=198, max=198, sum=396 (2)",
- "tab": "General information",
- "score": 198.0
- },
- "High School Geography - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Geography - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Geography - # prompt tokens": {
- "description": "min=425.646, mean=425.646, max=425.646, sum=851.293 (2)",
- "tab": "General information",
- "score": 425.64646464646466
- },
- "High School Geography - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Government And Politics - # eval": {
- "description": "min=193, mean=193, max=193, sum=386 (2)",
- "tab": "General information",
- "score": 193.0
- },
- "High School Government And Politics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Government And Politics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Government And Politics - # prompt tokens": {
- "description": "min=506.073, mean=506.073, max=506.073, sum=1012.145 (2)",
- "tab": "General information",
- "score": 506.07253886010363
- },
- "High School Government And Politics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Macroeconomics - # eval": {
- "description": "min=390, mean=390, max=390, sum=780 (2)",
- "tab": "General information",
- "score": 390.0
- },
- "High School Macroeconomics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Macroeconomics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Macroeconomics - # prompt tokens": {
- "description": "min=419.987, mean=419.987, max=419.987, sum=839.974 (2)",
- "tab": "General information",
- "score": 419.9871794871795
- },
- "High School Macroeconomics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Mathematics - # eval": {
- "description": "min=270, mean=270, max=270, sum=540 (2)",
- "tab": "General information",
- "score": 270.0
- },
- "High School Mathematics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Mathematics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Mathematics - # prompt tokens": {
- "description": "min=554.352, mean=554.352, max=554.352, sum=1108.704 (2)",
- "tab": "General information",
- "score": 554.3518518518518
- },
- "High School Mathematics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Microeconomics - # eval": {
- "description": "min=238, mean=238, max=238, sum=476 (2)",
- "tab": "General information",
- "score": 238.0
- },
- "High School Microeconomics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Microeconomics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Microeconomics - # prompt tokens": {
- "description": "min=439.055, mean=439.055, max=439.055, sum=878.109 (2)",
- "tab": "General information",
- "score": 439.0546218487395
- },
- "High School Microeconomics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Physics - # eval": {
- "description": "min=151, mean=151, max=151, sum=302 (2)",
- "tab": "General information",
- "score": 151.0
- },
- "High School Physics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Physics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Physics - # prompt tokens": {
- "description": "min=581.669, mean=581.669, max=581.669, sum=1163.338 (2)",
- "tab": "General information",
- "score": 581.6688741721854
- },
- "High School Physics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Psychology - # eval": {
- "description": "min=545, mean=545, max=545, sum=1090 (2)",
- "tab": "General information",
- "score": 545.0
- },
- "High School Psychology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Psychology - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Psychology - # prompt tokens": {
- "description": "min=544.842, mean=544.842, max=544.842, sum=1089.684 (2)",
- "tab": "General information",
- "score": 544.8422018348624
- },
- "High School Psychology - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Statistics - # eval": {
- "description": "min=216, mean=216, max=216, sum=432 (2)",
- "tab": "General information",
- "score": 216.0
- },
- "High School Statistics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Statistics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Statistics - # prompt tokens": {
- "description": "min=833, mean=833, max=833, sum=1666 (2)",
- "tab": "General information",
- "score": 833.0
- },
- "High School Statistics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School US History - # eval": {
- "description": "min=204, mean=204, max=204, sum=408 (2)",
- "tab": "General information",
- "score": 204.0
- },
- "High School US History - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School US History - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School US History - # prompt tokens": {
- "description": "min=2270.25, mean=2270.25, max=2270.25, sum=4540.5 (2)",
- "tab": "General information",
- "score": 2270.25
- },
- "High School US History - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School World History - # eval": {
- "description": "min=237, mean=237, max=237, sum=474 (2)",
- "tab": "General information",
- "score": 237.0
- },
- "High School World History - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School World History - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School World History - # prompt tokens": {
- "description": "min=1466.561, mean=1466.561, max=1466.561, sum=2933.122 (2)",
- "tab": "General information",
- "score": 1466.5611814345991
- },
- "High School World History - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "high_school_world_history",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_high_school_world_history"
- }
- }
- },
- {
- "evaluation_name": "Human Sexuality",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Human Sexuality",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.84,
- "details": {
- "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)",
- "tab": "Accuracy",
- "Human Aging - Observed inference time (s)": {
- "description": "min=0.113, mean=0.113, max=0.113, sum=0.227 (2)",
- "tab": "Efficiency",
- "score": 0.11326008968609867
- },
- "Human Sexuality - Observed inference time (s)": {
- "description": "min=0.118, mean=0.118, max=0.118, sum=0.236 (2)",
- "tab": "Efficiency",
- "score": 0.11813740458015273
- },
- "Human Aging - # eval": {
- "description": "min=223, mean=223, max=223, sum=446 (2)",
- "tab": "General information",
- "score": 223.0
- },
- "Human Aging - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Human Aging - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Human Aging - # prompt tokens": {
- "description": "min=352.48, mean=352.48, max=352.48, sum=704.96 (2)",
- "tab": "General information",
- "score": 352.47982062780267
- },
- "Human Aging - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "Human Sexuality - # eval": {
eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=385.626, mean=385.626, max=385.626, sum=771.252 (2)", - "tab": "General information", - "score": 385.62595419847327 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.258 (2)", - "tab": "Efficiency", - "score": 0.129206611570248 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=667.843, mean=667.843, max=667.843, sum=1335.686 (2)", - "tab": "General information", - "score": 667.8429752066115 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.124, mean=0.124, max=0.124, sum=0.249 (2)", - "tab": "Efficiency", - "score": 0.12445398773006137 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - 
- },
- "Logical Fallacies - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Logical Fallacies - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Logical Fallacies - # prompt tokens": {
- "description": "min=482.227, mean=482.227, max=482.227, sum=964.454 (2)",
- "tab": "General information",
- "score": 482.2269938650307
- },
- "Logical Fallacies - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "logical_fallacies",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_logical_fallacies"
- }
- }
- },
- {
- "evaluation_name": "Machine Learning",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Machine Learning",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.509,
- "details": {
- "description": "min=0.509, mean=0.509, max=0.509, sum=1.018 (2)",
- "tab": "Accuracy",
- "Machine Learning - Observed inference time (s)": {
- "description": "min=0.135, mean=0.135, max=0.135, sum=0.27 (2)",
- "tab": "Efficiency",
- "score": 0.13516071428571433
- },
- "Machine Learning - # eval": {
- "description": "min=112, mean=112, max=112, sum=224 (2)",
- "tab": "General information",
- "score": 112.0
- },
- "Machine Learning - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Machine Learning - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Machine Learning - # prompt tokens": {
- "description": "min=699.598, mean=699.598, max=699.598, sum=1399.196 (2)",
- "tab": "General information",
- "score": 699.5982142857143
- },
- "Machine Learning - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "machine_learning",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_machine_learning"
- }
- }
- },
- {
- "evaluation_name": "Management",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Management",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.864,
- "details": {
- "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)",
- "tab": "Accuracy",
- "Management - Observed inference time (s)": {
- "description": "min=0.118, mean=0.118, max=0.118, sum=0.237 (2)",
- "tab": "Efficiency",
- "score": 0.1183980582524272
- },
- "Management - # eval": {
- "description": "min=103, mean=103, max=103, sum=206 (2)",
- "tab": "General information",
- "score": 103.0
- },
- "Management - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
"Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=320.34, mean=320.34, max=320.34, sum=640.68 (2)", - "tab": "General information", - "score": 320.3398058252427 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.122, mean=0.122, max=0.122, sum=0.243 (2)", - "tab": "Efficiency", - "score": 0.12151282051282052 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=466.697, mean=466.697, max=466.697, sum=933.393 (2)", - "tab": "General information", - "score": 466.6965811965812 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.115, mean=0.115, max=0.115, sum=0.23 (2)", - "tab": "Efficiency", - "score": 0.11518 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=380.71, mean=380.71, max=380.71, sum=761.42 (2)", 
- "tab": "General information", - "score": 380.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.872, mean=0.872, max=0.872, sum=1.745 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.114, mean=0.114, max=0.114, sum=0.227 (2)", - "tab": "Efficiency", - "score": 0.11356577266922054 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=342.847, mean=342.847, max=342.847, sum=685.693 (2)", - "tab": "General information", - "score": 342.84674329501917 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.388 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.125, mean=0.125, max=0.125, sum=0.249 (2)", - "tab": "Efficiency", - "score": 0.12473699421965324 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.124, mean=0.124, max=0.124, sum=0.247 (2)", - "tab": "Efficiency", - "score": 0.12357988826815636 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=525.329, mean=525.329, max=525.329, sum=1050.659 
(2)", - "tab": "General information", - "score": 525.3294797687861 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=692.482, mean=692.482, max=692.482, sum=1384.963 (2)", - "tab": "General information", - "score": 692.4815642458101 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.575 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.124, mean=0.124, max=0.124, sum=0.247 (2)", - "tab": "Efficiency", - "score": 0.12373529411764701 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=612.69, mean=612.69, max=612.69, sum=1225.379 (2)", - "tab": "General information", - "score": 612.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.258 (2)", - "tab": "Efficiency", - "score": 0.1291882716049382 - }, - 
"Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=552.454, mean=552.454, max=552.454, sum=1104.907 (2)", - "tab": "General information", - "score": 552.4537037037037 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.364 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.12, mean=0.12, max=0.12, sum=0.241 (2)", - "tab": "Efficiency", - "score": 0.1202636363636364 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=448.609, mean=448.609, max=448.609, sum=897.218 (2)", - "tab": "General information", - "score": 448.6090909090909 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.576 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.273 (2)", - "tab": "Efficiency", - "score": 0.13666530612244904 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1224.433, mean=1224.433, max=1224.433, sum=2448.865 (2)", - "tab": "General information", - "score": 1224.4326530612245 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.896, - "details": { - "description": "min=0.896, mean=0.896, max=0.896, sum=1.791 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.121, mean=0.121, max=0.121, sum=0.241 (2)", - "tab": "Efficiency", - "score": 0.12068656716417903 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=474.512, mean=474.512, max=474.512, sum=949.025 (2)", - "tab": "General information", - "score": 474.5124378109453 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542, - "details": { - "description": "min=0.542, mean=0.542, max=0.542, sum=1.084 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.114, mean=0.114, max=0.114, sum=0.227 (2)", - "tab": "Efficiency", - "score": 0.113578313253012 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=380.753, mean=380.753, max=380.753, sum=761.506 (2)", - "tab": "General information", - "score": 380.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.114, mean=0.114, max=0.114, sum=0.229 (2)", - "tab": "Efficiency", - "score": 0.11440935672514624 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=305.386, mean=305.386, max=305.386, sum=610.772 (2)", - "tab": "General information", - "score": 305.3859649122807 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.987, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json b/data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json deleted file mode 100644 index ab9b8c843..000000000 --- a/data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/amazon_nova-micro-v1:0/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Amazon Nova Micro", - "id": "amazon/nova-micro-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.708, - "details": { - "description": "min=0.42, mean=0.708, max=0.922, sum=80.671 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.102, mean=0.114, max=0.152, sum=13.049 (114)", - "tab": "Efficiency", - "score": 0.1144634124237814 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=305.386, mean=655.489, max=2872.03, sum=74725.746 (114)", - "tab": "General information", - "score": 655.4890026560713 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0.999, mean=1.0, max=1, sum=113.997 (114)", - "tab": "General information", - "score": 0.9999775940489795 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - 
"mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42, - "details": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.117, mean=0.117, max=0.117, sum=0.234 (2)", - "tab": "Efficiency", - "score": 0.11696000000000005 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=399.38, mean=399.38, max=399.38, sum=798.76 (2)", - "tab": "General information", - "score": 399.38 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.452 (2)", - "tab": 
"Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.107, mean=0.107, max=0.107, sum=0.214 (2)", - "tab": "Efficiency", - "score": 0.10704444444444451 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=400.081, mean=400.081, max=400.081, sum=800.163 (2)", - "tab": "General information", - "score": 400.0814814814815 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.118, mean=0.118, max=0.118, sum=0.235 (2)", - "tab": "Efficiency", - "score": 0.11762000000000004 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.118, mean=0.118, max=0.118, sum=0.237 (2)", - "tab": "Efficiency", - "score": 0.11843055555555557 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.125, mean=0.125, max=0.125, sum=0.25 (2)", - "tab": "Efficiency", - "score": 0.12490000000000004 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.122, mean=0.122, max=0.122, sum=0.244 (2)", - "tab": "Efficiency", - "score": 0.12207000000000001 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.233 (2)", - "tab": "Efficiency", - "score": 0.11635838150289027 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.115, mean=0.115, max=0.115, sum=0.229 (2)", - "tab": "Efficiency", - "score": 0.11473529411764712 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=573.4, mean=573.4, max=573.4, sum=1146.8 (2)", - "tab": "General information", - "score": 573.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, 
max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=510.278, mean=510.278, max=510.278, sum=1020.556 (2)", - "tab": "General information", - "score": 510.27777777777777 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=880.15, mean=880.15, max=880.15, sum=1760.3 (2)", - "tab": "General information", - "score": 880.15 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=639.53, mean=639.53, max=639.53, sum=1279.06 (2)", - "tab": "General information", - "score": 639.53 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=558.301, mean=558.301, max=558.301, sum=1116.601 (2)", - "tab": "General information", - "score": 558.3005780346821 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=517.324, mean=517.324, max=517.324, sum=1034.647 (2)", - "tab": "General information", - 
"score": 517.3235294117648 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.115, mean=0.115, max=0.115, sum=0.231 (2)", - "tab": "Efficiency", - "score": 0.11527000000000003 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=415.4, mean=415.4, max=415.4, sum=830.8 (2)", - "tab": "General information", - "score": 415.4 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.231 (2)", - "tab": "Efficiency", - "score": 0.11560526315789472 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=652.07, mean=652.07, max=652.07, sum=1304.14 (2)", - "tab": "General information", - "score": 652.0701754385965 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - 
}, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44, - "details": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.115, mean=0.115, max=0.115, sum=0.231 (2)", - "tab": "Efficiency", - "score": 0.11540999999999998 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=426.42, mean=426.42, max=426.42, sum=852.84 (2)", - "tab": "General information", - "score": 426.42 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=1.63 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.223 (2)", - "tab": "Efficiency", - "score": 0.11141666666666669 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=446.722, mean=446.722, max=446.722, sum=893.444 (2)", - "tab": "General information", - "score": 446.72222222222223 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": 
"Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.107, mean=0.107, max=0.107, sum=0.214 (2)", - "tab": "Efficiency", - "score": 0.10707717041800643 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=381.704, mean=381.704, max=381.704, sum=763.408 (2)", - "tab": "General information", - "score": 381.7041800643087 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.739, mean=0.739, max=0.739, sum=1.477 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.127, mean=0.127, max=0.127, sum=0.255 (2)", - "tab": "Efficiency", - "score": 0.12727573529411765 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.117, mean=0.117, max=0.117, sum=0.234 (2)", - "tab": "Efficiency", - "score": 0.11683687943262412 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.128, mean=0.128, max=0.128, sum=0.256 (2)", - "tab": "Efficiency", - "score": 0.1279393741851367 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.221 (2)", - "tab": "Efficiency", - "score": 0.11058333333333302 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1146.287, mean=1146.287, max=1146.287, sum=2292.574 (2)", - "tab": "General 
information", - "score": 1146.2867647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=688.72, mean=688.72, max=688.72, sum=1377.44 (2)", - "tab": "General information", - "score": 688.7198581560284 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1686.73, mean=1686.73, max=1686.73, sum=3373.46 (2)", - "tab": "General information", - "score": 1686.7301173402868 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=625.574, mean=625.574, max=625.574, sum=1251.147 (2)", - "tab": "General information", - "score": 625.5735294117648 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.113, mean=0.113, max=0.113, sum=0.226 (2)", - "tab": "Efficiency", - "score": 0.11315000000000004 - }, - "Us Foreign Policy - # eval": { - 
"description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=461.12, mean=461.12, max=461.12, sum=922.24 (2)", - "tab": "General information", - "score": 461.12 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.645 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.232 (2)", - "tab": "Efficiency", - "score": 0.11597368421052637 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=628.112, mean=628.112, max=628.112, sum=1256.224 (2)", - "tab": "General information", - "score": 628.1118421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.118, mean=0.118, max=0.118, sum=0.237 (2)", - "tab": "Efficiency", - "score": 0.11840000000000003 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=617.46, mean=617.46, max=617.46, sum=1234.92 (2)", - "tab": "General information", - "score": 617.46 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.115, mean=0.115, max=0.115, sum=0.23 (2)", - "tab": "Efficiency", - "score": 0.11494716981132078 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=451.925, mean=451.925, max=451.925, sum=903.849 (2)", - "tab": "General information", - "score": 451.92452830188677 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.413 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.105, mean=0.105, max=0.105, sum=0.21 (2)", - "tab": "Efficiency", - "score": 0.10520000000000002 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=341.723, mean=341.723, max=341.723, sum=683.447 (2)", - "tab": "General information", - "score": 341.72340425531917 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.683, mean=0.683, max=0.683, sum=1.366 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.109, mean=0.109, max=0.109, sum=0.218 (2)", - "tab": "Efficiency", - "score": 0.10906896551724135 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=458.345, mean=458.345, max=458.345, sum=916.69 (2)", - "tab": "General information", - "score": 458.3448275862069 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=1.101 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.232 (2)", - "tab": "Efficiency", - "score": 0.11621164021164002 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=534.09, mean=534.09, max=534.09, sum=1068.18 (2)", - "tab": "General information", - "score": 534.0899470899471 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508, - "details": { - "description": "min=0.508, mean=0.508, max=0.508, sum=1.016 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.113, mean=0.113, max=0.113, sum=0.226 (2)", - "tab": "Efficiency", - "score": 0.112968253968254 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=669, mean=669, max=669, sum=1338 (2)", - "tab": "General information", - "score": 669.0 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.112, mean=0.112, max=0.112, sum=0.224 (2)", - "tab": "Efficiency", - "score": 0.11209354838709669 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.113, mean=0.113, max=0.113, sum=0.226 (2)", - "tab": "Efficiency", - "score": 0.11317733990147788 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.12, mean=0.12, max=0.12, sum=0.24 (2)", - "tab": "Efficiency", - "score": 0.11999000000000004 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.152, mean=0.152, max=0.152, sum=0.303 (2)", - "tab": "Efficiency", 
- "score": 0.1516909090909091 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.11, mean=0.11, max=0.11, sum=0.22 (2)", - "tab": "Efficiency", - "score": 0.11011616161616171 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.108, mean=0.108, max=0.108, sum=0.216 (2)", - "tab": "Efficiency", - "score": 0.10789637305699486 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.11, mean=0.11, max=0.11, sum=0.221 (2)", - "tab": "Efficiency", - "score": 0.11032307692307693 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.113, mean=0.113, max=0.113, sum=0.226 (2)", - "tab": "Efficiency", - "score": 0.11290000000000003 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.11, mean=0.11, max=0.11, sum=0.219 (2)", - "tab": "Efficiency", - "score": 0.10956302521008413 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.231 (2)", - "tab": "Efficiency", - "score": 0.11561589403973516 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.11, mean=0.11, max=0.11, sum=0.22 (2)", - "tab": "Efficiency", - "score": 0.11005137614678874 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.233 (2)", - "tab": "Efficiency", - "score": 0.11631018518518522 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.139, mean=0.139, max=0.139, sum=0.279 (2)", - "tab": "Efficiency", - "score": 0.13944117647058826 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.134, mean=0.134, max=0.134, sum=0.268 (2)", - "tab": "Efficiency", - "score": 0.13399578059071726 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=568.748, mean=568.748, max=568.748, sum=1137.497 (2)", - "tab": "General information", - "score": 568.7483870967742 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=523.65, mean=523.65, max=523.65, sum=1047.3 (2)", - "tab": "General information", - "score": 523.6502463054187 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, 
sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=932.15, mean=932.15, max=932.15, sum=1864.3 (2)", - "tab": "General information", - "score": 932.15 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2872.03, mean=2872.03, max=2872.03, sum=5744.061 (2)", - "tab": "General information", - "score": 2872.030303030303 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=425.646, mean=425.646, max=425.646, sum=851.293 (2)", - "tab": "General information", - "score": 425.64646464646466 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=506.073, mean=506.073, max=506.073, sum=1012.145 (2)", - "tab": "General information", - "score": 506.07253886010363 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=419.987, mean=419.987, max=419.987, sum=839.974 (2)", - "tab": "General information", - "score": 419.9871794871795 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=554.352, mean=554.352, max=554.352, sum=1108.704 (2)", - "tab": "General information", - "score": 554.3518518518518 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=439.055, mean=439.055, max=439.055, sum=878.109 (2)", - "tab": "General information", - "score": 439.0546218487395 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=581.669, mean=581.669, max=581.669, sum=1163.338 (2)", - "tab": "General information", - "score": 581.6688741721854 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=544.842, mean=544.842, max=544.842, sum=1089.684 (2)", - "tab": "General information", - "score": 544.8422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=833, mean=833, max=833, sum=1666 (2)", - "tab": "General information", - "score": 833.0 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2270.25, mean=2270.25, max=2270.25, sum=4540.5 (2)", - "tab": "General information", - "score": 2270.25 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1466.561, mean=1466.561, max=1466.561, sum=2933.122 (2)", - "tab": "General information", - "score": 1466.5611814345991 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.649 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.104, mean=0.104, max=0.104, sum=0.208 (2)", - "tab": "Efficiency", - "score": 0.10423766816143511 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.112, mean=0.112, max=0.112, sum=0.224 (2)", - "tab": "Efficiency", - "score": 0.11212213740458017 - }, - "Human Aging - # eval": { - "description": 
"min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=352.48, mean=352.48, max=352.48, sum=704.96 (2)", - "tab": "General information", - "score": 352.47982062780267 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=385.626, mean=385.626, max=385.626, sum=771.252 (2)", - "tab": "General information", - "score": 385.62595419847327 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.221 (2)", - "tab": "Efficiency", - "score": 0.11063636363636367 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=667.843, mean=667.843, max=667.843, sum=1335.686 (2)", - "tab": "General information", - "score": 667.8429752066115 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "details": { - "description": "min=0.798, mean=0.798, max=0.798, sum=1.595 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.221 (2)", - "tab": "Efficiency", - "score": 0.11058895705521476 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=482.227, mean=482.227, max=482.227, sum=964.454 (2)", - "tab": "General information", - "score": 482.2269938650307 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.125 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.115, mean=0.115, max=0.115, sum=0.231 (2)", - "tab": "Efficiency", - "score": 0.11541964285714289 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=699.598, mean=699.598, max=699.598, sum=1399.196 (2)", - "tab": "General information", - "score": 699.5982142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.631 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.102, mean=0.102, max=0.102, sum=0.205 (2)", - "tab": "Efficiency", - "score": 0.10230097087378638 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=320.34, mean=320.34, max=320.34, sum=640.68 (2)", - "tab": "General information", - "score": 320.3398058252427 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.112, mean=0.112, max=0.112, sum=0.223 (2)", - "tab": "Efficiency", - "score": 0.11152136752136761 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=466.697, mean=466.697, max=466.697, sum=933.393 (2)", - "tab": "General information", - "score": 466.6965811965812 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": 
"min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.106, mean=0.106, max=0.106, sum=0.212 (2)", - "tab": "Efficiency", - "score": 0.10620000000000003 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=380.71, mean=380.71, max=380.71, sum=761.42 (2)", - "tab": "General information", - "score": 380.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.105, mean=0.105, max=0.105, sum=0.21 (2)", - "tab": "Efficiency", - "score": 0.10505236270753474 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=342.847, mean=342.847, max=342.847, sum=685.693 (2)", - "tab": "General information", - "score": 342.84674329501917 - }, - "Miscellaneous - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=1.997 (2)", - "tab": "General information", - "score": 0.9987228607918263 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.464, - "details": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.927 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": 
"min=0.112, mean=0.112, max=0.112, sum=0.225 (2)", - "tab": "Efficiency", - "score": 0.11246242774566474 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.112, mean=0.112, max=0.112, sum=0.223 (2)", - "tab": "Efficiency", - "score": 0.11168156424580966 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=525.329, mean=525.329, max=525.329, sum=1050.659 (2)", - "tab": "General information", - "score": 525.3294797687861 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=692.482, mean=692.482, max=692.482, sum=1384.963 (2)", - "tab": "General information", - "score": 692.4815642458101 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.109, mean=0.109, max=0.109, sum=0.219 (2)", - "tab": "Efficiency", - "score": 0.1093660130718955 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=612.69, mean=612.69, max=612.69, sum=1225.379 (2)", - "tab": "General information", - "score": 612.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.787, - "details": { - "description": "min=0.787, mean=0.787, max=0.787, sum=1.574 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.11, mean=0.11, max=0.11, sum=0.22 (2)", - "tab": "Efficiency", - "score": 0.1099814814814816 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=552.454, mean=552.454, max=552.454, sum=1104.907 (2)", - "tab": "General information", - "score": 552.4537037037037 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.345 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.108, mean=0.108, max=0.108, sum=0.215 (2)", - "tab": "Efficiency", - "score": 0.1075000000000001 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=448.609, mean=448.609, max=448.609, sum=897.218 (2)", - "tab": "General information", - "score": 448.6090909090909 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.437 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.122, mean=0.122, max=0.122, sum=0.244 (2)", - "tab": "Efficiency", - "score": 0.12202448979591832 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1224.433, mean=1224.433, max=1224.433, sum=2448.865 (2)", - "tab": "General information", - "score": 1224.4326530612245 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.846, - "details": { - "description": "min=0.846, mean=0.846, max=0.846, sum=1.692 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.11, mean=0.11, max=0.11, sum=0.221 (2)", - "tab": "Efficiency", - "score": 0.11042288557213926 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=474.512, mean=474.512, max=474.512, sum=949.025 (2)", - "tab": "General information", - "score": 474.5124378109453 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "description": "min=0.524, mean=0.524, max=0.524, sum=1.048 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.104, mean=0.104, max=0.104, sum=0.209 (2)", - "tab": "Efficiency", - "score": 0.10432530120481927 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=380.753, mean=380.753, max=380.753, sum=761.506 (2)", - "tab": "General information", - "score": 380.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.649 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.104, mean=0.104, max=0.104, sum=0.208 (2)", - "tab": "Efficiency", - "score": 0.10395321637426902 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=305.386, mean=305.386, max=305.386, sum=610.772 (2)", - "tab": "General information", - "score": 305.3859649122807 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 1.0, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json b/data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json deleted file mode 100644 index af30c4448..000000000 --- a/data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/amazon_nova-pro-v1:0/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Pro", - "id": "amazon/nova-pro-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.54, mean=0.82, max=0.974, sum=93.477 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.128, mean=0.14, max=0.17, sum=15.944 (114)", - "tab": "Efficiency", - "score": 0.13986169479756677 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=305.386, mean=655.489, max=2872.03, sum=74725.746 (114)", - "tab": "General information", - "score": 655.4890026560713 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", 
- "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.152, mean=0.152, max=0.152, sum=0.305 (2)", - "tab": "Efficiency", - "score": 0.15239000000000003 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=399.38, mean=399.38, max=399.38, sum=798.76 (2)", - "tab": "General information", - "score": 399.38 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.615 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.138, mean=0.138, max=0.138, sum=0.275 (2)", - "tab": "Efficiency", - "score": 0.13757037037037034 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=400.081, mean=400.081, max=400.081, sum=800.163 (2)", - "tab": "General information", - "score": 400.0814814814815 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.647, - "details": { - "description": "min=0.647, mean=0.647, max=0.647, sum=1.294 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.148, mean=0.148, max=0.148, sum=0.296 (2)", - "tab": "Efficiency", - "score": 0.14806999999999998 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.148, mean=0.148, max=0.148, sum=0.296 (2)", - "tab": "Efficiency", - "score": 0.14820138888888884 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.152, mean=0.152, max=0.152, sum=0.305 (2)", - "tab": "Efficiency", - "score": 0.15245 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.151, mean=0.151, max=0.151, sum=0.303 (2)", - "tab": "Efficiency", - "score": 0.15141 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.143, mean=0.143, max=0.143, sum=0.287 (2)", - "tab": "Efficiency", - "score": 0.1433988439306358 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.146, mean=0.146, max=0.146, sum=0.292 (2)", - "tab": "Efficiency", - "score": 0.14623529411764705 - }, - "College Chemistry - # eval": 
{ - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=573.4, mean=573.4, max=573.4, sum=1146.8 (2)", - "tab": "General information", - "score": 573.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=510.278, mean=510.278, max=510.278, sum=1020.556 (2)", - "tab": "General information", - "score": 510.27777777777777 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=880.15, mean=880.15, max=880.15, sum=1760.3 (2)", - "tab": "General information", - "score": 880.15 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=639.53, mean=639.53, max=639.53, sum=1279.06 (2)", - "tab": "General information", - "score": 639.53 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=558.301, mean=558.301, max=558.301, sum=1116.601 (2)", - "tab": 
"General information", - "score": 558.3005780346821 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=517.324, mean=517.324, max=517.324, sum=1034.647 (2)", - "tab": "General information", - "score": 517.3235294117648 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.141, mean=0.141, max=0.141, sum=0.281 (2)", - "tab": "Efficiency", - "score": 0.14067000000000005 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=415.4, mean=415.4, max=415.4, sum=830.8 (2)", - "tab": "General information", - "score": 415.4 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.142, mean=0.142, max=0.142, sum=0.285 (2)", - "tab": 
"Efficiency", - "score": 0.1423421052631579 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=652.07, mean=652.07, max=652.07, sum=1304.14 (2)", - "tab": "General information", - "score": 652.0701754385965 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54, - "details": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.08 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.142, mean=0.142, max=0.142, sum=0.283 (2)", - "tab": "Efficiency", - "score": 0.14153999999999997 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=426.42, mean=426.42, max=426.42, sum=852.84 (2)", - "tab": "General information", - "score": 426.42 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.141, mean=0.141, max=0.141, sum=0.282 (2)", - "tab": "Efficiency", - "score": 0.14100925925925917 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { 
- "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=446.722, mean=446.722, max=446.722, sum=893.444 (2)", - "tab": "General information", - "score": 446.72222222222223 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.131, mean=0.131, max=0.131, sum=0.261 (2)", - "tab": "Efficiency", - "score": 0.1307266881028939 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=381.704, mean=381.704, max=381.704, sum=763.408 (2)", - "tab": "General information", - "score": 381.7041800643087 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.729 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.145, mean=0.145, max=0.145, sum=0.291 (2)", - "tab": "Efficiency", - "score": 0.14530882352941174 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.139, mean=0.139, max=0.139, sum=0.278 (2)", - "tab": "Efficiency", - "score": 0.1388758865248228 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.146, mean=0.146, max=0.146, sum=0.292 (2)", - "tab": "Efficiency", - "score": 
0.14584159061277666 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.264 (2)", - "tab": "Efficiency", - "score": 0.13185620915032703 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1146.287, mean=1146.287, max=1146.287, sum=2292.574 (2)", - "tab": "General information", - "score": 1146.2867647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=688.72, mean=688.72, max=688.72, sum=1377.44 (2)", - "tab": "General information", - "score": 688.7198581560284 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1686.73, mean=1686.73, max=1686.73, sum=3373.46 (2)", - "tab": "General information", - "score": 1686.7301173402868 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=625.574, mean=625.574, max=625.574, sum=1251.147 (2)", - "tab": "General information", - "score": 625.5735294117648 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - 
}, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.141, mean=0.141, max=0.141, sum=0.282 (2)", - "tab": "Efficiency", - "score": 0.14117999999999994 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=461.12, mean=461.12, max=461.12, sum=922.24 (2)", - "tab": "General information", - "score": 461.12 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.895, - "details": { - "description": "min=0.895, mean=0.895, max=0.895, sum=1.789 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.141, mean=0.141, max=0.141, sum=0.282 (2)", - "tab": "Efficiency", - "score": 0.1411447368421052 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=628.112, mean=628.112, max=628.112, sum=1256.224 (2)", - "tab": "General information", - "score": 628.1118421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.144, mean=0.144, max=0.144, sum=0.288 (2)", - "tab": "Efficiency", - "score": 0.14414 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=617.46, mean=617.46, max=617.46, sum=1234.92 (2)", - "tab": "General information", - "score": 617.46 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "details": { - "description": "min=0.875, mean=0.875, max=0.875, sum=1.751 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.142, mean=0.142, max=0.142, sum=0.284 (2)", - "tab": "Efficiency", - "score": 0.14190943396226424 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=451.925, mean=451.925, max=451.925, sum=903.849 (2)", - "tab": "General information", - "score": 451.92452830188677 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.264 (2)", - "tab": "Efficiency", - "score": 0.13199148936170213 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=341.723, mean=341.723, max=341.723, sum=683.447 (2)", - "tab": "General information", - "score": 341.72340425531917 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.135, mean=0.135, max=0.135, sum=0.27 (2)", - "tab": "Efficiency", - "score": 0.1350000000000001 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=458.345, mean=458.345, max=458.345, sum=916.69 (2)", - "tab": "General information", - "score": 458.3448275862069 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM 
on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=1.661 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.142, mean=0.142, max=0.142, sum=0.285 (2)", - "tab": "Efficiency", - "score": 0.14232010582010587 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=534.09, mean=534.09, max=534.09, sum=1068.18 (2)", - "tab": "General information", - "score": 534.0899470899471 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714, - "details": { - "description": "min=0.714, mean=0.714, max=0.714, sum=1.429 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.145, mean=0.145, max=0.145, sum=0.29 (2)", - "tab": "Efficiency", - "score": 0.1448888888888889 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=669, mean=669, max=669, sum=1338 (2)", - "tab": "General information", - "score": 669.0 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.857 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.139, mean=0.139, max=0.139, sum=0.278 (2)", - "tab": "Efficiency", - "score": 0.13894516129032267 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.139, mean=0.139, max=0.139, sum=0.278 (2)", - "tab": "Efficiency", - "score": 0.13885221674876858 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.148, mean=0.148, max=0.148, sum=0.296 (2)", - "tab": "Efficiency", - "score": 0.1479 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.17, mean=0.17, max=0.17, sum=0.341 (2)", - "tab": "Efficiency", - "score": 0.17033939393939396 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.133, mean=0.133, max=0.133, sum=0.266 (2)", - "tab": "Efficiency", - "score": 0.13296969696969696 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.135, mean=0.135, max=0.135, sum=0.27 (2)", - "tab": "Efficiency", - "score": 0.1351139896373057 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.134, mean=0.134, max=0.134, sum=0.268 (2)", - "tab": "Efficiency", - "score": 0.1338025641025641 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.14, mean=0.14, max=0.14, sum=0.279 (2)", - "tab": "Efficiency", - "score": 0.13964074074074065 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.135, mean=0.135, max=0.135, sum=0.271 (2)", - "tab": "Efficiency", - "score": 0.1353235294117648 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.274 (2)", - "tab": "Efficiency", - "score": 0.13686754966887416 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.136, mean=0.136, max=0.136, sum=0.272 (2)", - "tab": "Efficiency", - "score": 0.13622018348623863 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.143, mean=0.143, max=0.143, sum=0.286 (2)", - "tab": "Efficiency", - "score": 0.14287499999999997 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.158, mean=0.158, max=0.158, sum=0.317 (2)", - "tab": "Efficiency", - "score": 0.15845098039215685 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.152, mean=0.152, max=0.152, sum=0.304 (2)", - "tab": "Efficiency", - "score": 0.151776371308017 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=568.748, mean=568.748, max=568.748, sum=1137.497 (2)", - "tab": "General information", - "score": 568.7483870967742 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School 
Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=523.65, mean=523.65, max=523.65, sum=1047.3 (2)", - "tab": "General information", - "score": 523.6502463054187 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=932.15, mean=932.15, max=932.15, sum=1864.3 (2)", - "tab": "General information", - "score": 932.15 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2872.03, mean=2872.03, max=2872.03, sum=5744.061 (2)", - "tab": "General information", - "score": 2872.030303030303 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=425.646, mean=425.646, max=425.646, sum=851.293 (2)", - "tab": "General information", - "score": 425.64646464646466 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=506.073, mean=506.073, max=506.073, sum=1012.145 (2)", - "tab": "General information", - "score": 506.07253886010363 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=419.987, mean=419.987, max=419.987, sum=839.974 (2)", - "tab": "General information", - "score": 419.9871794871795 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=554.352, mean=554.352, max=554.352, sum=1108.704 (2)", - "tab": "General information", - "score": 554.3518518518518 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=439.055, mean=439.055, max=439.055, sum=878.109 (2)", - "tab": "General information", - "score": 439.0546218487395 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=581.669, mean=581.669, max=581.669, sum=1163.338 (2)", - "tab": "General information", - "score": 581.6688741721854 - }, - "High School Physics - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=544.842, mean=544.842, max=544.842, sum=1089.684 (2)", - "tab": "General information", - "score": 544.8422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=833, mean=833, max=833, sum=1666 (2)", - "tab": "General information", - "score": 833.0 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2270.25, mean=2270.25, max=2270.25, sum=4540.5 (2)", - "tab": "General information", - "score": 2270.25 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1466.561, mean=1466.561, max=1466.561, sum=2933.122 (2)", - "tab": "General information", - "score": 1466.5611814345991 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.771 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.128, mean=0.128, max=0.128, sum=0.257 (2)", - "tab": "Efficiency", - "score": 0.12830044843049326 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.263 (2)", - "tab": "Efficiency", - "score": 0.13163358778625955 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=352.48, mean=352.48, max=352.48, sum=704.96 (2)", - "tab": "General information", - "score": 352.47982062780267 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=385.626, mean=385.626, max=385.626, sum=771.252 (2)", - "tab": "General information", - "score": 385.62595419847327 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.139, mean=0.139, max=0.139, sum=0.277 (2)", - "tab": "Efficiency", - "score": 0.13855371900826452 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 
5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=667.843, mean=667.843, max=667.843, sum=1335.686 (2)", - "tab": "General information", - "score": 667.8429752066115 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.136, mean=0.136, max=0.136, sum=0.272 (2)", - "tab": "Efficiency", - "score": 0.13612269938650304 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=482.227, mean=482.227, max=482.227, sum=964.454 (2)", - "tab": "General information", - "score": 482.2269938650307 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.625, mean=0.625, max=0.625, sum=1.25 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.142, mean=0.142, max=0.142, sum=0.284 (2)", - "tab": "Efficiency", - "score": 0.14183035714285702 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=699.598, mean=699.598, max=699.598, sum=1399.196 (2)", - "tab": "General information", - "score": 699.5982142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=1.845 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.257 (2)", - "tab": "Efficiency", - "score": 0.12854368932038837 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=320.34, mean=320.34, max=320.34, sum=640.68 (2)", - "tab": "General information", - "score": 320.3398058252427 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923, - "details": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.846 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.264 (2)", - "tab": "Efficiency", - "score": 0.13224786324786314 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=466.697, mean=466.697, max=466.697, sum=933.393 (2)", - "tab": "General information", - "score": 466.6965811965812 - }, - 
"Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.133, mean=0.133, max=0.133, sum=0.266 (2)", - "tab": "Efficiency", - "score": 0.13288 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=380.71, mean=380.71, max=380.71, sum=761.42 (2)", - "tab": "General information", - "score": 380.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.912, - "details": { - "description": "min=0.912, mean=0.912, max=0.912, sum=1.824 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.257 (2)", - "tab": "Efficiency", - "score": 0.12866538952745835 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=342.847, mean=342.847, max=342.847, sum=685.693 (2)", - "tab": "General information", - "score": 342.84674329501917 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.135, mean=0.135, max=0.135, sum=0.27 (2)", - "tab": "Efficiency", - "score": 0.1350173410404623 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.138, mean=0.138, max=0.138, sum=0.277 (2)", - "tab": "Efficiency", - "score": 0.13844581005586606 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=525.329, mean=525.329, max=525.329, sum=1050.659 (2)", - "tab": "General information", - "score": 525.3294797687861 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=692.482, mean=692.482, max=692.482, sum=1384.963 (2)", - "tab": "General information", - "score": 692.4815642458101 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.866, - "details": { - "description": "min=0.866, mean=0.866, max=0.866, sum=1.732 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.135, mean=0.135, max=0.135, sum=0.27 (2)", - "tab": "Efficiency", - "score": 0.13503921568627456 - }, - "Nutrition - # eval": { - 
"description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=612.69, mean=612.69, max=612.69, sum=1225.379 (2)", - "tab": "General information", - "score": 612.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.135, mean=0.135, max=0.135, sum=0.271 (2)", - "tab": "Efficiency", - "score": 0.135388888888889 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=552.454, mean=552.454, max=552.454, sum=1104.907 (2)", - "tab": "General information", - "score": 552.4537037037037 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.265 (2)", - "tab": "Efficiency", - "score": 0.13249090909090908 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=448.609, mean=448.609, max=448.609, sum=897.218 (2)", - "tab": "General information", - "score": 448.6090909090909 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.143, mean=0.143, max=0.143, sum=0.285 (2)", - "tab": "Efficiency", - "score": 0.1427142857142858 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1224.433, mean=1224.433, max=1224.433, sum=2448.865 (2)", - "tab": "General information", - "score": 1224.4326530612245 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.811 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.275 (2)", - "tab": "Efficiency", - "score": 0.13738308457711446 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"Sociology - # prompt tokens": { - "description": "min=474.512, mean=474.512, max=474.512, sum=949.025 (2)", - "tab": "General information", - "score": 474.5124378109453 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.181 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.258 (2)", - "tab": "Efficiency", - "score": 0.1290301204819277 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=380.753, mean=380.753, max=380.753, sum=761.506 (2)", - "tab": "General information", - "score": 380.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.128, mean=0.128, max=0.128, sum=0.257 (2)", - "tab": "Efficiency", - "score": 0.12828070175438594 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=305.386, mean=305.386, max=305.386, sum=610.772 (2)", - "tab": "General information", - "score": 305.3859649122807 - }, - "World Religions - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.975, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json b/data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json deleted file mode 100644 index c2616d7f8..000000000 --- a/data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-2.1/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 2.1", - "id": "anthropic/claude-2.1", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.352, mean=0.735, max=0.959, sum=83.762 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=1.934, mean=2.418, max=3.916, sum=275.693 (114)", - "tab": "Efficiency", - "score": 2.4183583522219108 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=358.018, mean=703.288, max=2952.576, sum=80174.875 (114)", - "tab": "General information", - "score": 703.2883793758955 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0.994, mean=1.0, max=1, sum=113.982 (114)", - "tab": "General information", - "score": 0.999841257531982 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - 
"college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": 
"min=2.043, mean=2.043, max=2.043, sum=4.087 (2)", - "tab": "Efficiency", - "score": 2.043452892303467 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=435.26, mean=435.26, max=435.26, sum=870.52 (2)", - "tab": "General information", - "score": 435.26 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.452 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=2.071, mean=2.071, max=2.071, sum=4.142 (2)", - "tab": "Efficiency", - "score": 2.0710925843980577 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=435.8, mean=435.8, max=435.8, sum=871.6 (2)", - "tab": "General information", - "score": 435.8 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=2.579, mean=2.579, max=2.579, sum=5.158 (2)", - "tab": "Efficiency", - "score": 2.579245555400848 - }, - "College Biology - Observed inference time (s)": { - "description": "min=2.209, mean=2.209, max=2.209, sum=4.418 (2)", - "tab": "Efficiency", - "score": 
2.2088319063186646 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=2.413, mean=2.413, max=2.413, sum=4.826 (2)", - "tab": "Efficiency", - "score": 2.4128634238243105 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=2.18, mean=2.18, max=2.18, sum=4.359 (2)", - "tab": "Efficiency", - "score": 2.179708275794983 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=2.324, mean=2.324, max=2.324, sum=4.648 (2)", - "tab": "Efficiency", - "score": 2.3239130339870564 - }, - "College Physics - Observed inference time (s)": { - "description": "min=2.145, mean=2.145, max=2.145, sum=4.289 (2)", - "tab": "Efficiency", - "score": 2.144603039704117 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=615.01, mean=615.01, max=615.01, sum=1230.02 (2)", - "tab": "General information", - "score": 615.01 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=555.347, mean=555.347, max=555.347, sum=1110.694 (2)", - "tab": "General information", - "score": 555.3472222222222 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=903.24, mean=903.24, max=903.24, sum=1806.48 (2)", - "tab": "General information", - "score": 903.24 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=669.19, mean=669.19, 
max=669.19, sum=1338.38 (2)", - "tab": "General information", - "score": 669.19 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=605.63, mean=605.63, max=605.63, sum=1211.26 (2)", - "tab": "General information", - "score": 605.6300578034682 - }, - "College Medicine - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.988 (2)", - "tab": "General information", - "score": 0.9942196531791907 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=554.48, mean=554.48, max=554.48, sum=1108.961 (2)", - "tab": "General information", - "score": 554.4803921568628 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=2.244, mean=2.244, max=2.244, sum=4.487 (2)", - "tab": "Efficiency", - "score": 2.2435835003852844 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=463.62, mean=463.62, max=463.62, sum=927.24 (2)", - "tab": "General information", - "score": 463.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=2.615, mean=2.615, max=2.615, sum=5.23 (2)", - "tab": "Efficiency", - "score": 2.6147566636403403 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=684.596, mean=684.596, max=684.596, sum=1369.193 (2)", - "tab": "General information", - "score": 684.5964912280701 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=1.1 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=1.934, mean=1.934, max=1.934, sum=3.869 (2)", - "tab": "Efficiency", - "score": 1.934385061264038 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=476.61, mean=476.61, max=476.61, sum=953.22 (2)", - "tab": "General information", - "score": 476.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=2.042, mean=2.042, max=2.042, sum=4.084 (2)", - "tab": "Efficiency", - "score": 2.041935768392351 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=496.426, mean=496.426, max=496.426, sum=992.852 (2)", - "tab": "General information", - "score": 496.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.794, - "details": { - "description": "min=0.794, mean=0.794, max=0.794, sum=1.588 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=2.326, mean=2.326, max=2.326, sum=4.652 (2)", - "tab": "Efficiency", - "score": 2.3260836739248787 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=424.965, mean=424.965, max=424.965, sum=849.929 (2)", - "tab": "General information", - "score": 424.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM 
on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.797, - "details": { - "description": "min=0.797, mean=0.797, max=0.797, sum=1.595 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=2.936, mean=2.936, max=2.936, sum=5.871 (2)", - "tab": "Efficiency", - "score": 2.9355741520138348 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=2.529, mean=2.529, max=2.529, sum=5.058 (2)", - "tab": "Efficiency", - "score": 2.528953587755244 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=3.335, mean=3.335, max=3.335, sum=6.669 (2)", - "tab": "Efficiency", - "score": 3.3346744537975206 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=2.597, mean=2.597, max=2.597, sum=5.194 (2)", - "tab": "Efficiency", - "score": 2.5970658024931264 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1188.537, mean=1188.537, max=1188.537, sum=2377.074 (2)", - "tab": "General information", - "score": 1188.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=730.422, mean=730.422, max=730.422, sum=1460.844 (2)", - "tab": "General information", - "score": 730.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1766.16, mean=1766.16, max=1766.16, sum=3532.321 (2)", - "tab": "General information", - "score": 1766.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=668.168, mean=668.168, max=668.168, sum=1336.337 (2)", - "tab": "General information", - "score": 668.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=2.374, mean=2.374, max=2.374, sum=4.747 (2)", - "tab": "Efficiency", - "score": 2.37366126537323 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=520.25, mean=520.25, max=520.25, sum=1040.5 (2)", - "tab": "General information", - "score": 520.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.711 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=2.346, mean=2.346, max=2.346, sum=4.692 (2)", - "tab": "Efficiency", - "score": 2.345861089857001 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=669.493, mean=669.493, max=669.493, sum=1338.987 (2)", - "tab": "General information", - "score": 669.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=2.35, mean=2.35, max=2.35, sum=4.701 (2)", - "tab": "Efficiency", - "score": 2.3504813623428347 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=665.02, mean=665.02, max=665.02, sum=1330.04 (2)", - "tab": "General information", - "score": 665.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.57 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=2.28, mean=2.28, max=2.28, sum=4.56 (2)", - "tab": "Efficiency", - "score": 2.279950815776609 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=494.457, 
mean=494.457, max=494.457, sum=988.913 (2)", - "tab": "General information", - "score": 494.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.766, - "details": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.532 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=2.125, mean=2.125, max=2.125, sum=4.25 (2)", - "tab": "Efficiency", - "score": 2.1249657225101553 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=388.536, mean=388.536, max=388.536, sum=777.072 (2)", - "tab": "General information", - "score": 388.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=1.448 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=2.336, mean=2.336, max=2.336, sum=4.672 (2)", - "tab": "Efficiency", - "score": 2.3361403728353567 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=502.041, mean=502.041, max=502.041, sum=1004.083 (2)", - "tab": 
"General information", - "score": 502.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.521, - "details": { - "description": "min=0.521, mean=0.521, max=0.521, sum=1.042 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=2.399, mean=2.399, max=2.399, sum=4.798 (2)", - "tab": "Efficiency", - "score": 2.398875941044439 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=576.066, mean=576.066, max=576.066, sum=1152.132 (2)", - "tab": "General information", - "score": 576.0661375661375 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=2.294, mean=2.294, max=2.294, sum=4.587 (2)", - "tab": "Efficiency", - "score": 2.293650850417122 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=711.746, mean=711.746, max=711.746, sum=1423.492 (2)", - "tab": "General information", - "score": 711.7460317460317 - }, - "Formal Logic - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=2.36, mean=2.36, max=2.36, sum=4.72 (2)", - "tab": "Efficiency", - "score": 2.360204086765166 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=2.324, mean=2.324, max=2.324, sum=4.647 (2)", - "tab": "Efficiency", - "score": 2.3235761426352517 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=2.353, mean=2.353, max=2.353, sum=4.707 (2)", - "tab": "Efficiency", - "score": 2.3532658934593202 - }, - "High School European History - Observed inference time (s)": { - "description": "min=3.916, mean=3.916, max=3.916, sum=7.832 (2)", - "tab": "Efficiency", - "score": 3.915820397752704 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=2.217, mean=2.217, max=2.217, sum=4.434 (2)", - "tab": "Efficiency", - "score": 2.217141205614263 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=2.403, mean=2.403, max=2.403, sum=4.807 (2)", - "tab": "Efficiency", - "score": 2.4034566397493986 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=2.329, mean=2.329, max=2.329, sum=4.658 (2)", - "tab": "Efficiency", - "score": 2.3290999345290353 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=2.45, mean=2.45, max=2.45, sum=4.9 (2)", - "tab": "Efficiency", - "score": 2.4497611089988993 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=2.492, mean=2.492, max=2.492, sum=4.984 (2)", - "tab": "Efficiency", - "score": 2.492123728038884 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=2.268, mean=2.268, max=2.268, sum=4.536 (2)", - "tab": "Efficiency", - "score": 2.267898343256767 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=2.45, mean=2.45, max=2.45, sum=4.901 (2)", - "tab": "Efficiency", - "score": 2.4503073394845387 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=2.554, mean=2.554, max=2.554, sum=5.107 (2)", - "tab": "Efficiency", - "score": 2.5535844012543008 - }, - "High School US History - Observed inference time (s)": { - "description": "min=3.541, mean=3.541, max=3.541, sum=7.081 (2)", - "tab": "Efficiency", - "score": 3.540712014132855 - }, - "High School World History - Observed inference time (s)": { - "description": "min=3.012, mean=3.012, max=3.012, sum=6.025 (2)", - "tab": "Efficiency", - "score": 3.0123110571994056 - }, - "High School Biology - # eval": { 
- "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=599.577, mean=599.577, max=599.577, sum=1199.155 (2)", - "tab": "General information", - "score": 599.5774193548388 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=562.921, mean=562.921, max=562.921, sum=1125.842 (2)", - "tab": "General information", - "score": 562.9211822660099 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=947.4, mean=947.4, max=947.4, sum=1894.8 (2)", - "tab": "General information", - "score": 947.4 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2952.576, mean=2952.576, max=2952.576, sum=5905.152 (2)", - "tab": "General information", - "score": 2952.5757575757575 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=477.268, mean=477.268, max=477.268, sum=954.535 (2)", - "tab": "General information", - "score": 477.2676767676768 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=556.104, mean=556.104, max=556.104, sum=1112.207 (2)", - "tab": "General information", - "score": 556.1036269430052 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=471.036, mean=471.036, max=471.036, sum=942.072 (2)", - "tab": "General information", - "score": 471.0358974358974 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=584.881, mean=584.881, max=584.881, sum=1169.763 (2)", - "tab": "General information", - "score": 584.8814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=485.513, mean=485.513, max=485.513, sum=971.025 (2)", - "tab": "General information", - "score": 485.5126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, 
mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=623.841, mean=623.841, max=623.841, sum=1247.682 (2)", - "tab": "General information", - "score": 623.841059602649 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=586.42, mean=586.42, max=586.42, sum=1172.84 (2)", - "tab": "General information", - "score": 586.4201834862386 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=871.963, mean=871.963, max=871.963, sum=1743.926 (2)", - "tab": "General information", - "score": 871.9629629629629 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2353.49, mean=2353.49, max=2353.49, sum=4706.98 (2)", - "tab": "General information", - "score": 2353.4901960784314 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": 
{ - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1540.932, mean=1540.932, max=1540.932, sum=3081.865 (2)", - "tab": "General information", - "score": 1540.9324894514768 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=2.287, mean=2.287, max=2.287, sum=4.573 (2)", - "tab": "Efficiency", - "score": 2.286549251710353 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=2.14, mean=2.14, max=2.14, sum=4.28 (2)", - "tab": "Efficiency", - "score": 2.1399855577308715 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=400.955, mean=400.955, max=400.955, sum=801.91 (2)", - "tab": "General information", - "score": 400.95515695067263 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=436.496, mean=436.496, max=436.496, sum=872.992 (2)", - "tab": "General information", - "score": 436.4961832061069 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=2.339, mean=2.339, max=2.339, sum=4.679 (2)", - "tab": "Efficiency", - "score": 2.3394163206589123 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=729.165, mean=729.165, max=729.165, sum=1458.331 (2)", - "tab": "General information", - "score": 729.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.669 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=2.313, mean=2.313, max=2.313, sum=4.627 (2)", - "tab": "Efficiency", - "score": 2.3134736488201866 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=535.276, mean=535.276, max=535.276, sum=1070.552 (2)", - "tab": "General information", - "score": 535.2760736196319 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482, - "details": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.964 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=2.246, mean=2.246, max=2.246, sum=4.492 (2)", - "tab": "Efficiency", - "score": 2.246019565633365 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=741.518, mean=741.518, max=741.518, sum=1483.036 (2)", - "tab": "General information", - "score": 741.5178571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.65 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=2.02, mean=2.02, max=2.02, sum=4.041 (2)", - "tab": "Efficiency", - "score": 2.0203486507378736 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=366.282, mean=366.282, max=366.282, sum=732.563 (2)", - "tab": "General information", - "score": 366.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923, - "details": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.846 (2)", - "tab": 
"Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=2.371, mean=2.371, max=2.371, sum=4.741 (2)", - "tab": "Efficiency", - "score": 2.370740459515498 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.0641025641025 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=2.213, mean=2.213, max=2.213, sum=4.426 (2)", - "tab": "Efficiency", - "score": 2.213027362823486 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=419.88, mean=419.88, max=419.88, sum=839.76 (2)", - "tab": "General information", - "score": 419.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=2.421, mean=2.421, max=2.421, sum=4.843 (2)", - "tab": "Efficiency", - "score": 2.421274871813992 - }, - "Miscellaneous - # 
eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=393.628, mean=393.628, max=393.628, sum=787.257 (2)", - "tab": "General information", - "score": 393.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.039 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=2.478, mean=2.478, max=2.478, sum=4.955 (2)", - "tab": "Efficiency", - "score": 2.4775779054344045 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=2.624, mean=2.624, max=2.624, sum=5.248 (2)", - "tab": "Efficiency", - "score": 2.624200687994504 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=576.789, mean=576.789, max=576.789, sum=1153.578 (2)", - "tab": "General information", - "score": 576.7890173410404 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=741.949, mean=741.949, max=741.949, sum=1483.897 (2)", - "tab": "General information", - "score": 741.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - 
"evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.781, - "details": { - "description": "min=0.781, mean=0.781, max=0.781, sum=1.562 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=2.516, mean=2.516, max=2.516, sum=5.033 (2)", - "tab": "Efficiency", - "score": 2.516486873813704 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=682.065, mean=682.065, max=682.065, sum=1364.131 (2)", - "tab": "General information", - "score": 682.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=0.997, mean=0.997, max=0.997, sum=1.993 (2)", - "tab": "General information", - "score": 0.9967320261437909 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.821, - "details": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.642 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=2.431, mean=2.431, max=2.431, sum=4.862 (2)", - "tab": "Efficiency", - "score": 2.4310101116145097 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=610.639, mean=610.639, max=610.639, sum=1221.278 (2)", - "tab": "General information", - "score": 610.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=1.545 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=2.068, mean=2.068, max=2.068, sum=4.136 (2)", - "tab": "Efficiency", - "score": 2.067864069071683 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=497.991, mean=497.991, max=497.991, sum=995.982 (2)", - "tab": "General information", - "score": 497.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.624 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=2.854, mean=2.854, max=2.854, sum=5.708 (2)", - "tab": "Efficiency", - "score": 2.8541687430167686 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1308.804, mean=1308.804, max=1308.804, sum=2617.608 (2)", - "tab": "General information", - "score": 1308.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.771 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=2.362, mean=2.362, max=2.362, sum=4.725 (2)", - "tab": "Efficiency", - "score": 2.362461663004178 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=532.274, mean=532.274, max=532.274, sum=1064.547 (2)", - "tab": "General information", - "score": 532.273631840796 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.108 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=2.231, mean=2.231, max=2.231, sum=4.462 (2)", - "tab": "Efficiency", - "score": 2.2311078037124084 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=427.651, mean=427.651, max=427.651, sum=855.301 (2)", - "tab": "General information", - "score": 427.65060240963857 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, 
max=0.854, sum=1.708 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=2.237, mean=2.237, max=2.237, sum=4.474 (2)", - "tab": "Efficiency", - "score": 2.2371394411165113 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=358.018, mean=358.018, max=358.018, sum=716.035 (2)", - "tab": "General information", - "score": 358.0175438596491 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.048, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json b/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json deleted file mode 100644 index 76628bf51..000000000 --- a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-haiku-20241022/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Haiku 20241022", - "id": "anthropic/claude-3-5-haiku-20241022", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.359, mean=0.743, max=0.94, sum=84.719 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.909, mean=1.108, max=1.572, sum=126.32 (114)", - "tab": "Efficiency", - "score": 1.1080717974066416 - }, - "MMLU All Subjects - # eval": { - 
"description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=293.018, mean=638.288, max=2887.576, sum=72764.875 (114)", - "tab": "General information", - "score": 638.2883793758953 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - 
"mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47, - "details": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.819 (2)", - "tab": "Efficiency", - "score": 0.9094081521034241 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=370.26, mean=370.26, max=370.26, sum=740.52 (2)", - "tab": "General information", - "score": 370.26 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.585 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=1.124, mean=1.124, max=1.124, sum=2.247 (2)", - "tab": "Efficiency", - "score": 1.1236292309231228 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=370.8, mean=370.8, max=370.8, sum=741.6 (2)", - "tab": "General information", - "score": 370.8 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.039 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=1.196, mean=1.196, max=1.196, sum=2.392 (2)", - "tab": "Efficiency", - "score": 1.1962119388580321 - }, - "College Biology - Observed inference time (s)": { - "description": "min=1.247, mean=1.247, max=1.247, sum=2.494 (2)", - "tab": "Efficiency", - "score": 1.2467927502261267 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.572, mean=1.572, max=1.572, sum=3.144 (2)", - "tab": "Efficiency", - "score": 1.5719245457649231 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=1.13, mean=1.13, max=1.13, sum=2.26 (2)", - "tab": "Efficiency", - "score": 1.1302329087257386 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=1.259, mean=1.259, max=1.259, sum=2.517 (2)", - "tab": "Efficiency", - "score": 1.2587321479885565 - }, - "College Physics - Observed inference time (s)": { - "description": "min=1.261, mean=1.261, max=1.261, sum=2.521 (2)", - "tab": "Efficiency", - "score": 1.2606473857281255 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=550.01, mean=550.01, max=550.01, sum=1100.02 (2)", - "tab": "General information", - "score": 550.01 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=490.347, mean=490.347, max=490.347, sum=980.694 (2)", - "tab": "General information", - "score": 490.34722222222223 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.24, mean=838.24, max=838.24, sum=1676.48 (2)", - "tab": "General information", - "score": 838.24 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=604.19, mean=604.19, max=604.19, sum=1208.38 (2)", - "tab": "General information", - "score": 604.19 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=540.63, mean=540.63, max=540.63, sum=1081.26 (2)", - "tab": "General information", - "score": 540.6300578034682 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=489.48, mean=489.48, max=489.48, sum=978.961 (2)", - "tab": "General information", - "score": 489.48039215686276 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": 
"min=1.013, mean=1.013, max=1.013, sum=2.027 (2)", - "tab": "Efficiency", - "score": 1.0133756017684936 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=398.62, mean=398.62, max=398.62, sum=797.24 (2)", - "tab": "General information", - "score": 398.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.922, mean=0.922, max=0.922, sum=1.845 (2)", - "tab": "Efficiency", - "score": 0.9224813549142135 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=619.596, mean=619.596, max=619.596, sum=1239.193 (2)", - "tab": "General information", - "score": 619.5964912280701 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=1.101, mean=1.101, max=1.101, sum=2.201 (2)", - "tab": "Efficiency", - "score": 1.1007365608215331 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", 
- "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=411.61, mean=411.61, max=411.61, sum=823.22 (2)", - "tab": "General information", - "score": 411.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=1.104, mean=1.104, max=1.104, sum=2.209 (2)", - "tab": "Efficiency", - "score": 1.1042848251484059 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=431.426, mean=431.426, max=431.426, sum=862.852 (2)", - "tab": "General information", - "score": 431.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823, - "details": { - "description": "min=0.823, mean=0.823, max=0.823, sum=1.646 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=1.117, mean=1.117, max=1.117, sum=2.233 (2)", - "tab": "Efficiency", - "score": 1.1165370488856767 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=359.965, mean=359.965, max=359.965, sum=719.929 (2)", - "tab": "General information", - "score": 359.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.65 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.412, mean=1.412, max=1.412, sum=2.824 (2)", - "tab": "Efficiency", - "score": 1.4119182877680834 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.984, mean=0.984, max=0.984, sum=1.967 (2)", - "tab": "Efficiency", - "score": 0.9836687187776498 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.016, mean=1.016, max=1.016, sum=2.032 (2)", - "tab": "Efficiency", - "score": 1.0160297585901412 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.979, mean=0.979, max=0.979, sum=1.958 (2)", - "tab": "Efficiency", - "score": 0.9789344672284095 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1123.537, mean=1123.537, max=1123.537, sum=2247.074 (2)", - "tab": "General information", - "score": 1123.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=665.422, mean=665.422, max=665.422, sum=1330.844 (2)", - "tab": "General information", - "score": 665.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - 
"description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1701.16, mean=1701.16, max=1701.16, sum=3402.321 (2)", - "tab": "General information", - "score": 1701.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=603.168, mean=603.168, max=603.168, sum=1206.337 (2)", - "tab": "General information", - "score": 603.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.97, mean=0.97, max=0.97, sum=1.941 (2)", - "tab": "Efficiency", - "score": 0.9703591632843017 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=455.25, mean=455.25, max=455.25, sum=910.5 (2)", - "tab": "General information", - "score": 455.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.658 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=1.18, mean=1.18, max=1.18, sum=2.36 (2)", - "tab": "Efficiency", - "score": 1.1798271034893237 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=604.493, mean=604.493, max=604.493, sum=1208.987 (2)", - "tab": "General information", - "score": 604.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.147, mean=1.147, max=1.147, sum=2.295 (2)", - "tab": "Efficiency", - "score": 1.1473834657669066 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=600.02, mean=600.02, max=600.02, sum=1200.04 (2)", - "tab": "General information", - "score": 600.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823, - "details": { - "description": "min=0.823, mean=0.823, max=0.823, sum=1.645 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=1.099, mean=1.099, max=1.099, sum=2.198 (2)", - "tab": "Efficiency", - "score": 1.0991604094235403 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=429.457, mean=429.457, max=429.457, sum=858.913 (2)", - "tab": "General information", - "score": 429.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.723, mean=0.723, max=0.723, sum=1.447 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=1.537, mean=1.537, max=1.537, sum=3.074 (2)", - "tab": "Efficiency", - "score": 1.536949543242759 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=323.536, mean=323.536, max=323.536, sum=647.072 (2)", - "tab": "General information", - "score": 323.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.434 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=1.249, mean=1.249, max=1.249, sum=2.497 (2)", - "tab": "Efficiency", - "score": 1.2485630594450852 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=437.041, mean=437.041, max=437.041, sum=874.083 (2)", - "tab": "General information", - "score": 437.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.122 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=1.558, mean=1.558, max=1.558, sum=3.116 (2)", - "tab": "Efficiency", - "score": 1.5580224965615248 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=511.066, mean=511.066, max=511.066, sum=1022.132 (2)", - "tab": "General information", - "score": 511.06613756613757 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.619, - "details": { - "description": "min=0.619, mean=0.619, max=0.619, sum=1.238 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.526, mean=1.526, max=1.526, sum=3.052 (2)", - "tab": "Efficiency", - "score": 1.5258309424869598 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=646.746, mean=646.746, max=646.746, sum=1293.492 (2)", - "tab": "General information", - "score": 646.7460317460317 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.764 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=1.15, mean=1.15, max=1.15, sum=2.299 (2)", - "tab": "Efficiency", - "score": 1.1497065974820044 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=1.227, mean=1.227, max=1.227, sum=2.454 (2)", - "tab": "Efficiency", - "score": 1.2272211636228514 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=1.014, mean=1.014, max=1.014, sum=2.027 (2)", - "tab": "Efficiency", - "score": 1.0136730527877809 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.024, mean=1.024, max=1.024, sum=2.047 (2)", - "tab": "Efficiency", - "score": 1.0236461119218305 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=1.059, mean=1.059, max=1.059, sum=2.119 (2)", - "tab": "Efficiency", - "score": 1.0594979368074975 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=1.138, mean=1.138, max=1.138, sum=2.275 (2)", - "tab": "Efficiency", - "score": 1.1376265478875354 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.107, mean=1.107, max=1.107, sum=2.214 (2)", - "tab": "Efficiency", - "score": 1.1069551357856164 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=1.094, mean=1.094, max=1.094, sum=2.188 (2)", - "tab": "Efficiency", - "score": 1.0940863344404432 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=1.034, mean=1.034, max=1.034, sum=2.068 (2)", - "tab": "Efficiency", - "score": 1.03420967815303 - }, - "High School 
Physics - Observed inference time (s)": { - "description": "min=1.059, mean=1.059, max=1.059, sum=2.119 (2)", - "tab": "Efficiency", - "score": 1.0594944227610203 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=1.074, mean=1.074, max=1.074, sum=2.149 (2)", - "tab": "Efficiency", - "score": 1.07433808177983 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=1.053, mean=1.053, max=1.053, sum=2.107 (2)", - "tab": "Efficiency", - "score": 1.0534564554691315 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.101, mean=1.101, max=1.101, sum=2.201 (2)", - "tab": "Efficiency", - "score": 1.1006785748051662 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.093, mean=1.093, max=1.093, sum=2.186 (2)", - "tab": "Efficiency", - "score": 1.0931011674776359 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=534.577, mean=534.577, max=534.577, sum=1069.155 (2)", - "tab": "General information", - "score": 534.5774193548388 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=497.921, mean=497.921, max=497.921, sum=995.842 (2)", - "tab": "General information", - "score": 497.92118226600985 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=882.4, mean=882.4, max=882.4, sum=1764.8 (2)", - "tab": "General information", - "score": 882.4 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", 
- "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2887.576, mean=2887.576, max=2887.576, sum=5775.152 (2)", - "tab": "General information", - "score": 2887.5757575757575 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=412.268, mean=412.268, max=412.268, sum=824.535 (2)", - "tab": "General information", - "score": 412.2676767676768 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=491.104, mean=491.104, max=491.104, sum=982.207 (2)", - "tab": "General information", - "score": 491.10362694300517 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=406.036, mean=406.036, max=406.036, sum=812.072 (2)", - "tab": "General information", - "score": 406.0358974358974 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=519.881, mean=519.881, max=519.881, sum=1039.763 (2)", - "tab": 
"General information", - "score": 519.8814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=420.513, mean=420.513, max=420.513, sum=841.025 (2)", - "tab": "General information", - "score": 420.5126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=558.841, mean=558.841, max=558.841, sum=1117.682 (2)", - "tab": "General information", - "score": 558.841059602649 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=521.42, mean=521.42, max=521.42, sum=1042.84 (2)", - "tab": "General information", - "score": 521.4201834862386 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=806.963, mean=806.963, max=806.963, sum=1613.926 (2)", - "tab": "General information", - "score": 806.9629629629629 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2288.49, mean=2288.49, max=2288.49, sum=4576.98 (2)", - "tab": "General information", - "score": 2288.4901960784314 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1475.932, mean=1475.932, max=1475.932, sum=2951.865 (2)", - "tab": "General information", - "score": 1475.9324894514768 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.771 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=1.084, mean=1.084, max=1.084, sum=2.169 (2)", - "tab": "Efficiency", - "score": 1.0844623775225584 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=1.056, mean=1.056, max=1.056, sum=2.112 (2)", - "tab": "Efficiency", - "score": 1.0560545211529915 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=335.955, mean=335.955, max=335.955, sum=671.91 (2)", - "tab": "General information", - "score": 335.95515695067263 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 
5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=371.496, mean=371.496, max=371.496, sum=742.992 (2)", - "tab": "General information", - "score": 371.4961832061069 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.769 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=1.112, mean=1.112, max=1.112, sum=2.225 (2)", - "tab": "Efficiency", - "score": 1.1124236544301687 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=664.165, mean=664.165, max=664.165, sum=1328.331 (2)", - "tab": "General information", - "score": 664.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=1.015, mean=1.015, max=1.015, sum=2.03 (2)", - "tab": "Efficiency", - "score": 1.0148307984591993 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=470.276, mean=470.276, max=470.276, sum=940.552 (2)", - "tab": "General information", - "score": 470.2760736196319 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518, - "details": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.036 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.067, mean=1.067, max=1.067, sum=2.135 (2)", - "tab": "Efficiency", - "score": 1.0673569909163885 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=676.518, mean=676.518, max=676.518, sum=1353.036 (2)", - "tab": "General information", - "score": 676.5178571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.689 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=1.038, mean=1.038, max=1.038, sum=2.076 (2)", - "tab": "Efficiency", - "score": 1.0377622229381673 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=301.282, mean=301.282, max=301.282, sum=602.563 (2)", - "tab": 
"General information", - "score": 301.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.897, - "details": { - "description": "min=0.897, mean=0.897, max=0.897, sum=1.795 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.986 (2)", - "tab": "Efficiency", - "score": 0.9929133276654105 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=448.064, mean=448.064, max=448.064, sum=896.128 (2)", - "tab": "General information", - "score": 448.06410256410254 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=1.041, mean=1.041, max=1.041, sum=2.082 (2)", - "tab": "Efficiency", - "score": 1.041243133544922 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=354.88, mean=354.88, max=354.88, sum=709.76 (2)", - "tab": "General information", - "score": 354.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": 
{ - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.811 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=1.043, mean=1.043, max=1.043, sum=2.086 (2)", - "tab": "Efficiency", - "score": 1.0429492231225297 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=328.628, mean=328.628, max=328.628, sum=657.257 (2)", - "tab": "General information", - "score": 328.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.476, - "details": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=1.044, mean=1.044, max=1.044, sum=2.088 (2)", - "tab": "Efficiency", - "score": 1.0438106094481627 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.919 (2)", - "tab": "Efficiency", - "score": 0.95963474492121 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=511.789, mean=511.789, max=511.789, sum=1023.578 (2)", - "tab": "General information", - "score": 511.78901734104045 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # 
eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=676.949, mean=676.949, max=676.949, sum=1353.897 (2)", - "tab": "General information", - "score": 676.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.846, - "details": { - "description": "min=0.846, mean=0.846, max=0.846, sum=1.693 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.981, mean=0.981, max=0.981, sum=1.962 (2)", - "tab": "Efficiency", - "score": 0.9811088399949417 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=617.065, mean=617.065, max=617.065, sum=1234.131 (2)", - "tab": "General information", - "score": 617.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.753 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=1.003, mean=1.003, max=1.003, sum=2.006 (2)", - "tab": "Efficiency", - "score": 1.0031694571177165 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=545.639, mean=545.639, max=545.639, sum=1091.278 (2)", - "tab": "General information", - "score": 545.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.455 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Efficiency", - "score": 0.9410657709295099 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=432.991, mean=432.991, max=432.991, sum=865.982 (2)", - "tab": "General information", - "score": 432.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.792, - "details": { - "description": "min=0.792, mean=0.792, max=0.792, sum=1.584 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=1.016, mean=1.016, max=1.016, sum=2.033 (2)", - "tab": "Efficiency", - "score": 1.0164005843960509 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1243.804, mean=1243.804, max=1243.804, sum=2487.608 (2)", - "tab": "General information", - "score": 1243.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.811 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.976, mean=0.976, max=0.976, sum=1.952 (2)", - "tab": "Efficiency", - "score": 0.9757713939420026 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=467.274, mean=467.274, max=467.274, sum=934.547 (2)", - "tab": "General information", - "score": 467.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.929, mean=0.929, max=0.929, sum=1.858 (2)", - "tab": "Efficiency", - "score": 0.9289331062730536 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=362.651, mean=362.651, max=362.651, sum=725.301 (2)", - "tab": "General information", - "score": 362.65060240963857 - }, - "Virology - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=1.021, mean=1.021, max=1.021, sum=2.042 (2)", - "tab": "Efficiency", - "score": 1.0208685663011339 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=293.018, mean=293.018, max=293.018, sum=586.035 (2)", - "tab": "General information", - "score": 293.0175438596491 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.128, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json deleted file mode 100644 index 9d9557efc..000000000 --- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20240620/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Sonnet 20240620", - "id": "anthropic/claude-3-5-sonnet-20240620", - "developer": "anthropic", - "inference_platform": "unknown" - }, - 
"evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.58, mean=0.865, max=0.98, sum=98.656 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.765, mean=1.1, max=3.433, sum=125.349 (114)", - "tab": "Efficiency", - "score": 1.099552619745469 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=302.018, mean=647.288, max=2896.576, sum=73790.875 (114)", - "tab": "General information", - "score": 647.2883793758954 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - 
"mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)", - "tab": "Efficiency", - "score": 0.7789034700393677 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=379.26, mean=379.26, max=379.26, sum=758.52 (2)", - "tab": "General information", - "score": 379.26 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.844, - "details": { - "description": "min=0.844, mean=0.844, max=0.844, sum=1.689 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.777, mean=0.777, max=0.777, sum=1.553 (2)", - "tab": "Efficiency", - "score": 
- "score": 0.7767299599117703
- },
- "Anatomy - # eval": {
- "description": "min=135, mean=135, max=135, sum=270 (2)",
- "tab": "General information",
- "score": 135.0
- },
- "Anatomy - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Anatomy - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Anatomy - # prompt tokens": {
- "description": "min=379.8, mean=379.8, max=379.8, sum=759.6 (2)",
- "tab": "General information",
- "score": 379.8
- },
- "Anatomy - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "anatomy",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_anatomy"
- }
- }
- },
- {
- "evaluation_name": "College Physics",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on College Physics",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.696,
- "details": {
- "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)",
- "tab": "Accuracy",
- "College Chemistry - Observed inference time (s)": {
- "description": "min=0.797, mean=0.797, max=0.797, sum=1.594 (2)",
- "tab": "Efficiency",
- "score": 0.7968128871917725
- },
- "College Biology - Observed inference time (s)": {
- "description": "min=1.09, mean=1.09, max=1.09, sum=2.18 (2)",
- "tab": "Efficiency",
- "score": 1.0898179478115506
- },
- "College Computer Science - Observed inference time (s)": {
- "description": "min=1.27, mean=1.27, max=1.27, sum=2.539 (2)",
- "tab": "Efficiency",
- "score": 1.2695734238624572
- },
- "College Mathematics - Observed inference time (s)": {
- "description": "min=1.72, mean=1.72, max=1.72, sum=3.439 (2)",
- "tab": "Efficiency",
- "score": 1.7196030735969543
- },
- "College Medicine - Observed inference time (s)": {
- "description": "min=1.28, mean=1.28, max=1.28, sum=2.559 (2)",
- "tab": "Efficiency",
- "score": 1.2795469209637944
- },
- "College Physics - Observed inference time (s)": {
- "description": "min=0.796, mean=0.796, max=0.796, sum=1.591 (2)",
- "tab": "Efficiency",
- "score": 0.7955308311125812
- },
- "College Chemistry - # eval": {
- "description": "min=100, mean=100, max=100, sum=200 (2)",
- "tab": "General information",
- "score": 100.0
- },
- "College Chemistry - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "College Chemistry - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "College Chemistry - # prompt tokens": {
- "description": "min=559.01, mean=559.01, max=559.01, sum=1118.02 (2)",
- "tab": "General information",
- "score": 559.01
- },
- "College Chemistry - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "College Biology - # eval": {
- "description": "min=144, mean=144, max=144, sum=288 (2)",
- "tab": "General information",
- "score": 144.0
- },
- "College Biology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "College Biology - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "College Biology - # prompt tokens": {
- "description": "min=499.347, mean=499.347, max=499.347, sum=998.694 (2)",
- "tab": "General information",
- "score": 499.34722222222223
- },
- "College Biology - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "College Computer Science - # eval": {
- "description": "min=100, mean=100, max=100, sum=200 (2)",
- "tab": "General information",
- "score": 100.0
- },
- "College Computer Science - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "College Computer Science - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "College Computer Science - # prompt tokens": {
- "description": "min=847.24, mean=847.24, max=847.24, sum=1694.48 (2)",
- "tab": "General information",
- "score": 847.24
- },
- "College Computer Science - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "College Mathematics - # eval": {
- "description": "min=100, mean=100, max=100, sum=200 (2)",
- "tab": "General information",
- "score": 100.0
- },
- "College Mathematics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "College Mathematics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "College Mathematics - # prompt tokens": {
- "description": "min=613.19, mean=613.19, max=613.19, sum=1226.38 (2)",
- "tab": "General information",
- "score": 613.19
- },
- "College Mathematics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "College Medicine - # eval": {
- "description": "min=173, mean=173, max=173, sum=346 (2)",
- "tab": "General information",
- "score": 173.0
- },
- "College Medicine - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "College Medicine - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "College Medicine - # prompt tokens": {
- "description": "min=549.63, mean=549.63, max=549.63, sum=1099.26 (2)",
- "tab": "General information",
- "score": 549.6300578034682
- },
- "College Medicine - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "College Physics - # eval": {
- "description": "min=102, mean=102, max=102, sum=204 (2)",
- "tab": "General information",
- "score": 102.0
- },
- "College Physics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "College Physics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "College Physics - # prompt tokens": {
- "description": "min=498.48, mean=498.48, max=498.48, sum=996.961 (2)",
- "tab": "General information",
- "score": 498.48039215686276
- },
- "College Physics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
"generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.765, mean=0.765, max=0.765, sum=1.531 (2)", - "tab": "Efficiency", - "score": 0.7653794264793397 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=407.62, mean=407.62, max=407.62, sum=815.24 (2)", - "tab": "General information", - "score": 407.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.615 (2)", - "tab": "Efficiency", - "score": 0.8075556734152007 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=628.596, mean=628.596, max=628.596, sum=1257.193 (2)", - "tab": "General information", - "score": 628.5964912280701 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.44 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.571 (2)", - "tab": "Efficiency", - "score": 0.785265531539917 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=420.61, mean=420.61, max=420.61, sum=841.22 (2)", - "tab": "General information", - "score": 420.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.781, mean=0.781, max=0.781, sum=1.563 (2)", - "tab": "Efficiency", - "score": 0.7813034631587841 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=440.426, mean=440.426, max=440.426, sum=880.852 (2)", - "tab": "General information", - "score": 440.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.891, - "details": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=2.168, mean=2.168, max=2.168, sum=4.336 (2)", - "tab": "Efficiency", - "score": 2.1680153757812892 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=368.965, mean=368.965, max=368.965, sum=737.929 (2)", - "tab": "General information", - "score": 368.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=1.843 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=2.144, mean=2.144, max=2.144, sum=4.287 (2)", - "tab": "Efficiency", - "score": 2.1436235790743545 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=2.085, mean=2.085, max=2.085, sum=4.169 (2)", - "tab": "Efficiency", - "score": 2.084580805284757 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.308, mean=1.308, max=1.308, sum=2.616 (2)", - "tab": "Efficiency", - "score": 1.3078198053690726 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=1.15, mean=1.15, max=1.15, sum=2.301 (2)", - "tab": "Efficiency", - "score": 1.1502779430034114 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1132.537, mean=1132.537, max=1132.537, sum=2265.074 (2)", - "tab": "General information", - "score": 1132.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=674.422, mean=674.422, max=674.422, sum=1348.844 (2)", - "tab": "General information", - "score": 674.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1710.16, mean=1710.16, max=1710.16, sum=3420.321 (2)", - "tab": "General information", - "score": 1710.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=612.168, mean=612.168, max=612.168, sum=1224.337 (2)", - "tab": "General information", - "score": 612.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.973, mean=0.973, max=0.973, sum=1.946 (2)", - "tab": "Efficiency", - "score": 0.9727654385566712 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - 
"Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=464.25, mean=464.25, max=464.25, sum=928.5 (2)", - "tab": "General information", - "score": 464.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.961, - "details": { - "description": "min=0.961, mean=0.961, max=0.961, sum=1.921 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=1.35, mean=1.35, max=1.35, sum=2.7 (2)", - "tab": "Efficiency", - "score": 1.3501500989261426 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=613.493, mean=613.493, max=613.493, sum=1226.987 (2)", - "tab": "General information", - "score": 613.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.326, mean=1.326, max=1.326, sum=2.652 (2)", - "tab": "Efficiency", - "score": 1.325816671848297 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=609.02, mean=609.02, max=609.02, sum=1218.04 (2)", - "tab": "General information", - "score": 609.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=1.379, mean=1.379, max=1.379, sum=2.757 (2)", - "tab": "Efficiency", - "score": 1.3787489792086043 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=438.457, mean=438.457, max=438.457, sum=876.913 (2)", - "tab": "General information", - "score": 438.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.77 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Efficiency", - "score": 0.7780434922969087 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - 
"description": "min=332.536, mean=332.536, max=332.536, sum=665.072 (2)", - "tab": "General information", - "score": 332.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.655 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Efficiency", - "score": 0.789771790340029 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=446.041, mean=446.041, max=446.041, sum=892.083 (2)", - "tab": "General information", - "score": 446.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.892, - "details": { - "description": "min=0.892, mean=0.892, max=0.892, sum=1.783 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.612 (2)", - "tab": "Efficiency", - "score": 0.8060284802522609 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": 
"min=520.066, mean=520.066, max=520.066, sum=1040.132 (2)", - "tab": "General information", - "score": 520.0661375661375 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.698, mean=0.698, max=0.698, sum=1.397 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.811, mean=0.811, max=0.811, sum=1.623 (2)", - "tab": "Efficiency", - "score": 0.8114165843479217 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=655.746, mean=655.746, max=655.746, sum=1311.492 (2)", - "tab": "General information", - "score": 655.7460317460317 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.954, - "details": { - "description": "min=0.954, mean=0.954, max=0.954, sum=1.907 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.605 (2)", - "tab": "Efficiency", - "score": 0.8022696918056857 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.612 (2)", - "tab": "Efficiency", - "score": 0.8062427619407917 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Efficiency", - "score": 0.8532347416877747 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.183, mean=1.183, max=1.183, sum=2.366 (2)", - "tab": "Efficiency", - "score": 1.1831647526134144 - }, - "High School Geography - Observed 
- "High School Geography - Observed inference time (s)": {
- "description": "min=0.776, mean=0.776, max=0.776, sum=1.553 (2)",
- "tab": "Efficiency",
- "score": 0.7764992966796412
- },
- "High School Government And Politics - Observed inference time (s)": {
- "description": "min=0.802, mean=0.802, max=0.802, sum=1.603 (2)",
- "tab": "Efficiency",
- "score": 0.8015919287587695
- },
- "High School Macroeconomics - Observed inference time (s)": {
- "description": "min=0.782, mean=0.782, max=0.782, sum=1.563 (2)",
- "tab": "Efficiency",
- "score": 0.781673603791457
- },
- "High School Mathematics - Observed inference time (s)": {
- "description": "min=0.805, mean=0.805, max=0.805, sum=1.61 (2)",
- "tab": "Efficiency",
- "score": 0.80511144178885
- },
- "High School Microeconomics - Observed inference time (s)": {
- "description": "min=0.788, mean=0.788, max=0.788, sum=1.576 (2)",
- "tab": "Efficiency",
- "score": 0.7879440243504628
- },
- "High School Physics - Observed inference time (s)": {
- "description": "min=0.829, mean=0.829, max=0.829, sum=1.658 (2)",
- "tab": "Efficiency",
- "score": 0.8290448062467259
- },
- "High School Psychology - Observed inference time (s)": {
- "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)",
- "tab": "Efficiency",
- "score": 0.8071829231507187
- },
- "High School Statistics - Observed inference time (s)": {
- "description": "min=0.812, mean=0.812, max=0.812, sum=1.624 (2)",
- "tab": "Efficiency",
- "score": 0.8119496272669898
- },
- "High School US History - Observed inference time (s)": {
- "description": "min=0.938, mean=0.938, max=0.938, sum=1.877 (2)",
- "tab": "Efficiency",
- "score": 0.9383000193857679
- },
- "High School World History - Observed inference time (s)": {
- "description": "min=1.097, mean=1.097, max=1.097, sum=2.194 (2)",
- "tab": "Efficiency",
- "score": 1.0968722401791986
- },
- "High School Biology - # eval": {
- "description": "min=310, mean=310, max=310, sum=620 (2)",
- "tab": "General information",
- "score": 310.0
- },
- "High School Biology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Biology - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Biology - # prompt tokens": {
- "description": "min=543.577, mean=543.577, max=543.577, sum=1087.155 (2)",
- "tab": "General information",
- "score": 543.5774193548388
- },
- "High School Biology - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Chemistry - # eval": {
- "description": "min=203, mean=203, max=203, sum=406 (2)",
- "tab": "General information",
- "score": 203.0
- },
- "High School Chemistry - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Chemistry - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Chemistry - # prompt tokens": {
- "description": "min=506.921, mean=506.921, max=506.921, sum=1013.842 (2)",
- "tab": "General information",
- "score": 506.92118226600985
- },
- "High School Chemistry - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Computer Science - # eval": {
- "description": "min=100, mean=100, max=100, sum=200 (2)",
- "tab": "General information",
- "score": 100.0
- },
- "High School Computer Science - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Computer Science - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Computer Science - # prompt tokens": {
- "description": "min=891.4, mean=891.4, max=891.4, sum=1782.8 (2)",
- "tab": "General information",
- "score": 891.4
- },
- "High School Computer Science - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School European History - # eval": {
- "description": "min=165, mean=165, max=165, sum=330 (2)",
- "tab": "General information",
- "score": 165.0
- },
- "High School European History - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School European History - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School European History - # prompt tokens": {
- "description": "min=2896.576, mean=2896.576, max=2896.576, sum=5793.152 (2)",
- "tab": "General information",
- "score": 2896.5757575757575
- },
- "High School European History - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Geography - # eval": {
- "description": "min=198, mean=198, max=198, sum=396 (2)",
- "tab": "General information",
- "score": 198.0
- },
- "High School Geography - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Geography - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Geography - # prompt tokens": {
- "description": "min=421.268, mean=421.268, max=421.268, sum=842.535 (2)",
- "tab": "General information",
- "score": 421.2676767676768
- },
- "High School Geography - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Government And Politics - # eval": {
- "description": "min=193, mean=193, max=193, sum=386 (2)",
- "tab": "General information",
- "score": 193.0
- },
- "High School Government And Politics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Government And Politics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Government And Politics - # prompt tokens": {
- "description": "min=500.104, mean=500.104, max=500.104, sum=1000.207 (2)",
- "tab": "General information",
- "score": 500.10362694300517
- },
- "High School Government And Politics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Macroeconomics - # eval": {
- "description": "min=390, mean=390, max=390, sum=780 (2)",
- "tab": "General information",
- "score": 390.0
- },
- "High School Macroeconomics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Macroeconomics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Macroeconomics - # prompt tokens": {
- "description": "min=415.036, mean=415.036, max=415.036, sum=830.072 (2)",
- "tab": "General information",
- "score": 415.0358974358974
- },
- "High School Macroeconomics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Mathematics - # eval": {
- "description": "min=270, mean=270, max=270, sum=540 (2)",
- "tab": "General information",
- "score": 270.0
- },
- "High School Mathematics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Mathematics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Mathematics - # prompt tokens": {
- "description": "min=528.881, mean=528.881, max=528.881, sum=1057.763 (2)",
- "tab": "General information",
- "score": 528.8814814814815
- },
- "High School Mathematics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Microeconomics - # eval": {
- "description": "min=238, mean=238, max=238, sum=476 (2)",
- "tab": "General information",
- "score": 238.0
- },
- "High School Microeconomics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Microeconomics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Microeconomics - # prompt tokens": {
- "description": "min=429.513, mean=429.513, max=429.513, sum=859.025 (2)",
- "tab": "General information",
- "score": 429.5126050420168
- },
- "High School Microeconomics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Physics - # eval": {
- "description": "min=151, mean=151, max=151, sum=302 (2)",
- "tab": "General information",
- "score": 151.0
- },
- "High School Physics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Physics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Physics - # prompt tokens": {
- "description": "min=567.841, mean=567.841, max=567.841, sum=1135.682 (2)",
- "tab": "General information",
- "score": 567.841059602649
- },
- "High School Physics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Psychology - # eval": {
- "description": "min=545, mean=545, max=545, sum=1090 (2)",
- "tab": "General information",
- "score": 545.0
- },
- "High School Psychology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Psychology - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Psychology - # prompt tokens": {
- "description": "min=530.42, mean=530.42, max=530.42, sum=1060.84 (2)",
- "tab": "General information",
- "score": 530.4201834862386
- },
- "High School Psychology - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Statistics - # eval": {
- "description": "min=216, mean=216, max=216, sum=432 (2)",
- "tab": "General information",
- "score": 216.0
- },
- "High School Statistics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Statistics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Statistics - # prompt tokens": {
- "description": "min=815.963, mean=815.963, max=815.963, sum=1631.926 (2)",
- "tab": "General information",
- "score": 815.9629629629629
- },
- "High School Statistics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School US History - # eval": {
- "description": "min=204, mean=204, max=204, sum=408 (2)",
- "tab": "General information",
- "score": 204.0
- },
- "High School US History - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School US History - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School US History - # prompt tokens": {
- "description": "min=2297.49, mean=2297.49, max=2297.49, sum=4594.98 (2)",
- "tab": "General information",
- "score": 2297.4901960784314
- },
- "High School US History - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School World History - # eval": {
- "description": "min=237, mean=237, max=237, sum=474 (2)",
- "tab": "General information",
- "score": 237.0
- },
- "High School World History - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School World History - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School World History - # prompt tokens": {
- "description": "min=1484.932, mean=1484.932, max=1484.932, sum=2969.865 (2)",
- "tab": "General information",
- "score": 1484.9324894514768
- },
- "High School World History - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "high_school_world_history",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_high_school_world_history"
- }
- }
- },
- {
- "evaluation_name": "Human Sexuality",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Human Sexuality",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.939,
- "details": {
- "description": "min=0.939, mean=0.939, max=0.939, sum=1.878 (2)",
- "tab": "Accuracy",
- "Human Aging - Observed inference time (s)": {
- "description": "min=0.785, mean=0.785, max=0.785, sum=1.569 (2)",
- "tab": "Efficiency",
- "score": 0.7847084699724822
- },
- "Human Sexuality - Observed inference time (s)": {
- "description": "min=0.811, mean=0.811, max=0.811, sum=1.622 (2)",
- "tab": "Efficiency",
- "score": 0.8110958565282458
- },
- "Human Aging - # eval": {
- "description": "min=223, mean=223, max=223, sum=446 (2)",
- "tab": "General information",
- "score": 223.0
- },
- "Human Aging - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Human Aging - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Human Aging - # prompt tokens": {
- "description": "min=344.955, mean=344.955, max=344.955, sum=689.91 (2)",
- "tab": "General information",
- "score": 344.95515695067263
- },
- "Human Aging - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "Human Sexuality - # eval": {
- "description": "min=131, mean=131, max=131, sum=262 (2)",
- "tab": "General information",
- "score": 131.0
- },
- "Human Sexuality - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Human Sexuality - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Human Sexuality - # prompt tokens": {
- "description": "min=380.496, mean=380.496, max=380.496, sum=760.992 (2)",
- "tab": "General information",
- "score": 380.4961832061069
- },
- "Human Sexuality - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "human_sexuality",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_human_sexuality"
- }
- }
- },
- {
- "evaluation_name": "International Law",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on International Law",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.959,
- "details": {
- "description": "min=0.959, mean=0.959, max=0.959, sum=1.917 (2)",
- "tab": "Accuracy",
- "International Law - Observed inference time (s)": {
- "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)",
- "tab": "Efficiency",
- "score": 0.8220856209431798
- },
- "International Law - # eval": {
- "description": "min=121, mean=121, max=121, sum=242 (2)",
- "tab": "General information",
- "score": 121.0
- },
- "International Law - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "International Law - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "International Law - # prompt tokens": {
- "description": "min=673.165, mean=673.165, max=673.165, sum=1346.331 (2)",
- "tab": "General information",
- "score": 673.1652892561983
- },
- "International Law - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "international_law",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_international_law"
- }
- }
- },
- {
- "evaluation_name": "Logical Fallacies",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Logical Fallacies",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.926,
- "details": {
- "description": "min=0.926, mean=0.926, max=0.926, sum=1.853 (2)",
- "tab": "Accuracy",
- "Logical Fallacies - Observed inference time (s)": {
- "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)",
- "tab": "Efficiency",
- "score": 0.778087305876375
- },
- "Logical Fallacies - # eval": {
- "description": "min=163, mean=163, max=163, sum=326 (2)",
- "tab": "General information",
- "score": 163.0
- },
- "Logical Fallacies - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Logical Fallacies - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Logical Fallacies - # prompt tokens": {
- "description": "min=479.276, mean=479.276, max=479.276, sum=958.552 (2)",
- "tab": "General information",
- "score": 479.2760736196319
- },
- "Logical Fallacies - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "logical_fallacies",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_logical_fallacies"
- }
- }
- },
- {
- "evaluation_name": "Machine Learning",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Machine Learning",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.786,
- "details": {
- "description": "min=0.786, mean=0.786, max=0.786, sum=1.571 (2)",
- "tab": "Accuracy",
- "Machine Learning - Observed inference time (s)": {
- "description": "min=0.81, mean=0.81, max=0.81, sum=1.619 (2)",
- "tab": "Efficiency",
- "score": 0.809621695961271
- },
- "Machine Learning - # eval": {
- "description": "min=112, mean=112, max=112, sum=224 (2)",
- "tab": "General information",
- "score": 112.0
- },
- "Machine Learning - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Machine Learning - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Machine Learning - # prompt tokens": {
- "description": "min=685.518, mean=685.518, max=685.518, sum=1371.036 (2)",
- "tab": "General information",
- "score": 685.5178571428571
- },
- "Machine Learning - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "machine_learning",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_machine_learning"
- }
- }
- },
- {
- "evaluation_name": "Management",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
"metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.883 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", - "tab": "Efficiency", - "score": 0.8480523350169358 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=310.282, mean=310.282, max=310.282, sum=620.563 (2)", - "tab": "General information", - "score": 310.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.897 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=2.55, mean=2.55, max=2.55, sum=5.1 (2)", - "tab": "Efficiency", - "score": 2.550003965695699 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=457.064, mean=457.064, max=457.064, sum=914.128 (2)", - "tab": "General information", - "score": 457.06410256410254 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.98, - "details": { - "description": 
"min=0.98, mean=0.98, max=0.98, sum=1.96 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=3.433, mean=3.433, max=3.433, sum=6.867 (2)", - "tab": "Efficiency", - "score": 3.4333492875099183 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=363.88, mean=363.88, max=363.88, sum=727.76 (2)", - "tab": "General information", - "score": 363.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=1.923 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=1.474, mean=1.474, max=1.474, sum=2.949 (2)", - "tab": "Efficiency", - "score": 1.4744500937285248 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=337.628, mean=337.628, max=337.628, sum=675.257 (2)", - "tab": "General information", - "score": 337.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.763 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.817, mean=0.817, max=0.817, 
sum=1.635 (2)", - "tab": "Efficiency", - "score": 0.8173547728213272 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=1.043, mean=1.043, max=1.043, sum=2.085 (2)", - "tab": "Efficiency", - "score": 1.0425983404980026 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=520.789, mean=520.789, max=520.789, sum=1041.578 (2)", - "tab": "General information", - "score": 520.7890173410404 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=685.949, mean=685.949, max=685.949, sum=1371.897 (2)", - "tab": "General information", - "score": 685.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.912, - "details": { - "description": "min=0.912, mean=0.912, max=0.912, sum=1.824 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.987, mean=0.987, max=0.987, sum=1.973 (2)", - "tab": "Efficiency", - "score": 0.9867353338042116 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=626.065, mean=626.065, max=626.065, sum=1252.131 (2)", - "tab": "General information", - "score": 626.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.951, - "details": { - "description": "min=0.951, mean=0.951, max=0.951, sum=1.901 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.887, mean=0.887, max=0.887, sum=1.775 (2)", - "tab": "Efficiency", - "score": 0.8874673313564725 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=554.639, mean=554.639, max=554.639, sum=1109.278 (2)", - "tab": "General information", - "score": 554.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.709 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=1.124, mean=1.124, max=1.124, sum=2.248 (2)", - "tab": "Efficiency", - "score": 1.1237782673402266 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=441.991, mean=441.991, max=441.991, sum=883.982 (2)", - "tab": "General information", - "score": 441.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.755 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=1.219, mean=1.219, max=1.219, sum=2.438 (2)", - "tab": "Efficiency", - "score": 1.2191707075858602 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1252.804, mean=1252.804, max=1252.804, sum=2505.608 (2)", - "tab": "General information", - "score": 1252.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=1.141, mean=1.141, max=1.141, sum=2.282 (2)", - "tab": "Efficiency", - "score": 1.141001319410789 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=476.274, mean=476.274, max=476.274, sum=952.547 (2)", - "tab": "General information", - "score": 476.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602, - "details": { - "description": "min=0.602, mean=0.602, max=0.602, sum=1.205 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=1.15, mean=1.15, max=1.15, sum=2.3 (2)", - "tab": "Efficiency", - "score": 1.1499209547617348 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=371.651, mean=371.651, max=371.651, sum=743.301 (2)", - "tab": "General information", - "score": 371.65060240963857 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.848 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=1.201, mean=1.201, max=1.201, sum=2.402 (2)", - "tab": "Efficiency", - "score": 1.200854153661003 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=302.018, mean=302.018, max=302.018, sum=604.035 (2)", - "tab": "General information", - "score": 302.0175438596491 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.17, - "details": { - "tab": 
"Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json deleted file mode 100644 index 35be68aa6..000000000 --- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20241022/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Sonnet 20241022", - "id": "anthropic/claude-3-5-sonnet-20241022", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.873, - "details": { - "description": "min=0.584, mean=0.873, max=0.984, sum=99.491 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.615, mean=0.688, max=1.002, sum=78.403 (114)", - "tab": "Efficiency", - "score": 0.6877486861856626 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=293.018, mean=638.288, max=2887.576, sum=72764.875 (114)", - "tab": "General information", - "score": 638.2883793758953 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - 
"jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.345 (2)", - "tab": "Efficiency", - "score": 0.672634687423706 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=370.26, mean=370.26, max=370.26, sum=740.52 (2)", - "tab": "General information", - "score": 370.26 - }, - "Abstract Algebra - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.719 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.654, mean=0.654, max=0.654, sum=1.308 (2)", - "tab": "Efficiency", - "score": 0.653886115109479 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=370.8, mean=370.8, max=370.8, sum=741.6 (2)", - "tab": "General information", - "score": 370.8 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.775, mean=0.775, max=0.775, sum=1.549 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.379 (2)", - "tab": "Efficiency", - "score": 0.6893502926826477 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Efficiency", - "score": 0.6600197752316793 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.345 (2)", - "tab": "Efficiency", - "score": 0.6726715517044067 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.378 (2)", - "tab": "Efficiency", - "score": 0.6890151953697204 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.668, mean=0.668, max=0.668, sum=1.337 (2)", - "tab": "Efficiency", - "score": 0.6682831924085673 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.704, mean=0.704, max=0.704, sum=1.407 (2)", - "tab": "Efficiency", - "score": 0.7037388226565193 - }, - 
"College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=550.01, mean=550.01, max=550.01, sum=1100.02 (2)", - "tab": "General information", - "score": 550.01 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=490.347, mean=490.347, max=490.347, sum=980.694 (2)", - "tab": "General information", - "score": 490.34722222222223 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.24, mean=838.24, max=838.24, sum=1676.48 (2)", - "tab": "General information", - "score": 838.24 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=604.19, mean=604.19, max=604.19, sum=1208.38 (2)", - "tab": "General information", - "score": 604.19 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=540.63, mean=540.63, 
max=540.63, sum=1081.26 (2)", - "tab": "General information", - "score": 540.6300578034682 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=489.48, mean=489.48, max=489.48, sum=978.961 (2)", - "tab": "General information", - "score": 489.48039215686276 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.661, mean=0.661, max=0.661, sum=1.322 (2)", - "tab": "Efficiency", - "score": 0.6610880661010742 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=398.62, mean=398.62, max=398.62, sum=797.24 (2)", - "tab": "General information", - "score": 398.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.684, mean=0.684, 
max=0.684, sum=1.367 (2)", - "tab": "Efficiency", - "score": 0.6837067018475449 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=619.596, mean=619.596, max=619.596, sum=1239.193 (2)", - "tab": "General information", - "score": 619.5964912280701 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.637, mean=0.637, max=0.637, sum=1.274 (2)", - "tab": "Efficiency", - "score": 0.6369614601135254 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=411.61, mean=411.61, max=411.61, sum=823.22 (2)", - "tab": "General information", - "score": 411.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Efficiency", - "score": 0.6427947613928053 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - 
}, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=431.426, mean=431.426, max=431.426, sum=862.852 (2)", - "tab": "General information", - "score": 431.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.891, - "details": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.645, mean=0.645, max=0.645, sum=1.291 (2)", - "tab": "Efficiency", - "score": 0.6454648833566157 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=359.965, mean=359.965, max=359.965, sum=719.929 (2)", - "tab": "General information", - "score": 359.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=1.843 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.622, mean=0.622, max=0.622, sum=1.243 (2)", - "tab": "Efficiency", - "score": 0.6215311034637339 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Efficiency", - "score": 0.6900012104223806 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.002, mean=1.002, max=1.002, sum=2.004 (2)", - "tab": "Efficiency", - 
"score": 1.002109061319483 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.364 (2)", - "tab": "Efficiency", - "score": 0.6821525521527708 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1123.537, mean=1123.537, max=1123.537, sum=2247.074 (2)", - "tab": "General information", - "score": 1123.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=665.422, mean=665.422, max=665.422, sum=1330.844 (2)", - "tab": "General information", - "score": 665.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1701.16, mean=1701.16, max=1701.16, sum=3402.321 (2)", - "tab": "General information", - "score": 1701.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=603.168, mean=603.168, max=603.168, sum=1206.337 (2)", - "tab": "General information", - "score": 603.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" 
- } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Efficiency", - "score": 0.660010986328125 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=455.25, mean=455.25, max=455.25, sum=910.5 (2)", - "tab": "General information", - "score": 455.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.974, - "details": { - "description": "min=0.974, mean=0.974, max=0.974, sum=1.947 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.344 (2)", - "tab": "Efficiency", - "score": 0.6717779793237385 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=604.493, mean=604.493, max=604.493, sum=1208.987 (2)", - "tab": "General information", - "score": 604.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.302 (2)", - "tab": "Efficiency", - "score": 0.6511244606971741 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=600.02, mean=600.02, max=600.02, sum=1200.04 (2)", - "tab": "General information", - "score": 600.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.857 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", - "tab": "Efficiency", - "score": 0.6499361712977572 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=429.457, mean=429.457, max=429.457, sum=858.913 (2)", - "tab": "General information", - "score": 429.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.906, - "details": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.813 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.615, mean=0.615, max=0.615, sum=1.229 (2)", - "tab": "Efficiency", - "score": 0.6146096341153409 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=323.536, mean=323.536, max=323.536, sum=647.072 (2)", - "tab": "General information", - "score": 323.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.697 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.646, mean=0.646, max=0.646, sum=1.292 (2)", - "tab": "Efficiency", - "score": 0.6462178690680143 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=437.041, mean=437.041, max=437.041, sum=874.083 (2)", - "tab": "General information", - "score": 437.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.918, - "details": { - "description": "min=0.918, mean=0.918, max=0.918, sum=1.836 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Efficiency", - "score": 0.7089652012264918 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=511.066, mean=511.066, max=511.066, sum=1022.132 (2)", - "tab": "General information", - "score": 511.06613756613757 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.571 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.692, mean=0.692, max=0.692, sum=1.384 (2)", - "tab": "Efficiency", - "score": 0.691912295326354 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=646.746, mean=646.746, max=646.746, sum=1293.492 (2)", - "tab": "General information", - "score": 646.7460317460317 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.958, - "details": { - "description": "min=0.958, mean=0.958, max=0.958, sum=1.916 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.669, mean=0.669, max=0.669, sum=1.338 (2)", - "tab": "Efficiency", - "score": 0.6689629408621018 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.346 (2)", - "tab": "Efficiency", - "score": 0.6729868444903143 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.678, mean=0.678, max=0.678, sum=1.356 (2)", - "tab": "Efficiency", - "score": 0.677822756767273 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.395 (2)", - "tab": "Efficiency", - "score": 0.6973154544830322 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.281 (2)", - "tab": "Efficiency", - "score": 0.6404741051221134 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.661, mean=0.661, max=0.661, sum=1.323 (2)", - "tab": "Efficiency", - "score": 0.6613641341115527 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.631, mean=0.631, max=0.631, sum=1.261 (2)", - "tab": "Efficiency", - "score": 0.6305418686989026 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.668, mean=0.668, max=0.668, sum=1.336 (2)", - "tab": "Efficiency", - "score": 0.6677727399048982 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.312 (2)", - "tab": "Efficiency", - "score": 0.6559101263014209 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.676, mean=0.676, max=0.676, sum=1.353 (2)", - "tab": "Efficiency", - "score": 0.6763939494328783 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.342 (2)", - "tab": "Efficiency", - "score": 0.6708623107420195 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Efficiency", - "score": 0.7019402329568509 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.646, mean=0.646, max=0.646, sum=1.293 (2)", - "tab": "Efficiency", - "score": 0.6463189136748221 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.695, mean=0.695, max=0.695, sum=1.39 (2)", - "tab": "Efficiency", - "score": 0.6947573730211217 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=534.577, mean=534.577, max=534.577, sum=1069.155 (2)", - "tab": "General information", - "score": 534.5774193548388 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=497.921, mean=497.921, max=497.921, sum=995.842 (2)", - "tab": "General information", - "score": 497.92118226600985 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=882.4, mean=882.4, max=882.4, sum=1764.8 (2)", - "tab": "General information", - "score": 882.4 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2887.576, mean=2887.576, max=2887.576, sum=5775.152 (2)", - "tab": "General information", - "score": 2887.5757575757575 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=412.268, mean=412.268, max=412.268, sum=824.535 (2)", - "tab": "General information", - "score": 412.2676767676768 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=491.104, mean=491.104, max=491.104, sum=982.207 (2)", - "tab": "General information", - "score": 491.10362694300517 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=406.036, mean=406.036, max=406.036, sum=812.072 (2)", - "tab": "General information", - "score": 406.0358974358974 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=519.881, mean=519.881, max=519.881, sum=1039.763 (2)", - "tab": "General information", - "score": 519.8814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=420.513, mean=420.513, max=420.513, sum=841.025 (2)", - "tab": "General information", - "score": 420.5126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=558.841, mean=558.841, max=558.841, sum=1117.682 (2)", - "tab": "General information", - "score": 
558.841059602649 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=521.42, mean=521.42, max=521.42, sum=1042.84 (2)", - "tab": "General information", - "score": 521.4201834862386 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=806.963, mean=806.963, max=806.963, sum=1613.926 (2)", - "tab": "General information", - "score": 806.9629629629629 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2288.49, mean=2288.49, max=2288.49, sum=4576.98 (2)", - "tab": "General information", - "score": 2288.4901960784314 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1475.932, mean=1475.932, max=1475.932, sum=2951.865 (2)", - "tab": "General information", - "score": 1475.9324894514768 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, 
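
Each evaluation record deleted in this hunk has the same shape: a top-level "score_details.score", plus a "details" map whose entries pair a numeric "score" with a summary "description" string of the form "min=..., mean=..., max=..., sum=... (n)". For readers who still need to consume copies of these files, a minimal Python sketch of reading one back follows; the parse_stats helper and its regular expression are illustrative assumptions, not part of the HELM tooling, and the file name is one of the paths this patch removes.

    import json
    import re

    # Summary strings in these files look like:
    #   "min=0.906, mean=0.906, max=0.906, sum=1.813 (2)"
    STATS = re.compile(
        r"min=(?P<min>[\d.]+), mean=(?P<mean>[\d.]+), "
        r"max=(?P<max>[\d.]+), sum=(?P<sum>[\d.]+) \((?P<n>\d+)\)"
    )

    def parse_stats(description):
        """Parse a 'min=..., mean=..., max=..., sum=... (n)' summary string."""
        match = STATS.search(description)
        if match is None:
            raise ValueError("unrecognized description: %r" % description)
        stats = {name: float(value) for name, value in match.groupdict().items()}
        stats["n"] = int(stats["n"])
        return stats

    # One of the per-model result files removed by this patch.
    with open("f397ca7a-41c4-4926-b075-2523639f0a50.json") as fh:
        doc = json.load(fh)

    for record in doc["evaluation_results"]:
        details = record["score_details"]["details"]
        if "description" in details:  # e.g. "Mean win rate" carries no summary string
            stats = parse_stats(details["description"])
            print(record["evaluation_name"], record["score_details"]["score"], stats["mean"])

The per-subject entries in the same "details" map (keys such as "Formal Logic - # eval") use the identical summary format, so the same helper applies to their nested "description" fields; records like "Mean win rate" have no summary string, which is why the sketch checks before parsing.
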
- { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.939, - "details": { - "description": "min=0.939, mean=0.939, max=0.939, sum=1.878 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.312 (2)", - "tab": "Efficiency", - "score": 0.6560797862407872 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.372 (2)", - "tab": "Efficiency", - "score": 0.6857976003457572 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=335.955, mean=335.955, max=335.955, sum=671.91 (2)", - "tab": "General information", - "score": 335.95515695067263 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=371.496, mean=371.496, max=371.496, sum=742.992 (2)", - "tab": "General information", - "score": 371.4961832061069 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.959, - "details": { - "description": "min=0.959, mean=0.959, max=0.959, sum=1.917 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.426 (2)", - "tab": "Efficiency", - "score": 0.7129175268914089 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=664.165, mean=664.165, max=664.165, sum=1328.331 (2)", - "tab": "General information", - "score": 664.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.828 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.642 (2)", - "tab": "Efficiency", - "score": 0.8211235926926501 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=470.276, mean=470.276, max=470.276, sum=940.552 (2)", - "tab": "General information", - "score": 470.2760736196319 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.839, - "details": { - "description": "min=0.839, mean=0.839, max=0.839, sum=1.679 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.393 (2)", - "tab": "Efficiency", - "score": 0.69659323990345 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=676.518, mean=676.518, max=676.518, sum=1353.036 (2)", - "tab": "General information", - "score": 676.5178571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.864 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Efficiency", - "score": 0.7021607287879129 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=301.282, mean=301.282, max=301.282, sum=602.563 (2)", - "tab": "General information", - "score": 301.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.953, - "details": { - "description": "min=0.953, mean=0.953, max=0.953, sum=1.906 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Efficiency", - "score": 0.8333144401892637 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=448.064, mean=448.064, 
max=448.064, sum=896.128 (2)", - "tab": "General information", - "score": 448.06410256410254 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", - "tab": "Efficiency", - "score": 0.7894818639755249 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=354.88, mean=354.88, max=354.88, sum=709.76 (2)", - "tab": "General information", - "score": 354.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.964, - "details": { - "description": "min=0.964, mean=0.964, max=0.964, sum=1.928 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.606 (2)", - "tab": "Efficiency", - "score": 0.8030681811073274 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=328.628, mean=328.628, max=328.628, sum=657.257 (2)", - "tab": "General information", - "score": 328.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 
(2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.888, - "details": { - "description": "min=0.888, mean=0.888, max=0.888, sum=1.777 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.698, mean=0.698, max=0.698, sum=1.397 (2)", - "tab": "Efficiency", - "score": 0.6983739172103088 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.393 (2)", - "tab": "Efficiency", - "score": 0.6965836058781799 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=511.789, mean=511.789, max=511.789, sum=1023.578 (2)", - "tab": "General information", - "score": 511.78901734104045 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=676.949, mean=676.949, max=676.949, sum=1353.897 (2)", - "tab": "General information", - "score": 676.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=1.843 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.695, mean=0.695, max=0.695, 
sum=1.389 (2)", - "tab": "Efficiency", - "score": 0.6946531822478849 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=617.065, mean=617.065, max=617.065, sum=1234.131 (2)", - "tab": "General information", - "score": 617.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.883 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.365 (2)", - "tab": "Efficiency", - "score": 0.6824756529596117 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=545.639, mean=545.639, max=545.639, sum=1091.278 (2)", - "tab": "General information", - "score": 545.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.626, mean=0.626, max=0.626, sum=1.252 (2)", - "tab": "Efficiency", - "score": 0.6258317015387795 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - 
# train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=432.991, mean=432.991, max=432.991, sum=865.982 (2)", - "tab": "General information", - "score": 432.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.763 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.744, mean=0.744, max=0.744, sum=1.489 (2)", - "tab": "Efficiency", - "score": 0.7442785263061523 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1243.804, mean=1243.804, max=1243.804, sum=2487.608 (2)", - "tab": "General information", - "score": 1243.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.955, - "details": { - "description": "min=0.955, mean=0.955, max=0.955, sum=1.91 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.695, mean=0.695, max=0.695, sum=1.389 (2)", - "tab": "Efficiency", - "score": 0.6946055438388047 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=467.274, mean=467.274, max=467.274, sum=934.547 (2)", - "tab": "General information", - "score": 467.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.68, mean=0.68, max=0.68, sum=1.361 (2)", - "tab": "Efficiency", - "score": 0.6803859400461956 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=362.651, mean=362.651, max=362.651, sum=725.301 (2)", - "tab": "General information", - "score": 362.65060240963857 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.801 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.301 (2)", - "tab": "Efficiency", - "score": 0.6505623017138208 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=293.018, mean=293.018, max=293.018, sum=586.035 (2)", - "tab": "General 
information",
- "score": 293.0175438596491
- },
- "World Religions - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "world_religions",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_world_religions"
- }
- }
- },
- {
- "evaluation_name": "Mean win rate",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "How many models this model outperforms on average (over columns).",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.311,
- "details": {
- "tab": "Efficiency"
- }
- },
- "generation_config": {
- "additional_details": {}
- }
- }
- ]
-}
\ No newline at end of file
diff --git a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json b/data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json
deleted file mode 100644
index 969900aba..000000000
--- a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{
- "schema_version": "0.2.0",
- "evaluation_id": "helm_mmlu/anthropic_claude-3-haiku-20240307/1770835937.459157",
- "retrieved_timestamp": "1770835937.459157",
- "source_metadata": {
- "source_name": "helm_mmlu",
- "source_type": "documentation",
- "source_organization_name": "crfm",
- "evaluator_relationship": "third_party"
- },
- "model_info": {
- "name": "Claude 3 Haiku 20240307",
- "id": "anthropic/claude-3-haiku-20240307",
- "developer": "anthropic",
- "inference_platform": "unknown"
- },
- "evaluation_results": [
- {
- "evaluation_name": "MMLU All Subjects",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on MMLU All Subjects",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.738,
- "details": {
- "description": "min=0.37, mean=0.738, max=0.95, sum=84.132 (114)",
- "tab": "Accuracy",
- "MMLU All Subjects - Observed inference time (s)": {
- "description": "min=0.662, mean=0.734, max=1.711, sum=83.657 (114)",
- "tab": "Efficiency",
- "score": 0.7338373689865249
- },
- "MMLU All Subjects - # eval": {
- "description": "min=100, mean=246.351, max=1534, sum=28084 (114)",
- "tab": "General information",
- "score": 246.35087719298247
- },
- "MMLU All Subjects - # train": {
- "description": "min=5, mean=5, max=5, sum=570 (114)",
- "tab": "General information",
- "score": 5.0
- },
- "MMLU All Subjects - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (114)",
- "tab": "General information",
- "score": 0.0
- },
- "MMLU All Subjects - # prompt tokens": {
- "description": "min=293.018, mean=638.288, max=2887.576, sum=72764.875 (114)",
- "tab": "General information",
- "score": 638.2883793758953
- },
- "MMLU All Subjects - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=114 (114)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
-
"additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42, - "details": { - 
"description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.386 (2)", - "tab": "Efficiency", - "score": 0.6928385472297669 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=370.26, mean=370.26, max=370.26, sum=740.52 (2)", - "tab": "General information", - "score": 370.26 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.422 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.668, mean=0.668, max=0.668, sum=1.336 (2)", - "tab": "Efficiency", - "score": 0.6677785749788637 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=370.8, mean=370.8, max=370.8, sum=741.6 (2)", - "tab": "General information", - "score": 370.8 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.961 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.692, mean=0.692, max=0.692, sum=1.385 (2)", - "tab": "Efficiency", - "score": 
0.6923453903198242 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.405 (2)", - "tab": "Efficiency", - "score": 0.7022541695170932 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.735, mean=0.735, max=0.735, sum=1.47 (2)", - "tab": "Efficiency", - "score": 0.7352152991294861 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.715, mean=0.715, max=0.715, sum=1.43 (2)", - "tab": "Efficiency", - "score": 0.7152474927902222 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.425 (2)", - "tab": "Efficiency", - "score": 0.7125603780581083 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.453 (2)", - "tab": "Efficiency", - "score": 0.7264628340216244 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=550.01, mean=550.01, max=550.01, sum=1100.02 (2)", - "tab": "General information", - "score": 550.01 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=490.347, mean=490.347, max=490.347, sum=980.694 (2)", - "tab": "General information", - "score": 490.34722222222223 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.24, mean=838.24, max=838.24, sum=1676.48 (2)", - "tab": "General information", - "score": 838.24 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=604.19, mean=604.19, max=604.19, sum=1208.38 (2)", - "tab": "General information", - "score": 604.19 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=540.63, mean=540.63, max=540.63, sum=1081.26 (2)", - "tab": "General information", - "score": 540.6300578034682 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=489.48, mean=489.48, max=489.48, sum=978.961 (2)", - "tab": "General information", - "score": 489.48039215686276 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.371 (2)", - "tab": "Efficiency", - "score": 0.6855517983436584 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=398.62, mean=398.62, max=398.62, sum=797.24 (2)", - "tab": "General information", - "score": 398.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.632, - "details": { - "description": "min=0.632, mean=0.632, max=0.632, sum=1.263 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.721, mean=0.721, max=0.721, sum=1.442 (2)", - "tab": "Efficiency", - "score": 0.720871933719568 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=619.596, mean=619.596, max=619.596, sum=1239.193 (2)", - "tab": "General information", - "score": 619.5964912280701 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47, - "details": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.342 (2)", - "tab": "Efficiency", - "score": 0.6710420751571655 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=411.61, mean=411.61, max=411.61, sum=823.22 (2)", - "tab": "General information", - "score": 411.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.435 (2)", - "tab": "Efficiency", - "score": 0.7174532214800516 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=431.426, mean=431.426, max=431.426, sum=862.852 (2)", - "tab": "General information", - "score": 431.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.814, - "details": { - "description": "min=0.814, mean=0.814, max=0.814, sum=1.627 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.405 (2)", - "tab": "Efficiency", - "score": 0.7023597537896258 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=359.965, mean=359.965, max=359.965, sum=719.929 (2)", - "tab": "General information", - "score": 359.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.605 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.572 (2)", - "tab": "Efficiency", - "score": 0.7859190036268795 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.771, mean=0.771, max=0.771, sum=1.542 (2)", - "tab": "Efficiency", - "score": 0.7710303414797952 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.652 (2)", - "tab": "Efficiency", - "score": 0.8259650812310687 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=1.711, mean=1.711, max=1.711, sum=3.422 (2)", - "tab": "Efficiency", - "score": 1.7109862737406314 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1123.537, mean=1123.537, max=1123.537, sum=2247.074 (2)", - "tab": "General information", - "score": 1123.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=665.422, mean=665.422, max=665.422, sum=1330.844 (2)", - "tab": "General information", - "score": 665.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1701.16, mean=1701.16, max=1701.16, sum=3402.321 (2)", - "tab": "General information", - "score": 1701.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional 
Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=603.168, mean=603.168, max=603.168, sum=1206.337 (2)", - "tab": "General information", - "score": 603.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.388 (2)", - "tab": "Efficiency", - "score": 0.6937756729125977 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=455.25, mean=455.25, max=455.25, sum=910.5 (2)", - "tab": "General information", - "score": 455.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.803 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.707, mean=0.707, max=0.707, sum=1.415 (2)", - "tab": "Efficiency", - "score": 0.7072845524863193 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - 
"score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=604.493, mean=604.493, max=604.493, sum=1208.987 (2)", - "tab": "General information", - "score": 604.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.705, mean=0.705, max=0.705, sum=1.411 (2)", - "tab": "Efficiency", - "score": 0.7054399585723877 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=600.02, mean=600.02, max=600.02, sum=1200.04 (2)", - "tab": "General information", - "score": 600.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.577 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.716, mean=0.716, max=0.716, sum=1.432 (2)", - "tab": "Efficiency", - "score": 0.7159239804969644 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=429.457, mean=429.457, max=429.457, sum=858.913 (2)", - "tab": "General information", - "score": 429.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.715, - "details": { - "description": "min=0.715, mean=0.715, max=0.715, sum=1.43 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.373 (2)", - "tab": "Efficiency", - "score": 0.686391481440118 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=323.536, mean=323.536, max=323.536, sum=647.072 (2)", - "tab": "General information", - "score": 323.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.379 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Efficiency", - "score": 0.6958530524681354 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=437.041, mean=437.041, max=437.041, sum=874.083 (2)", - "tab": "General information", - "score": 437.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.558, - "details": { - "description": "min=0.558, mean=0.558, max=0.558, sum=1.116 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.734, mean=0.734, max=0.734, sum=1.468 (2)", - "tab": "Efficiency", - "score": 0.73423323177156 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=511.066, mean=511.066, max=511.066, sum=1022.132 (2)", - "tab": "General information", - "score": 511.06613756613757 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.462 (2)", - "tab": "Efficiency", - "score": 0.7307745880550809 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic 
- # prompt tokens": { - "description": "min=646.746, mean=646.746, max=646.746, sum=1293.492 (2)", - "tab": "General information", - "score": 646.7460317460317 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.755 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.714, mean=0.714, max=0.714, sum=1.428 (2)", - "tab": "Efficiency", - "score": 0.7141557578117617 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.701, mean=0.701, max=0.701, sum=1.403 (2)", - "tab": "Efficiency", - "score": 0.7014370187750003 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.747, mean=0.747, max=0.747, sum=1.494 (2)", - "tab": "Efficiency", - "score": 0.7470939707756042 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.966, mean=0.966, max=0.966, sum=1.932 (2)", - "tab": "Efficiency", - "score": 0.9658473159327652 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.663, mean=0.663, max=0.663, sum=1.326 (2)", - "tab": "Efficiency", - "score": 0.6627856938525883 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.373 (2)", - "tab": "Efficiency", - "score": 0.6863837884497767 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.681, mean=0.681, max=0.681, sum=1.361 (2)", - "tab": "Efficiency", - "score": 0.6806940922370324 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.708, mean=0.708, max=0.708, sum=1.416 (2)", - "tab": "Efficiency", - "score": 0.7079638242721558 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=1.348 (2)", - "tab": "Efficiency", - "score": 0.6742001541522371 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.411 (2)", - "tab": "Efficiency", - "score": 0.7056786966639639 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Efficiency", - "score": 0.6960603683366688 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.452 (2)", - "tab": "Efficiency", - "score": 0.7262004735293212 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.752 (2)", - "tab": "Efficiency", - "score": 0.8757836842069439 - }, - "High School 
World History - Observed inference time (s)": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.571 (2)", - "tab": "Efficiency", - "score": 0.7852678007214381 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=534.577, mean=534.577, max=534.577, sum=1069.155 (2)", - "tab": "General information", - "score": 534.5774193548388 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=497.921, mean=497.921, max=497.921, sum=995.842 (2)", - "tab": "General information", - "score": 497.92118226600985 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=882.4, mean=882.4, max=882.4, sum=1764.8 (2)", - "tab": "General information", - "score": 882.4 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2887.576, mean=2887.576, max=2887.576, sum=5775.152 (2)", - "tab": "General information", - "score": 2887.5757575757575 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=412.268, mean=412.268, max=412.268, sum=824.535 (2)", - "tab": "General information", - "score": 412.2676767676768 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=491.104, mean=491.104, max=491.104, sum=982.207 (2)", - "tab": "General information", - "score": 491.10362694300517 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=406.036, mean=406.036, max=406.036, sum=812.072 (2)", - "tab": "General information", - "score": 406.0358974358974 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=519.881, mean=519.881, max=519.881, sum=1039.763 (2)", - "tab": "General information", - "score": 519.8814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": 
"min=420.513, mean=420.513, max=420.513, sum=841.025 (2)", - "tab": "General information", - "score": 420.5126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=558.841, mean=558.841, max=558.841, sum=1117.682 (2)", - "tab": "General information", - "score": 558.841059602649 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=521.42, mean=521.42, max=521.42, sum=1042.84 (2)", - "tab": "General information", - "score": 521.4201834862386 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=806.963, mean=806.963, max=806.963, sum=1613.926 (2)", - "tab": "General information", - "score": 806.9629629629629 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2288.49, mean=2288.49, max=2288.49, sum=4576.98 (2)", - "tab": "General information", - "score": 2288.4901960784314 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 
237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1475.932, mean=1475.932, max=1475.932, sum=2951.865 (2)", - "tab": "General information", - "score": 1475.9324894514768 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.649 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.382 (2)", - "tab": "Efficiency", - "score": 0.6907867818669888 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.665, mean=0.665, max=0.665, sum=1.331 (2)", - "tab": "Efficiency", - "score": 0.6653509722411177 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=335.955, mean=335.955, max=335.955, sum=671.91 (2)", - "tab": "General information", - "score": 335.95515695067263 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=371.496, mean=371.496, max=371.496, sum=742.992 (2)", - "tab": "General information", - "score": 371.4961832061069 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": 
{ - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.723, mean=0.723, max=0.723, sum=1.446 (2)", - "tab": "Efficiency", - "score": 0.7232089219999708 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=664.165, mean=664.165, max=664.165, sum=1328.331 (2)", - "tab": "General information", - "score": 664.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.583 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.665, mean=0.665, max=0.665, sum=1.331 (2)", - "tab": "Efficiency", - "score": 0.6653785354520646 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=470.276, mean=470.276, max=470.276, sum=940.552 (2)", - "tab": "General information", - "score": 470.2760736196319 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589, - "details": { - "description": "min=0.589, mean=0.589, max=0.589, sum=1.179 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.722, mean=0.722, max=0.722, sum=1.444 (2)", - "tab": "Efficiency", - "score": 0.7220823402915683 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=676.518, mean=676.518, max=676.518, sum=1353.036 (2)", - "tab": "General information", - "score": 676.5178571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.874, - "details": { - "description": "min=0.874, mean=0.874, max=0.874, sum=1.748 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.663, mean=0.663, max=0.663, sum=1.327 (2)", - "tab": "Efficiency", - "score": 0.6634428709456064 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=301.282, mean=301.282, max=301.282, sum=602.563 (2)", - "tab": "General information", - "score": 301.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.665, mean=0.665, max=0.665, sum=1.33 (2)", - "tab": "Efficiency", - "score": 0.6648106361046816 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=448.064, mean=448.064, max=448.064, sum=896.128 (2)", - "tab": "General information", - "score": 448.06410256410254 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.662, mean=0.662, max=0.662, sum=1.324 (2)", - "tab": "Efficiency", - "score": 0.6621059203147888 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=354.88, mean=354.88, max=354.88, sum=709.76 (2)", - "tab": "General information", - "score": 354.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.785 
(2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.678, mean=0.678, max=0.678, sum=1.357 (2)", - "tab": "Efficiency", - "score": 0.6782779660109207 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=328.628, mean=328.628, max=328.628, sum=657.257 (2)", - "tab": "General information", - "score": 328.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.003 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.419 (2)", - "tab": "Efficiency", - "score": 0.7093146880927114 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.716, mean=0.716, max=0.716, sum=1.432 (2)", - "tab": "Efficiency", - "score": 0.7158833943265777 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=511.789, mean=511.789, max=511.789, sum=1023.578 (2)", - "tab": "General information", - "score": 511.78901734104045 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=676.949, mean=676.949, max=676.949, sum=1353.897 (2)", - "tab": "General information", - "score": 676.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", 
- "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.441 (2)", - "tab": "Efficiency", - "score": 0.720291394813388 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=617.065, mean=617.065, max=617.065, sum=1234.131 (2)", - "tab": "General information", - "score": 617.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.427 (2)", - "tab": "Efficiency", - "score": 0.7133041966108629 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=545.639, mean=545.639, max=545.639, sum=1091.278 (2)", - "tab": "General information", - "score": 545.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public 
Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.369 (2)", - "tab": "Efficiency", - "score": 0.6844336206262762 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=432.991, mean=432.991, max=432.991, sum=865.982 (2)", - "tab": "General information", - "score": 432.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.616 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Efficiency", - "score": 0.7701463602027114 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1243.804, mean=1243.804, max=1243.804, sum=2487.608 (2)", - "tab": "General information", - "score": 1243.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Efficiency", - "score": 0.6899205867330827 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=467.274, mean=467.274, max=467.274, sum=934.547 (2)", - "tab": "General information", - "score": 467.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542, - "details": { - "description": "min=0.542, mean=0.542, max=0.542, sum=1.084 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.728, mean=0.728, max=0.728, sum=1.456 (2)", - "tab": "Efficiency", - "score": 0.7279246169400503 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=362.651, mean=362.651, max=362.651, sum=725.301 (2)", - "tab": "General information", - "score": 362.65060240963857 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.454 (2)", - "tab": "Efficiency", - "score": 0.7269549021246837 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=293.018, mean=293.018, max=293.018, sum=586.035 (2)", - "tab": "General information", - "score": 293.0175438596491 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.28, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json b/data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json deleted file mode 100644 index 230be4291..000000000 --- a/data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-opus-20240229/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3 Opus 20240229", - "id": "anthropic/claude-3-opus-20240229", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.846, - "details": { - "description": "min=0.55, mean=0.846, max=0.979, sum=96.412 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=3.782, mean=4.077, max=5.005, sum=464.781 (114)", 
- "tab": "Efficiency", - "score": 4.077024270463863 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=293.018, mean=638.288, max=2887.576, sum=72764.875 (114)", - "tab": "General information", - "score": 638.2883793758953 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - 
"mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.28 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=4.182, mean=4.182, max=4.182, sum=8.364 (2)", - "tab": "Efficiency", - "score": 4.182226595878601 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=370.26, mean=370.26, max=370.26, sum=740.52 (2)", - "tab": "General information", - "score": 370.26 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=4.115, mean=4.115, max=4.115, sum=8.23 (2)", - "tab": "Efficiency", - "score": 4.114818896187677 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=370.8, mean=370.8, max=370.8, sum=741.6 (2)", - "tab": "General information", - "score": 370.8 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.716, - "details": { - "description": "min=0.716, mean=0.716, max=0.716, sum=1.431 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=4.373, mean=4.373, max=4.373, sum=8.745 (2)", - "tab": "Efficiency", - "score": 4.372743592262268 - }, - "College Biology - Observed inference time (s)": { - "description": "min=4.045, mean=4.045, max=4.045, sum=8.09 (2)", - "tab": "Efficiency", - "score": 4.044814482331276 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=4.326, mean=4.326, max=4.326, sum=8.652 (2)", - "tab": "Efficiency", - "score": 4.3260163617134095 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=4.209, mean=4.209, max=4.209, sum=8.417 (2)", - "tab": "Efficiency", - "score": 4.208740277290344 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=3.994, mean=3.994, max=3.994, sum=7.988 (2)", - "tab": "Efficiency", - "score": 3.9939607113082976 - }, - "College Physics - Observed inference time (s)": { - "description": "min=3.982, mean=3.982, max=3.982, sum=7.965 (2)", - "tab": "Efficiency", - "score": 3.9823715172561944 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=550.01, mean=550.01, max=550.01, sum=1100.02 (2)", - "tab": "General information", - "score": 550.01 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=490.347, mean=490.347, max=490.347, sum=980.694 (2)", - "tab": "General information", - "score": 490.34722222222223 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.24, mean=838.24, max=838.24, sum=1676.48 (2)", - "tab": "General information", - "score": 838.24 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=604.19, mean=604.19, max=604.19, sum=1208.38 (2)", - "tab": "General information", - "score": 604.19 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=540.63, mean=540.63, max=540.63, sum=1081.26 (2)", - "tab": "General information", - "score": 540.6300578034682 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=489.48, mean=489.48, max=489.48, sum=978.961 (2)", - "tab": "General information", - "score": 489.48039215686276 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - 
"tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=4.105, mean=4.105, max=4.105, sum=8.211 (2)", - "tab": "Efficiency", - "score": 4.105417683124542 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=398.62, mean=398.62, max=398.62, sum=797.24 (2)", - "tab": "General information", - "score": 398.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=4.284, mean=4.284, max=4.284, sum=8.569 (2)", - "tab": "Efficiency", - "score": 4.284419020016988 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=619.596, mean=619.596, max=619.596, sum=1239.193 (2)", - "tab": "General information", - "score": 619.5964912280701 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=4.232, mean=4.232, max=4.232, sum=8.465 (2)", - "tab": "Efficiency", - "score": 
4.232321140766143 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=411.61, mean=411.61, max=411.61, sum=823.22 (2)", - "tab": "General information", - "score": 411.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=3.872, mean=3.872, max=3.872, sum=7.744 (2)", - "tab": "Efficiency", - "score": 3.8720074185618647 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=431.426, mean=431.426, max=431.426, sum=862.852 (2)", - "tab": "General information", - "score": 431.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=3.967, mean=3.967, max=3.967, sum=7.935 (2)", - "tab": "Efficiency", - "score": 3.9672668930801933 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, 
max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=359.965, mean=359.965, max=359.965, sum=719.929 (2)", - "tab": "General information", - "score": 359.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.904, - "details": { - "description": "min=0.904, mean=0.904, max=0.904, sum=1.807 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=4.358, mean=4.358, max=4.358, sum=8.715 (2)", - "tab": "Efficiency", - "score": 4.357662654974881 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=3.982, mean=3.982, max=3.982, sum=7.965 (2)", - "tab": "Efficiency", - "score": 3.9823869661236486 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=4.483, mean=4.483, max=4.483, sum=8.967 (2)", - "tab": "Efficiency", - "score": 4.483374906953963 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=4.006, mean=4.006, max=4.006, sum=8.012 (2)", - "tab": "Efficiency", - "score": 4.0058385706415365 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1123.537, mean=1123.537, max=1123.537, sum=2247.074 (2)", - "tab": "General information", - "score": 1123.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=665.422, mean=665.422, max=665.422, sum=1330.844 (2)", - "tab": "General information", - "score": 665.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1701.16, mean=1701.16, max=1701.16, sum=3402.321 (2)", - "tab": "General information", - "score": 1701.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=603.168, mean=603.168, max=603.168, sum=1206.337 (2)", - "tab": "General information", - "score": 603.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=4.003, mean=4.003, max=4.003, sum=8.006 (2)", - "tab": "Efficiency", - "score": 4.002964313030243 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=455.25, mean=455.25, max=455.25, sum=910.5 (2)", - "tab": "General information", - "score": 455.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - 
"evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.967, - "details": { - "description": "min=0.967, mean=0.967, max=0.967, sum=1.934 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=4.099, mean=4.099, max=4.099, sum=8.198 (2)", - "tab": "Efficiency", - "score": 4.099087294779326 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=604.493, mean=604.493, max=604.493, sum=1208.987 (2)", - "tab": "General information", - "score": 604.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=4.102, mean=4.102, max=4.102, sum=8.204 (2)", - "tab": "Efficiency", - "score": 4.102163214683532 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=600.02, mean=600.02, max=600.02, sum=1200.04 (2)", - "tab": "General information", - "score": 600.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.879, - "details": { - "description": "min=0.879, mean=0.879, max=0.879, sum=1.758 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=3.976, mean=3.976, max=3.976, sum=7.952 (2)", - "tab": "Efficiency", - "score": 3.9762323631430574 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=429.457, mean=429.457, max=429.457, sum=858.913 (2)", - "tab": "General information", - "score": 429.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.881, - "details": { - "description": "min=0.881, mean=0.881, max=0.881, sum=1.762 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=3.959, mean=3.959, max=3.959, sum=7.918 (2)", - "tab": "Efficiency", - "score": 3.9589331109473047 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=323.536, mean=323.536, max=323.536, sum=647.072 (2)", - "tab": "General information", - "score": 323.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.814, - "details": { - "description": "min=0.814, mean=0.814, max=0.814, sum=1.628 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=4.017, mean=4.017, max=4.017, sum=8.035 (2)", - "tab": "Efficiency", - "score": 4.017465997564382 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=437.041, mean=437.041, max=437.041, sum=874.083 (2)", - "tab": "General information", - "score": 437.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=1.725 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=3.937, mean=3.937, max=3.937, sum=7.874 (2)", - "tab": "Efficiency", - "score": 3.937073076212848 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=511.066, mean=511.066, max=511.066, sum=1022.132 (2)", - "tab": "General information", - "score": 511.06613756613757 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.698, mean=0.698, max=0.698, sum=1.397 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=4.178, mean=4.178, max=4.178, sum=8.356 (2)", - "tab": "Efficiency", - "score": 4.177885971372089 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=646.746, mean=646.746, max=646.746, sum=1293.492 (2)", - "tab": "General information", - "score": 646.7460317460317 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=4.184, mean=4.184, max=4.184, sum=8.368 (2)", - "tab": "Efficiency", - "score": 4.183918527633913 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=4.027, mean=4.027, max=4.027, sum=8.055 (2)", - "tab": "Efficiency", - "score": 4.027491113822449 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=3.929, mean=3.929, max=3.929, sum=7.858 (2)", - "tab": "Efficiency", - "score": 3.929041051864624 - }, - "High School European History - Observed inference time (s)": { - "description": "min=5.005, mean=5.005, max=5.005, sum=10.009 (2)", - "tab": "Efficiency", - "score": 5.004520618554317 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=3.872, mean=3.872, max=3.872, sum=7.743 (2)", - "tab": "Efficiency", - "score": 3.87151506332436 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=3.936, mean=3.936, max=3.936, sum=7.872 (2)", - "tab": "Efficiency", - "score": 3.936160638542373 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=3.782, mean=3.782, max=3.782, sum=7.563 (2)", - "tab": "Efficiency", - "score": 3.781650854379703 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=4.061, 
mean=4.061, max=4.061, sum=8.122 (2)", - "tab": "Efficiency", - "score": 4.0608021259307865 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=3.861, mean=3.861, max=3.861, sum=7.722 (2)", - "tab": "Efficiency", - "score": 3.860906556874764 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=3.938, mean=3.938, max=3.938, sum=7.876 (2)", - "tab": "Efficiency", - "score": 3.9381139499462203 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=4.059, mean=4.059, max=4.059, sum=8.118 (2)", - "tab": "Efficiency", - "score": 4.058962697282843 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=4.024, mean=4.024, max=4.024, sum=8.047 (2)", - "tab": "Efficiency", - "score": 4.023671524392234 - }, - "High School US History - Observed inference time (s)": { - "description": "min=4.606, mean=4.606, max=4.606, sum=9.213 (2)", - "tab": "Efficiency", - "score": 4.606354508914199 - }, - "High School World History - Observed inference time (s)": { - "description": "min=4.336, mean=4.336, max=4.336, sum=8.672 (2)", - "tab": "Efficiency", - "score": 4.335798429537423 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=534.577, mean=534.577, max=534.577, sum=1069.155 (2)", - "tab": "General information", - "score": 534.5774193548388 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=497.921, mean=497.921, max=497.921, sum=995.842 (2)", - "tab": "General information", - "score": 497.92118226600985 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=882.4, mean=882.4, max=882.4, sum=1764.8 (2)", - "tab": "General information", - "score": 882.4 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 
1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2887.576, mean=2887.576, max=2887.576, sum=5775.152 (2)", - "tab": "General information", - "score": 2887.5757575757575 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=412.268, mean=412.268, max=412.268, sum=824.535 (2)", - "tab": "General information", - "score": 412.2676767676768 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=491.104, mean=491.104, max=491.104, sum=982.207 (2)", - "tab": "General information", - "score": 491.10362694300517 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=406.036, mean=406.036, max=406.036, sum=812.072 (2)", - "tab": "General information", - "score": 406.0358974358974 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=519.881, mean=519.881, max=519.881, sum=1039.763 (2)", - "tab": "General information", - "score": 519.8814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=420.513, mean=420.513, max=420.513, sum=841.025 (2)", - "tab": "General information", - "score": 420.5126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=558.841, mean=558.841, max=558.841, sum=1117.682 (2)", - "tab": "General information", - "score": 558.841059602649 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=521.42, mean=521.42, max=521.42, sum=1042.84 (2)", - "tab": "General information", - "score": 521.4201834862386 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=806.963, mean=806.963, max=806.963, sum=1613.926 (2)", - "tab": "General information", - "score": 806.9629629629629 - }, - "High School Statistics - # output tokens": { 
- "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2288.49, mean=2288.49, max=2288.49, sum=4576.98 (2)", - "tab": "General information", - "score": 2288.4901960784314 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1475.932, mean=1475.932, max=1475.932, sum=2951.865 (2)", - "tab": "General information", - "score": 1475.9324894514768 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.908, mean=0.908, max=0.908, sum=1.817 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=3.859, mean=3.859, max=3.859, sum=7.719 (2)", - "tab": "Efficiency", - "score": 3.8594313245183147 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=3.96, mean=3.96, max=3.96, sum=7.92 (2)", - "tab": "Efficiency", - "score": 3.9598546119136664 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=335.955, mean=335.955, max=335.955, sum=671.91 (2)", - "tab": "General information", - "score": 335.95515695067263 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=371.496, mean=371.496, max=371.496, sum=742.992 (2)", - "tab": "General information", - "score": 371.4961832061069 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=3.884, mean=3.884, max=3.884, sum=7.767 (2)", - "tab": "Efficiency", - "score": 3.8836900754408403 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=664.165, mean=664.165, max=664.165, sum=1328.331 (2)", - "tab": "General information", - "score": 664.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.896, - "details": { - "description": "min=0.896, mean=0.896, max=0.896, sum=1.791 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=3.913, mean=3.913, max=3.913, sum=7.826 (2)", - "tab": "Efficiency", - "score": 3.9131746394502605 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - 
"tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=470.276, mean=470.276, max=470.276, sum=940.552 (2)", - "tab": "General information", - "score": 470.2760736196319 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.482 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=4.19, mean=4.19, max=4.19, sum=8.379 (2)", - "tab": "Efficiency", - "score": 4.189559940780912 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=676.518, mean=676.518, max=676.518, sum=1353.036 (2)", - "tab": "General information", - "score": 676.5178571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.883 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=4.01, mean=4.01, max=4.01, sum=8.02 (2)", - "tab": "Efficiency", - "score": 4.009768469819745 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=301.282, mean=301.282, max=301.282, sum=602.563 (2)", - "tab": "General information", - "score": 301.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.944, - "details": { - "description": "min=0.944, mean=0.944, max=0.944, sum=1.889 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=3.988, mean=3.988, max=3.988, sum=7.975 (2)", - "tab": "Efficiency", - "score": 3.9875136002516136 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=448.064, mean=448.064, max=448.064, sum=896.128 (2)", - "tab": "General information", - "score": 448.06410256410254 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=3.913, mean=3.913, max=3.913, sum=7.827 (2)", - "tab": "Efficiency", - "score": 3.913457498550415 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - 
"description": "min=354.88, mean=354.88, max=354.88, sum=709.76 (2)", - "tab": "General information", - "score": 354.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.951, - "details": { - "description": "min=0.951, mean=0.951, max=0.951, sum=1.903 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=3.945, mean=3.945, max=3.945, sum=7.889 (2)", - "tab": "Efficiency", - "score": 3.9445087267216747 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=328.628, mean=328.628, max=328.628, sum=657.257 (2)", - "tab": "General information", - "score": 328.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.651 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=4.057, mean=4.057, max=4.057, sum=8.113 (2)", - "tab": "Efficiency", - "score": 4.0566764987273025 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=4.082, mean=4.082, max=4.082, sum=8.165 (2)", - "tab": "Efficiency", - "score": 4.082338048892314 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - 
"description": "min=511.789, mean=511.789, max=511.789, sum=1023.578 (2)", - "tab": "General information", - "score": 511.78901734104045 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=676.949, mean=676.949, max=676.949, sum=1353.897 (2)", - "tab": "General information", - "score": 676.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "details": { - "description": "min=0.925, mean=0.925, max=0.925, sum=1.85 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=4.106, mean=4.106, max=4.106, sum=8.213 (2)", - "tab": "Efficiency", - "score": 4.106359853464014 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=617.065, mean=617.065, max=617.065, sum=1234.131 (2)", - "tab": "General information", - "score": 617.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.883 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=3.998, mean=3.998, max=3.998, 
sum=7.996 (2)", - "tab": "Efficiency", - "score": 3.998204750779234 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=545.639, mean=545.639, max=545.639, sum=1091.278 (2)", - "tab": "General information", - "score": 545.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827, - "details": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.655 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=3.843, mean=3.843, max=3.843, sum=7.685 (2)", - "tab": "Efficiency", - "score": 3.8426286415620283 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=432.991, mean=432.991, max=432.991, sum=865.982 (2)", - "tab": "General information", - "score": 432.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.771 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=4.346, mean=4.346, max=4.346, sum=8.692 (2)", - "tab": "Efficiency", - "score": 4.3459005385029075 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - 
"tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1243.804, mean=1243.804, max=1243.804, sum=2487.608 (2)", - "tab": "General information", - "score": 1243.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.881 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=3.946, mean=3.946, max=3.946, sum=7.893 (2)", - "tab": "Efficiency", - "score": 3.94632918561869 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=467.274, mean=467.274, max=467.274, sum=934.547 (2)", - "tab": "General information", - "score": 467.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=3.932, mean=3.932, max=3.932, sum=7.865 (2)", - "tab": "Efficiency", - "score": 3.9324641141546777 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=362.651, mean=362.651, max=362.651, sum=725.301 (2)", - "tab": "General information", - "score": 362.65060240963857 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.801 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=4.011, mean=4.011, max=4.011, sum=8.023 (2)", - "tab": "Efficiency", - "score": 4.011422206086722 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=293.018, mean=293.018, max=293.018, sum=586.035 (2)", - "tab": "General information", - "score": 293.0175438596491 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.014, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json b/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json deleted file mode 100644 index dd7543ecb..000000000 --- a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-sonnet-20240229/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - 
"source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3 Sonnet 20240229", - "id": "anthropic/claude-3-sonnet-20240229", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.39, mean=0.759, max=0.959, sum=86.545 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=1.21, mean=1.468, max=8.072, sum=167.341 (114)", - "tab": "Efficiency", - "score": 1.4679056233464987 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=293.018, mean=638.288, max=2887.576, sum=72764.875 (114)", - "tab": "General information", - "score": 638.2883793758953 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - 
"mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=1.248, mean=1.248, max=1.248, sum=2.495 (2)", - "tab": "Efficiency", - "score": 1.2476251411437989 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=370.26, mean=370.26, max=370.26, sum=740.52 (2)", - "tab": "General information", - "score": 370.26 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.422 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=1.225, mean=1.225, max=1.225, sum=2.45 (2)", - "tab": "Efficiency", - "score": 1.224808097768713 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=370.8, mean=370.8, max=370.8, sum=741.6 (2)", - "tab": "General information", - "score": 370.8 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, mean=0.559, max=0.559, sum=1.118 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=1.33, mean=1.33, max=1.33, sum=2.659 (2)", - "tab": "Efficiency", - "score": 1.3297029423713684 - }, - "College Biology - Observed inference time (s)": { - "description": "min=1.293, mean=1.293, max=1.293, sum=2.585 (2)", - "tab": "Efficiency", - "score": 1.2926498336924448 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.494, mean=1.494, max=1.494, sum=2.988 (2)", - "tab": "Efficiency", - "score": 1.493921182155609 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=1.346, mean=1.346, max=1.346, sum=2.693 (2)", - "tab": "Efficiency", - "score": 1.346416823863983 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=1.316, mean=1.316, max=1.316, sum=2.632 (2)", - "tab": "Efficiency", - "score": 1.315991141203511 - }, - "College Physics - Observed inference time (s)": { - "description": "min=1.286, mean=1.286, max=1.286, sum=2.573 (2)", - "tab": "Efficiency", - "score": 1.2863672691233017 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=550.01, mean=550.01, max=550.01, sum=1100.02 (2)", - "tab": "General information", - "score": 550.01 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=490.347, mean=490.347, max=490.347, sum=980.694 (2)", - "tab": "General information", - "score": 490.34722222222223 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.24, mean=838.24, max=838.24, sum=1676.48 (2)", - "tab": "General information", - "score": 838.24 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=604.19, mean=604.19, max=604.19, sum=1208.38 (2)", - "tab": "General information", - "score": 604.19 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=540.63, mean=540.63, max=540.63, sum=1081.26 (2)", - "tab": "General information", - "score": 540.6300578034682 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - 
"description": "min=489.48, mean=489.48, max=489.48, sum=978.961 (2)", - "tab": "General information", - "score": 489.48039215686276 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.228, mean=1.228, max=1.228, sum=2.456 (2)", - "tab": "Efficiency", - "score": 1.2280330896377563 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=398.62, mean=398.62, max=398.62, sum=797.24 (2)", - "tab": "General information", - "score": 398.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.281 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=1.341, mean=1.341, max=1.341, sum=2.682 (2)", - "tab": "Efficiency", - "score": 1.3410238989612513 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=619.596, mean=619.596, max=619.596, sum=1239.193 (2)", - "tab": "General information", - "score": 619.5964912280701 - }, - "Econometrics - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=1.253, mean=1.253, max=1.253, sum=2.505 (2)", - "tab": "Efficiency", - "score": 1.2527140331268312 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=411.61, mean=411.61, max=411.61, sum=823.22 (2)", - "tab": "General information", - "score": 411.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=1.248, mean=1.248, max=1.248, sum=2.496 (2)", - "tab": "Efficiency", - "score": 1.2482430162253204 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=431.426, mean=431.426, max=431.426, sum=862.852 (2)", - "tab": "General information", - "score": 431.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=1.221, mean=1.221, max=1.221, sum=2.442 (2)", - "tab": "Efficiency", - "score": 1.22093992217944 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=359.965, mean=359.965, max=359.965, sum=719.929 (2)", - "tab": "General information", - "score": 359.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.814, - "details": { - "description": "min=0.814, mean=0.814, max=0.814, sum=1.627 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.608, mean=1.608, max=1.608, sum=3.216 (2)", - "tab": "Efficiency", - "score": 1.6081139156047035 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=1.391, mean=1.391, max=1.391, sum=2.781 (2)", - "tab": "Efficiency", - "score": 1.3905252252064697 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.87, mean=1.87, max=1.87, sum=3.741 (2)", - "tab": "Efficiency", - "score": 1.8703640130539139 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=1.297, mean=1.297, max=1.297, sum=2.593 (2)", - "tab": "Efficiency", - "score": 1.2967337436146207 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt 
tokens": { - "description": "min=1123.537, mean=1123.537, max=1123.537, sum=2247.074 (2)", - "tab": "General information", - "score": 1123.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=665.422, mean=665.422, max=665.422, sum=1330.844 (2)", - "tab": "General information", - "score": 665.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1701.16, mean=1701.16, max=1701.16, sum=3402.321 (2)", - "tab": "General information", - "score": 1701.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=603.168, mean=603.168, max=603.168, sum=1206.337 (2)", - "tab": "General information", - "score": 603.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=1.245, mean=1.245, max=1.245, 
sum=2.489 (2)", - "tab": "Efficiency", - "score": 1.2445136380195618 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=455.25, mean=455.25, max=455.25, sum=910.5 (2)", - "tab": "General information", - "score": 455.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.711 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=1.303, mean=1.303, max=1.303, sum=2.607 (2)", - "tab": "Efficiency", - "score": 1.3033642768859863 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=604.493, mean=604.493, max=604.493, sum=1208.987 (2)", - "tab": "General information", - "score": 604.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.304, mean=1.304, max=1.304, sum=2.607 (2)", - "tab": "Efficiency", - "score": 1.3036250400543212 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 
100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=600.02, mean=600.02, max=600.02, sum=1200.04 (2)", - "tab": "General information", - "score": 600.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=1.24, mean=1.24, max=1.24, sum=2.48 (2)", - "tab": "Efficiency", - "score": 1.2399591086045751 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=429.457, mean=429.457, max=429.457, sum=858.913 (2)", - "tab": "General information", - "score": 429.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.774, - "details": { - "description": "min=0.774, mean=0.774, max=0.774, sum=1.549 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=1.256, mean=1.256, max=1.256, sum=2.513 (2)", - "tab": "Efficiency", - "score": 1.2563625832821461 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=323.536, mean=323.536, max=323.536, sum=647.072 (2)", - "tab": "General information", - "score": 323.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703, - "details": { - "description": "min=0.703, mean=0.703, max=0.703, sum=1.407 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=1.276, mean=1.276, max=1.276, sum=2.553 (2)", - "tab": "Efficiency", - "score": 1.276360561107767 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=437.041, mean=437.041, max=437.041, sum=874.083 (2)", - "tab": "General information", - "score": 437.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.635, - "details": { - "description": "min=0.635, mean=0.635, max=0.635, sum=1.27 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=1.301, mean=1.301, max=1.301, sum=2.602 (2)", - "tab": "Efficiency", - "score": 1.3010439260926827 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 
5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=511.066, mean=511.066, max=511.066, sum=1022.132 (2)", - "tab": "General information", - "score": 511.06613756613757 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.369, mean=1.369, max=1.369, sum=2.738 (2)", - "tab": "Efficiency", - "score": 1.3692201716559274 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=646.746, mean=646.746, max=646.746, sum=1293.492 (2)", - "tab": "General information", - "score": 646.7460317460317 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.895, - "details": { - "description": "min=0.895, mean=0.895, max=0.895, sum=1.789 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=1.292, mean=1.292, max=1.292, sum=2.585 (2)", - "tab": "Efficiency", - "score": 1.2923692734010759 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=1.339, mean=1.339, max=1.339, sum=2.678 (2)", - "tab": "Efficiency", - "score": 1.3387701969428603 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=1.51, mean=1.51, max=1.51, sum=3.02 (2)", - "tab": "Efficiency", - "score": 1.5097803854942322 - }, - "High School 
European History - Observed inference time (s)": { - "description": "min=2.456, mean=2.456, max=2.456, sum=4.912 (2)", - "tab": "Efficiency", - "score": 2.4561073808959035 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=1.269, mean=1.269, max=1.269, sum=2.537 (2)", - "tab": "Efficiency", - "score": 1.2686388372170805 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=1.287, mean=1.287, max=1.287, sum=2.574 (2)", - "tab": "Efficiency", - "score": 1.2869715455900201 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.266, mean=1.266, max=1.266, sum=2.533 (2)", - "tab": "Efficiency", - "score": 1.2664643880648492 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=1.313, mean=1.313, max=1.313, sum=2.626 (2)", - "tab": "Efficiency", - "score": 1.3131960109428122 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=1.261, mean=1.261, max=1.261, sum=2.521 (2)", - "tab": "Efficiency", - "score": 1.260614112645638 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=1.302, mean=1.302, max=1.302, sum=2.603 (2)", - "tab": "Efficiency", - "score": 1.3015588419326882 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=1.304, mean=1.304, max=1.304, sum=2.607 (2)", - "tab": "Efficiency", - "score": 1.3036036592011058 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=1.512, mean=1.512, max=1.512, sum=3.025 (2)", - "tab": "Efficiency", - "score": 1.512356918167185 - }, - "High School US History - Observed inference time (s)": { - "description": "min=8.072, mean=8.072, max=8.072, sum=16.145 (2)", - "tab": "Efficiency", - "score": 8.072314507821027 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.746, mean=1.746, max=1.746, sum=3.491 (2)", - "tab": "Efficiency", - "score": 1.74568142066022 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=534.577, mean=534.577, max=534.577, sum=1069.155 (2)", - "tab": "General information", - "score": 534.5774193548388 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=497.921, mean=497.921, max=497.921, sum=995.842 (2)", - "tab": "General information", - "score": 497.92118226600985 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 
(2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=882.4, mean=882.4, max=882.4, sum=1764.8 (2)", - "tab": "General information", - "score": 882.4 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2887.576, mean=2887.576, max=2887.576, sum=5775.152 (2)", - "tab": "General information", - "score": 2887.5757575757575 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=412.268, mean=412.268, max=412.268, sum=824.535 (2)", - "tab": "General information", - "score": 412.2676767676768 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=491.104, mean=491.104, max=491.104, sum=982.207 (2)", - "tab": "General information", - "score": 491.10362694300517 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, 
sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=406.036, mean=406.036, max=406.036, sum=812.072 (2)", - "tab": "General information", - "score": 406.0358974358974 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=519.881, mean=519.881, max=519.881, sum=1039.763 (2)", - "tab": "General information", - "score": 519.8814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=420.513, mean=420.513, max=420.513, sum=841.025 (2)", - "tab": "General information", - "score": 420.5126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=558.841, mean=558.841, max=558.841, sum=1117.682 (2)", - "tab": "General information", - "score": 558.841059602649 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=521.42, mean=521.42, max=521.42, sum=1042.84 (2)", - "tab": "General information", - "score": 
521.4201834862386 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=806.963, mean=806.963, max=806.963, sum=1613.926 (2)", - "tab": "General information", - "score": 806.9629629629629 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2288.49, mean=2288.49, max=2288.49, sum=4576.98 (2)", - "tab": "General information", - "score": 2288.4901960784314 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1475.932, mean=1475.932, max=1475.932, sum=2951.865 (2)", - "tab": "General information", - "score": 1475.9324894514768 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=1.21, mean=1.21, max=1.21, sum=2.42 (2)", - "tab": "Efficiency", - "score": 1.2099821963117796 - }, - "Human Sexuality - Observed 
inference time (s)": { - "description": "min=1.255, mean=1.255, max=1.255, sum=2.509 (2)", - "tab": "Efficiency", - "score": 1.2545511012768928 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=335.955, mean=335.955, max=335.955, sum=671.91 (2)", - "tab": "General information", - "score": 335.95515695067263 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=371.496, mean=371.496, max=371.496, sum=742.992 (2)", - "tab": "General information", - "score": 371.4961832061069 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.909, - "details": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.818 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=1.375, mean=1.375, max=1.375, sum=2.751 (2)", - "tab": "Efficiency", - "score": 1.3753716256007675 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=664.165, mean=664.165, max=664.165, sum=1328.331 (2)", - "tab": "General information", - "score": 664.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - 
}, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=1.237, mean=1.237, max=1.237, sum=2.474 (2)", - "tab": "Efficiency", - "score": 1.23694542580587 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=470.276, mean=470.276, max=470.276, sum=940.552 (2)", - "tab": "General information", - "score": 470.2760736196319 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.362, mean=1.362, max=1.362, sum=2.725 (2)", - "tab": "Efficiency", - "score": 1.3623365994010652 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=676.518, mean=676.518, max=676.518, sum=1353.036 (2)", - "tab": "General information", - "score": 676.5178571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=1.845 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=1.265, mean=1.265, max=1.265, sum=2.529 (2)", - "tab": "Efficiency", - "score": 1.2646709923605317 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=301.282, mean=301.282, max=301.282, sum=602.563 (2)", - "tab": "General information", - "score": 301.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.701 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=1.251, mean=1.251, max=1.251, sum=2.503 (2)", - "tab": "Efficiency", - "score": 1.2514099310605953 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=448.064, mean=448.064, max=448.064, sum=896.128 (2)", - "tab": "General information", - "score": 448.06410256410254 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=1.22, mean=1.22, max=1.22, sum=2.441 (2)", - "tab": "Efficiency", - "score": 1.2204306960105895 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=354.88, mean=354.88, max=354.88, sum=709.76 (2)", - "tab": "General information", - "score": 354.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.872, mean=0.872, max=0.872, sum=1.745 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=1.233, mean=1.233, max=1.233, sum=2.467 (2)", - "tab": "Efficiency", - "score": 1.2334287364516374 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=328.628, mean=328.628, max=328.628, sum=657.257 (2)", - "tab": "General information", - "score": 328.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.626, - "details": { - "description": "min=0.626, mean=0.626, max=0.626, 
sum=1.251 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=1.287, mean=1.287, max=1.287, sum=2.575 (2)", - "tab": "Efficiency", - "score": 1.2873861700124134 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=1.361, mean=1.361, max=1.361, sum=2.722 (2)", - "tab": "Efficiency", - "score": 1.361004557156696 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=511.789, mean=511.789, max=511.789, sum=1023.578 (2)", - "tab": "General information", - "score": 511.78901734104045 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=676.949, mean=676.949, max=676.949, sum=1353.897 (2)", - "tab": "General information", - "score": 676.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.641 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=1.319, mean=1.319, max=1.319, sum=2.638 (2)", - "tab": "Efficiency", - "score": 1.3189228679619582 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=617.065, mean=617.065, max=617.065, sum=1234.131 (2)", - "tab": "General information", - "score": 617.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, 
- "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=1.305, mean=1.305, max=1.305, sum=2.611 (2)", - "tab": "Efficiency", - "score": 1.305255777306027 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=545.639, mean=545.639, max=545.639, sum=1091.278 (2)", - "tab": "General information", - "score": 545.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=1.564 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=1.25, mean=1.25, max=1.25, sum=2.5 (2)", - "tab": "Efficiency", - "score": 1.2497538588263772 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=432.991, mean=432.991, max=432.991, sum=865.982 (2)", - "tab": "General information", - "score": 432.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - 
"evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=1.662, mean=1.662, max=1.662, sum=3.325 (2)", - "tab": "Efficiency", - "score": 1.6624354012158453 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1243.804, mean=1243.804, max=1243.804, sum=2487.608 (2)", - "tab": "General information", - "score": 1243.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.811 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=1.268, mean=1.268, max=1.268, sum=2.535 (2)", - "tab": "Efficiency", - "score": 1.267556501265189 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=467.274, mean=467.274, max=467.274, sum=934.547 (2)", - "tab": "General information", - "score": 467.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=1.321, mean=1.321, max=1.321, sum=2.642 (2)", - "tab": "Efficiency", - "score": 1.3211244660687733 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=362.651, mean=362.651, max=362.651, sum=725.301 (2)", - "tab": "General information", - "score": 362.65060240963857 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=1.271, mean=1.271, max=1.271, sum=2.542 (2)", - "tab": "Efficiency", - "score": 1.2710035530447263 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=293.018, mean=293.018, max=293.018, sum=586.035 (2)", - "tab": "General information", - "score": 293.0175438596491 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model 
outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.082, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -}
\ No newline at end of file
diff --git a/data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json b/data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json
deleted file mode 100644
index c9e9779b1..000000000
--- a/data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-instant-1.2/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude Instant 1.2", - "id": "anthropic/claude-instant-1.2", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688, - "details": { - "description": "min=0.333, mean=0.688, max=0.902, sum=78.425 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.59, mean=0.932, max=1.62, sum=106.285 (114)", - "tab": "Efficiency", - "score": 0.9323255288146379 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=358.018, mean=703.288, max=2952.576, sum=80174.875 (114)", - "tab": "General information", - "score": 703.2883793758955 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0.994, mean=1.0, max=1, sum=113.988 (114)", - "tab": "General information", - "score": 0.9998985904066524 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", -
"high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37, - "details": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.74 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.181 (2)", - "tab": "Efficiency", - "score": 0.5904157018661499 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # 
prompt tokens": { - "description": "min=435.26, mean=435.26, max=435.26, sum=870.52 (2)", - "tab": "General information", - "score": 435.26 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "description": "min=0.637, mean=0.637, max=0.637, sum=1.274 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.827 (2)", - "tab": "Efficiency", - "score": 0.9135703210477476 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=435.8, mean=435.8, max=435.8, sum=871.6 (2)", - "tab": "General information", - "score": 435.8 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.636, mean=0.636, max=0.636, sum=1.272 (2)", - "tab": "Efficiency", - "score": 0.6360281848907471 - }, - "College Biology - Observed inference time (s)": { - "description": "min=1.016, mean=1.016, max=1.016, sum=2.033 (2)", - "tab": "Efficiency", - "score": 1.0163518455293443 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.153, mean=1.153, max=1.153, sum=2.306 (2)", - "tab": "Efficiency", - "score": 1.1530575346946716 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=1.157, mean=1.157, max=1.157, sum=2.314 (2)", - "tab": "Efficiency", - "score": 1.1569927215576172 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=1.086, mean=1.086, max=1.086, sum=2.173 (2)", - "tab": "Efficiency", - "score": 
1.0863008636959715 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.938, mean=0.938, max=0.938, sum=1.875 (2)", - "tab": "Efficiency", - "score": 0.9376059443342919 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=615.01, mean=615.01, max=615.01, sum=1230.02 (2)", - "tab": "General information", - "score": 615.01 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=555.347, mean=555.347, max=555.347, sum=1110.694 (2)", - "tab": "General information", - "score": 555.3472222222222 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=903.24, mean=903.24, max=903.24, sum=1806.48 (2)", - "tab": "General information", - "score": 903.24 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=669.19, mean=669.19, max=669.19, sum=1338.38 (2)", - "tab": "General information", - "score": 669.19 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=605.63, mean=605.63, max=605.63, sum=1211.26 (2)", - "tab": "General information", - "score": 605.6300578034682 - }, - "College Medicine - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.988 (2)", - "tab": "General information", - "score": 0.9942196531791907 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=554.48, mean=554.48, max=554.48, sum=1108.961 (2)", - "tab": "General information", - "score": 554.4803921568628 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.597, mean=0.597, max=0.597, sum=1.194 (2)", - "tab": "Efficiency", - "score": 0.596819703578949 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=463.62, mean=463.62, max=463.62, sum=927.24 (2)", - "tab": "General information", - "score": 463.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.228 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.633, mean=0.633, max=0.633, sum=1.267 (2)", - "tab": "Efficiency", - "score": 0.6333246440218206 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=684.596, mean=684.596, max=684.596, sum=1369.193 (2)", - "tab": "General information", - "score": 684.5964912280701 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.975, mean=0.975, max=0.975, sum=1.949 (2)", - "tab": "Efficiency", - "score": 0.9746571969985962 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=476.61, mean=476.61, max=476.61, sum=953.22 (2)", - "tab": "General information", - "score": 476.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.811, 
mean=0.811, max=0.811, sum=1.621 (2)", - "tab": "Efficiency", - "score": 0.8107206269546792 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=496.426, mean=496.426, max=496.426, sum=992.852 (2)", - "tab": "General information", - "score": 496.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=1.511 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.832, mean=0.832, max=0.832, sum=1.664 (2)", - "tab": "Efficiency", - "score": 0.8319868075502647 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=424.965, mean=424.965, max=424.965, sum=849.929 (2)", - "tab": "General information", - "score": 424.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=1.448 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.073, mean=1.073, max=1.073, sum=2.146 (2)", - "tab": "Efficiency", - "score": 1.072824116138851 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.895, 
mean=0.895, max=0.895, sum=1.79 (2)", - "tab": "Efficiency", - "score": 0.8950984232814599 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.058, mean=1.058, max=1.058, sum=2.117 (2)", - "tab": "Efficiency", - "score": 1.0584386131754133 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.718 (2)", - "tab": "Efficiency", - "score": 0.8591087651408575 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1188.537, mean=1188.537, max=1188.537, sum=2377.074 (2)", - "tab": "General information", - "score": 1188.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=730.422, mean=730.422, max=730.422, sum=1460.844 (2)", - "tab": "General information", - "score": 730.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1766.16, mean=1766.16, max=1766.16, sum=3532.321 (2)", - "tab": "General information", - "score": 1766.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=668.168, mean=668.168, max=668.168, sum=1336.337 (2)", - "tab": "General information", - "score": 668.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.613, mean=0.613, max=0.613, sum=1.226 (2)", - "tab": "Efficiency", - "score": 0.6128408885002137 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=520.25, mean=520.25, max=520.25, sum=1040.5 (2)", - "tab": "General information", - "score": 520.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.487 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=1.124, mean=1.124, max=1.124, sum=2.248 (2)", - "tab": "Efficiency", - "score": 1.123885358634748 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=669.493, mean=669.493, max=669.493, sum=1338.987 (2)", - "tab": "General information", - "score": 669.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.102, mean=1.102, max=1.102, sum=2.204 (2)", - "tab": "Efficiency", - "score": 1.101954047679901 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=665.02, mean=665.02, max=665.02, sum=1330.04 (2)", - "tab": "General information", - "score": 665.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.419 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.799 (2)", - "tab": "Efficiency", - "score": 0.8994299870616985 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=494.457, mean=494.457, max=494.457, sum=988.913 (2)", - "tab": "General information", - "score": 494.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.613, - "details": { - "description": "min=0.613, mean=0.613, max=0.613, sum=1.226 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.773, mean=0.773, max=0.773, sum=1.546 (2)", - "tab": "Efficiency", - "score": 0.7728059119366585 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=388.536, mean=388.536, max=388.536, sum=777.072 (2)", - "tab": "General information", - "score": 388.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.641, - "details": { - "description": "min=0.641, mean=0.641, max=0.641, sum=1.283 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.865 (2)", - "tab": "Efficiency", - "score": 0.9323823583537134 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=502.041, mean=502.041, max=502.041, sum=1004.083 (2)", - "tab": "General information", - "score": 502.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45, - "details": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.899 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.945, mean=0.945, max=0.945, sum=1.891 (2)", - "tab": "Efficiency", - "score": 0.945274135423085 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=576.066, mean=576.066, max=576.066, sum=1152.132 (2)", - "tab": "General information", - "score": 576.0661375661375 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.151, mean=1.151, max=1.151, sum=2.302 (2)", - "tab": "Efficiency", - "score": 1.1508805732878427 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=711.746, mean=711.746, max=711.746, sum=1423.492 (2)", - "tab": "General information", - "score": 711.7460317460317 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.755 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.998, mean=0.998, max=0.998, sum=1.996 (2)", - "tab": "Efficiency", - "score": 0.9978926274084275 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Efficiency", - "score": 0.9337695701956161 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=1.046, mean=1.046, max=1.046, sum=2.091 (2)", - "tab": "Efficiency", - "score": 1.0455269980430604 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.62, mean=1.62, max=1.62, sum=3.241 (2)", - "tab": "Efficiency", - "score": 1.6203449783903179 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Efficiency", - "score": 0.876823568584943 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=1.037, mean=1.037, max=1.037, sum=2.074 (2)", - "tab": "Efficiency", - "score": 1.0370552873364385 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.925, mean=0.925, max=0.925, sum=1.849 (2)", - "tab": "Efficiency", - "score": 0.9246660091938117 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=1.014, mean=1.014, max=1.014, sum=2.027 (2)", - "tab": "Efficiency", - "score": 1.013659605273494 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=1.163, mean=1.163, max=1.163, sum=2.325 (2)", - "tab": "Efficiency", - "score": 1.1627413104562199 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.963, mean=0.963, max=0.963, sum=1.925 (2)", - "tab": "Efficiency", - "score": 0.9627095689836717 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.947, mean=0.947, max=0.947, sum=1.894 (2)", - "tab": "Efficiency", - "score": 0.9471190351958668 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.929, mean=0.929, max=0.929, sum=1.857 (2)", - "tab": "Efficiency", - "score": 0.9286887921668865 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.383, mean=1.383, max=1.383, sum=2.766 (2)", - "tab": "Efficiency", - "score": 1.3831783030547349 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.246, mean=1.246, max=1.246, sum=2.492 (2)", - "tab": "Efficiency", - "score": 1.2459266769232127 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=599.577, mean=599.577, max=599.577, sum=1199.155 (2)", - "tab": "General information", - "score": 599.5774193548388 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=562.921, mean=562.921, max=562.921, sum=1125.842 (2)", - "tab": "General information", - "score": 562.9211822660099 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=947.4, mean=947.4, max=947.4, sum=1894.8 (2)", - "tab": "General information", - "score": 947.4 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2952.576, mean=2952.576, max=2952.576, sum=5905.152 (2)", - "tab": "General information", - "score": 2952.5757575757575 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=477.268, mean=477.268, max=477.268, sum=954.535 (2)", - "tab": "General information", - "score": 477.2676767676768 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": 
"min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=556.104, mean=556.104, max=556.104, sum=1112.207 (2)", - "tab": "General information", - "score": 556.1036269430052 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=471.036, mean=471.036, max=471.036, sum=942.072 (2)", - "tab": "General information", - "score": 471.0358974358974 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=584.881, mean=584.881, max=584.881, sum=1169.763 (2)", - "tab": "General information", - "score": 584.8814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=485.513, mean=485.513, max=485.513, sum=971.025 (2)", - "tab": "General information", - "score": 485.5126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=623.841, mean=623.841, max=623.841, sum=1247.682 (2)", - "tab": "General information", - "score": 623.841059602649 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=586.42, mean=586.42, max=586.42, sum=1172.84 (2)", - "tab": "General information", - "score": 586.4201834862386 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=871.963, mean=871.963, max=871.963, sum=1743.926 (2)", - "tab": "General information", - "score": 871.9629629629629 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2353.49, mean=2353.49, max=2353.49, sum=4706.98 (2)", - "tab": "General information", - "score": 2353.4901960784314 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1540.932, mean=1540.932, max=1540.932, sum=3081.865 (2)", - "tab": "General information", - "score": 1540.9324894514768 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.794, - "details": { - "description": "min=0.794, mean=0.794, max=0.794, sum=1.588 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.782, mean=0.782, max=0.782, sum=1.563 (2)", - "tab": "Efficiency", - "score": 0.7815119557316528 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.526 (2)", - "tab": "Efficiency", - "score": 0.7630931584889652 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=400.955, mean=400.955, max=400.955, sum=801.91 (2)", - "tab": "General information", - "score": 400.95515695067263 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=436.496, mean=436.496, max=436.496, sum=872.992 (2)", - "tab": "General information", - "score": 436.4961832061069 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.888, mean=0.888, max=0.888, sum=1.775 (2)", - 
"tab": "Efficiency", - "score": 0.8875030958948057 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=729.165, mean=729.165, max=729.165, sum=1458.331 (2)", - "tab": "General information", - "score": 729.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.939, mean=0.939, max=0.939, sum=1.878 (2)", - "tab": "Efficiency", - "score": 0.9389484660025754 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=535.276, mean=535.276, max=535.276, sum=1070.552 (2)", - "tab": "General information", - "score": 535.2760736196319 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.339 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.887, mean=0.887, max=0.887, sum=1.774 (2)", - "tab": "Efficiency", - "score": 0.8872403161866325 - }, - "Machine Learning - # eval": { - "description": "min=112, 
mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=741.518, mean=741.518, max=741.518, sum=1483.036 (2)", - "tab": "General information", - "score": 741.5178571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.931, mean=0.931, max=0.931, sum=1.862 (2)", - "tab": "Efficiency", - "score": 0.9309975244466541 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=366.282, mean=366.282, max=366.282, sum=732.563 (2)", - "tab": "General information", - "score": 366.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.769 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.961, mean=0.961, max=0.961, sum=1.923 (2)", - "tab": "Efficiency", - "score": 0.9613573286268446 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 
- }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.0641025641025 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.621 (2)", - "tab": "Efficiency", - "score": 0.8103219223022461 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=419.88, mean=419.88, max=419.88, sum=839.76 (2)", - "tab": "General information", - "score": 419.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.655 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.652 (2)", - "tab": "Efficiency", - "score": 0.8259343528503964 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": 
"min=393.628, mean=393.628, max=393.628, sum=787.257 (2)", - "tab": "General information", - "score": 393.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488, - "details": { - "description": "min=0.488, mean=0.488, max=0.488, sum=0.977 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.938, mean=0.938, max=0.938, sum=1.876 (2)", - "tab": "Efficiency", - "score": 0.937887375065357 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.77 (2)", - "tab": "Efficiency", - "score": 0.8848049091893201 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=576.789, mean=576.789, max=576.789, sum=1153.578 (2)", - "tab": "General information", - "score": 576.7890173410404 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=741.949, mean=741.949, max=741.949, sum=1483.897 (2)", - "tab": "General information", - "score": 741.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - 
"details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=1.471 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.881, mean=0.881, max=0.881, sum=1.761 (2)", - "tab": "Efficiency", - "score": 0.8806839573617075 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=682.065, mean=682.065, max=682.065, sum=1364.131 (2)", - "tab": "General information", - "score": 682.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.762, mean=0.762, max=0.762, sum=1.525 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.819, mean=0.819, max=0.819, sum=1.638 (2)", - "tab": "Efficiency", - "score": 0.8192079758938448 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=610.639, mean=610.639, max=610.639, sum=1221.278 (2)", - "tab": "General information", - "score": 610.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627, - "details": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.255 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.471 (2)", - "tab": 
"Efficiency", - "score": 0.735536317391829 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=497.991, mean=497.991, max=497.991, sum=995.982 (2)", - "tab": "General information", - "score": 497.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.784, - "details": { - "description": "min=0.784, mean=0.784, max=0.784, sum=1.567 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.898 (2)", - "tab": "Efficiency", - "score": 0.9487942345288335 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1308.804, mean=1308.804, max=1308.804, sum=2617.608 (2)", - "tab": "General information", - "score": 1308.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - "details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.682 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.687 (2)", - "tab": "Efficiency", - "score": 0.8433953909138542 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": 
"General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=532.274, mean=532.274, max=532.274, sum=1064.547 (2)", - "tab": "General information", - "score": 532.273631840796 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.096 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.425 (2)", - "tab": "Efficiency", - "score": 0.7126703147428581 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=427.651, mean=427.651, max=427.651, sum=855.301 (2)", - "tab": "General information", - "score": 427.65060240963857 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.784, - "details": { - "description": "min=0.784, mean=0.784, max=0.784, sum=1.567 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Efficiency", - "score": 0.7498089402739765 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=358.018, mean=358.018, max=358.018, sum=716.035 (2)", - "tab": "General information", - "score": 358.0175438596491 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.186, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json b/data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json deleted file mode 100644 index 6bebd236d..000000000 --- a/data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/cohere_command-r-plus/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Command R Plus", - "id": "cohere/command-r-plus", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.21, mean=0.694, max=0.927, sum=79.063 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.199, mean=0.305, max=0.74, sum=34.817 (114)", - "tab": "Efficiency", - "score": 0.30541327600292584 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=277.047, mean=648.571, max=2823.042, sum=73937.062 (114)", - "tab": "General information", - "score": 648.5707227335503 - }, - "MMLU All Subjects - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21, - "details": { - "description": "min=0.21, mean=0.21, max=0.21, sum=0.42 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.521 (2)", - "tab": "Efficiency", - "score": 0.2603452730178833 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.66, mean=397.66, max=397.66, sum=795.32 (2)", - "tab": "General information", - "score": 397.66 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { - "description": "min=0.644, mean=0.644, max=0.644, sum=1.289 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Efficiency", - "score": 0.289820040596856 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=360.096, mean=360.096, max=360.096, sum=720.193 (2)", - "tab": "General information", - "score": 360.0962962962963 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.039 (2)", - "tab": "Accuracy", - 
"College Chemistry - Observed inference time (s)": { - "description": "min=0.419, mean=0.419, max=0.419, sum=0.839 (2)", - "tab": "Efficiency", - "score": 0.41949598789215087 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Efficiency", - "score": 0.3188936991824044 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.262, mean=0.262, max=0.262, sum=0.525 (2)", - "tab": "Efficiency", - "score": 0.262396776676178 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Efficiency", - "score": 0.45980838298797605 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.656 (2)", - "tab": "Efficiency", - "score": 0.32775250611277673 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.766 (2)", - "tab": "Efficiency", - "score": 0.38314491861006794 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=586.57, mean=586.57, max=586.57, sum=1173.14 (2)", - "tab": "General information", - "score": 586.57 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=496.632, mean=496.632, max=496.632, sum=993.264 (2)", - "tab": "General information", - "score": 496.63194444444446 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=869.29, mean=869.29, max=869.29, sum=1738.58 (2)", - "tab": "General information", - "score": 869.29 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { 
- "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=645.25, mean=645.25, max=645.25, sum=1290.5 (2)", - "tab": "General information", - "score": 645.25 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=535.092, mean=535.092, max=535.092, sum=1070.185 (2)", - "tab": "General information", - "score": 535.0924855491329 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=530.382, mean=530.382, max=530.382, sum=1060.765 (2)", - "tab": "General information", - "score": 530.3823529411765 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.48 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.481, mean=0.481, max=0.481, sum=0.961 (2)", - "tab": "Efficiency", - "score": 0.4807459425926208 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=399.41, mean=399.41, 
max=399.41, sum=798.82 (2)", - "tab": "General information", - "score": 399.41 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.123 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.33940661162660835 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=661.579, mean=661.579, max=661.579, sum=1323.158 (2)", - "tab": "General information", - "score": 661.578947368421 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.593 (2)", - "tab": "Efficiency", - "score": 0.2966678738594055 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=469.58, mean=469.58, max=469.58, sum=939.16 (2)", - "tab": "General information", - "score": 469.58 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 
0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.611 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.577 (2)", - "tab": "Efficiency", - "score": 0.2883643927397551 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=417.944, mean=417.944, max=417.944, sum=835.889 (2)", - "tab": "General information", - "score": 417.94444444444446 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.695, - "details": { - "description": "min=0.695, mean=0.695, max=0.695, sum=1.389 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.3079479507311364 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=355.508, mean=355.508, max=355.508, sum=711.016 (2)", - "tab": "General information", - "score": 355.508038585209 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - 
"evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=1.471 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.903 (2)", - "tab": "Efficiency", - "score": 0.45139760129592 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.584 (2)", - "tab": "Efficiency", - "score": 0.2920728659798913 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.811 (2)", - "tab": "Efficiency", - "score": 0.4056029599524228 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.609 (2)", - "tab": "Efficiency", - "score": 0.30459034287072473 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1143.129, mean=1143.129, max=1143.129, sum=2286.257 (2)", - "tab": "General information", - "score": 1143.1286764705883 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=767.429, mean=767.429, max=767.429, sum=1534.858 (2)", - "tab": "General information", - "score": 767.4290780141844 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1697.692, mean=1697.692, max=1697.692, sum=3395.385 (2)", - "tab": "General information", - "score": 1697.6923076923076 - }, - "Professional Law - # output 
tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=609.167, mean=609.167, max=609.167, sum=1218.333 (2)", - "tab": "General information", - "score": 609.1666666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.29705020904541013 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=452.23, mean=452.23, max=452.23, sum=904.46 (2)", - "tab": "General information", - "score": 452.23 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=1.566 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.984 (2)", - "tab": "Efficiency", - "score": 
0.49223921016642924 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=615.276, mean=615.276, max=615.276, sum=1230.553 (2)", - "tab": "General information", - "score": 615.2763157894736 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.593 (2)", - "tab": "Efficiency", - "score": 0.2964653515815735 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=582.46, mean=582.46, max=582.46, sum=1164.92 (2)", - "tab": "General information", - "score": 582.46 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.487 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.675 (2)", - "tab": "Efficiency", - "score": 0.33743472009334924 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge 
- # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=433.181, mean=433.181, max=433.181, sum=866.362 (2)", - "tab": "General information", - "score": 433.1811320754717 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591, - "details": { - "description": "min=0.591, mean=0.591, max=0.591, sum=1.183 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.199, mean=0.199, max=0.199, sum=0.398 (2)", - "tab": "Efficiency", - "score": 0.19917301928743403 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=322.511, mean=322.511, max=322.511, sum=645.021 (2)", - "tab": "General information", - "score": 322.51063829787233 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.421 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.476 (2)", - "tab": "Efficiency", - "score": 0.2378004501605856 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=494.648, mean=494.648, max=494.648, sum=989.297 (2)", - "tab": "General information", - "score": 494.64827586206894 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474, - "details": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.947 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.512 (2)", - "tab": "Efficiency", - "score": 0.2562026693707421 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=609.537, mean=609.537, max=609.537, sum=1219.074 (2)", - "tab": "General information", - "score": 609.5370370370371 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.484, - "details": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.968 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.57 (2)", - "tab": "Efficiency", - "score": 0.2847565715275114 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=630.992, mean=630.992, max=630.992, sum=1261.984 (2)", - "tab": "General information", - "score": 630.9920634920635 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827, - "details": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.654 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.59 (2)", - "tab": "Efficiency", - "score": 0.29477174051346317 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.225, mean=0.225, max=0.225, sum=0.451 (2)", - "tab": "Efficiency", - "score": 0.22539391071338372 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.539 (2)", - "tab": "Efficiency", - "score": 0.26950850486755373 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.48 (2)", - "tab": "Efficiency", - "score": 0.7398316253315319 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.41 (2)", - "tab": "Efficiency", - "score": 0.20521813570851027 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.243, mean=0.243, max=0.243, sum=0.487 (2)", - "tab": "Efficiency", - "score": 0.24341652430400948 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.221, mean=0.221, max=0.221, sum=0.442 (2)", - "tab": "Efficiency", - "score": 0.2207918637838119 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.592 (2)", - "tab": "Efficiency", - "score": 0.29578982988993324 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.342765681883868 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.558 (2)", - "tab": "Efficiency", - "score": 0.2788162073552214 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.499 (2)", - "tab": "Efficiency", - "score": 0.2494196336203759 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.572 (2)", - "tab": "Efficiency", - "score": 0.28620046377182007 - }, - "High School US History - Observed inference time 
(s)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.934 (2)", - "tab": "Efficiency", - "score": 0.4672480844983868 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.748 (2)", - "tab": "Efficiency", - "score": 0.3738658830586365 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=527.213, mean=527.213, max=527.213, sum=1054.426 (2)", - "tab": "General information", - "score": 527.2129032258065 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=530.635, mean=530.635, max=530.635, sum=1061.271 (2)", - "tab": "General information", - "score": 530.6354679802955 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=932.02, mean=932.02, max=932.02, sum=1864.04 (2)", - "tab": "General information", - "score": 932.02 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2823.042, mean=2823.042, max=2823.042, sum=5646.085 (2)", - "tab": "General information", - "score": 2823.042424242424 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": 
"min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=407.818, mean=407.818, max=407.818, sum=815.636 (2)", - "tab": "General information", - "score": 407.8181818181818 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=489.155, mean=489.155, max=489.155, sum=978.311 (2)", - "tab": "General information", - "score": 489.1554404145078 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=407.654, mean=407.654, max=407.654, sum=815.308 (2)", - "tab": "General information", - "score": 407.65384615384613 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=589.774, mean=589.774, max=589.774, sum=1179.548 (2)", - "tab": "General information", - "score": 589.7740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=428.403, mean=428.403, max=428.403, sum=856.807 (2)", - "tab": "General information", - "score": 428.4033613445378 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=604.272, mean=604.272, max=604.272, sum=1208.543 (2)", - "tab": "General information", - "score": 604.2715231788079 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=516.004, mean=516.004, max=516.004, sum=1032.007 (2)", - "tab": "General information", - "score": 516.0036697247706 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=871.264, mean=871.264, max=871.264, sum=1742.528 (2)", - "tab": "General information", - "score": 871.2638888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2240.358, mean=2240.358, max=2240.358, sum=4480.716 (2)", - "tab": "General information", - "score": 2240.357843137255 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1443.321, mean=1443.321, max=1443.321, sum=2886.641 (2)", - "tab": "General information", - "score": 1443.3206751054852 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.573 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.199, mean=0.199, max=0.199, sum=0.399 (2)", - "tab": "Efficiency", - "score": 0.19925055482462384 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.454 (2)", - "tab": "Efficiency", - "score": 0.22696546925843217 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=336.09, mean=336.09, max=336.09, sum=672.179 (2)", - "tab": "General information", - "score": 336.0896860986547 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=367.16, mean=367.16, max=367.16, sum=734.321 (2)", - "tab": "General information", - "score": 367.1603053435114 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.669 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.247, mean=0.247, max=0.247, sum=0.494 (2)", - "tab": "Efficiency", - "score": 0.2467749296141065 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=653.612, mean=653.612, max=653.612, sum=1307.223 (2)", - "tab": "General information", - "score": 653.6115702479339 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.583 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Efficiency", - "score": 0.24988567463459413 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=463.773, mean=463.773, max=463.773, sum=927.546 (2)", - "tab": "General information", - "score": 463.7730061349693 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - 
"evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518, - "details": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.036 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.265, mean=0.265, max=0.265, sum=0.529 (2)", - "tab": "Efficiency", - "score": 0.2645062953233719 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=716.438, mean=716.438, max=716.438, sum=1432.875 (2)", - "tab": "General information", - "score": 716.4375 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.409 (2)", - "tab": "Efficiency", - "score": 0.20434052735856437 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=294.456, mean=294.456, max=294.456, sum=588.913 (2)", - "tab": "General information", - "score": 294.45631067961165 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.927, mean=0.927, max=0.927, sum=1.855 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.228, mean=0.228, max=0.228, sum=0.456 (2)", - "tab": "Efficiency", - "score": 0.22806417840158838 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=446.855, mean=446.855, max=446.855, sum=893.709 (2)", - "tab": "General information", - "score": 446.85470085470087 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Efficiency", - "score": 0.3072425937652588 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=357.02, mean=357.02, max=357.02, sum=714.04 (2)", - "tab": "General information", - "score": 357.02 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.844, - "details": { - "description": "min=0.844, mean=0.844, max=0.844, sum=1.688 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.208, mean=0.208, max=0.208, sum=0.417 (2)", - "tab": "Efficiency", - "score": 0.20840222990832566 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=325.76, mean=325.76, max=325.76, sum=651.52 (2)", - "tab": "General information", - "score": 325.75989782886336 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.585, - "details": { - "description": "min=0.585, mean=0.585, max=0.585, sum=1.171 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.457 (2)", - "tab": "Efficiency", - "score": 0.2285733340103502 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.564 (2)", - "tab": "Efficiency", - "score": 0.2819661257653263 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=506.78, mean=506.78, max=506.78, sum=1013.561 (2)", - "tab": "General information", - "score": 506.78034682080926 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=699.344, mean=699.344, max=699.344, 
sum=1398.688 (2)", - "tab": "General information", - "score": 699.3441340782123 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=1.484 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.563 (2)", - "tab": "Efficiency", - "score": 0.2817091388640061 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=618.402, mean=618.402, max=618.402, sum=1236.804 (2)", - "tab": "General information", - "score": 618.4019607843137 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.821, - "details": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.642 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.574 (2)", - "tab": "Efficiency", - "score": 0.2871434423658583 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=549.235, mean=549.235, max=549.235, sum=1098.469 (2)", - "tab": "General information", - "score": 549.2345679012345 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.278, mean=0.278, max=0.278, sum=0.557 (2)", - "tab": "Efficiency", - "score": 0.27829633842815055 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=434.682, mean=434.682, max=434.682, sum=869.364 (2)", - "tab": "General information", - "score": 434.6818181818182 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.3448335861673161 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1207.494, mean=1207.494, max=1207.494, sum=2414.988 (2)", - "tab": "General information", - "score": 1207.4938775510204 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.751 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.591 (2)", - "tab": "Efficiency", - "score": 0.2956119153037 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=467.343, mean=467.343, max=467.343, sum=934.687 (2)", - "tab": "General information", - "score": 467.34328358208955 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.2874818997210767 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=352.861, mean=352.861, max=352.861, sum=705.723 (2)", - "tab": "General information", - "score": 352.8614457831325 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.41 (2)", - "tab": "Efficiency", - "score": 0.20489408119380126 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=277.047, mean=277.047, max=277.047, sum=554.094 (2)", - "tab": "General information", - "score": 277.046783625731 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json b/data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json deleted file mode 100644 index e82639d82..000000000 --- a/data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/cohere_command-r/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Command R", - "id": "cohere/command-r", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.326, 
mean=0.652, max=0.891, sum=74.329 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.145, mean=0.176, max=0.289, sum=20.061 (114)", - "tab": "Efficiency", - "score": 0.17597788408479575 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=277.047, mean=648.571, max=2823.042, sum=73937.062 (114)", - "tab": "General information", - "score": 648.5707227335503 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - 
"mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.162, mean=0.162, max=0.162, sum=0.324 (2)", - "tab": "Efficiency", - "score": 0.1620460057258606 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.66, mean=397.66, max=397.66, sum=795.32 (2)", - "tab": "General information", - "score": 397.66 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615, - "details": { - "description": "min=0.615, mean=0.615, max=0.615, sum=1.23 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.157, mean=0.157, max=0.157, sum=0.314 (2)", - "tab": "Efficiency", - "score": 0.15700986297042283 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=360.096, mean=360.096, max=360.096, sum=720.193 (2)", - "tab": "General 
information", - "score": 360.0962962962963 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382, - "details": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.765 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.185, mean=0.185, max=0.185, sum=0.37 (2)", - "tab": "Efficiency", - "score": 0.18501442193984985 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.163, mean=0.163, max=0.163, sum=0.325 (2)", - "tab": "Efficiency", - "score": 0.1627496729294459 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.182, mean=0.182, max=0.182, sum=0.363 (2)", - "tab": "Efficiency", - "score": 0.18159597158432006 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.173, mean=0.173, max=0.173, sum=0.346 (2)", - "tab": "Efficiency", - "score": 0.17305777072906495 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.334 (2)", - "tab": "Efficiency", - "score": 0.1671100668824477 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.169, mean=0.169, max=0.169, sum=0.339 (2)", - "tab": "Efficiency", - "score": 0.16945467041988 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=586.57, mean=586.57, max=586.57, sum=1173.14 (2)", - "tab": "General information", - "score": 586.57 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=496.632, mean=496.632, max=496.632, sum=993.264 (2)", - "tab": "General information", - "score": 496.63194444444446 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer 
Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=869.29, mean=869.29, max=869.29, sum=1738.58 (2)", - "tab": "General information", - "score": 869.29 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=645.25, mean=645.25, max=645.25, sum=1290.5 (2)", - "tab": "General information", - "score": 645.25 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=535.092, mean=535.092, max=535.092, sum=1070.185 (2)", - "tab": "General information", - "score": 535.0924855491329 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=530.382, mean=530.382, max=530.382, sum=1060.765 (2)", - "tab": "General information", - "score": 530.3823529411765 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer 
Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.163, mean=0.163, max=0.163, sum=0.327 (2)", - "tab": "Efficiency", - "score": 0.16325130462646484 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=399.41, mean=399.41, max=399.41, sum=798.82 (2)", - "tab": "General information", - "score": 399.41 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456, - "details": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.912 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.347 (2)", - "tab": "Efficiency", - "score": 0.17368793905827037 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=661.579, mean=661.579, max=661.579, sum=1323.158 (2)", - "tab": "General information", - "score": 661.578947368421 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42, - "details": { - "description": 
"min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.166, mean=0.166, max=0.166, sum=0.332 (2)", - "tab": "Efficiency", - "score": 0.16606518507003784 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=469.58, mean=469.58, max=469.58, sum=939.16 (2)", - "tab": "General information", - "score": 469.58 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.593 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.16, mean=0.16, max=0.16, sum=0.319 (2)", - "tab": "Efficiency", - "score": 0.15962726098519783 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=417.944, mean=417.944, max=417.944, sum=835.889 (2)", - "tab": "General information", - "score": 417.94444444444446 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.685, - "details": { - "description": "min=0.685, mean=0.685, max=0.685, sum=1.37 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.154, mean=0.154, max=0.154, sum=0.307 (2)", - "tab": "Efficiency", - 
"score": 0.1535167272451223 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=355.508, mean=355.508, max=355.508, sum=711.016 (2)", - "tab": "General information", - "score": 355.508038585209 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.681, - "details": { - "description": "min=0.681, mean=0.681, max=0.681, sum=1.363 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.195, mean=0.195, max=0.195, sum=0.389 (2)", - "tab": "Efficiency", - "score": 0.19464709828881657 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.177, mean=0.177, max=0.177, sum=0.354 (2)", - "tab": "Efficiency", - "score": 0.1770885929148248 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.469 (2)", - "tab": "Efficiency", - "score": 0.23427105509473262 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.211, mean=0.211, max=0.211, sum=0.423 (2)", - "tab": "Efficiency", - "score": 0.2114220471943126 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1143.129, mean=1143.129, max=1143.129, sum=2286.257 (2)", - "tab": "General information", - "score": 1143.1286764705883 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt 
tokens": { - "description": "min=767.429, mean=767.429, max=767.429, sum=1534.858 (2)", - "tab": "General information", - "score": 767.4290780141844 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1697.692, mean=1697.692, max=1697.692, sum=3395.385 (2)", - "tab": "General information", - "score": 1697.6923076923076 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=609.167, mean=609.167, max=609.167, sum=1218.333 (2)", - "tab": "General information", - "score": 609.1666666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.183, mean=0.183, max=0.183, sum=0.366 (2)", - "tab": "Efficiency", - "score": 0.18277841329574585 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=452.23, mean=452.23, max=452.23, sum=904.46 (2)", - "tab": "General information", - "score": 452.23 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.487 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.223, mean=0.223, max=0.223, sum=0.446 (2)", - "tab": "Efficiency", - "score": 0.22317567624543844 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=615.276, mean=615.276, max=615.276, sum=1230.553 (2)", - "tab": "General information", - "score": 615.2763157894736 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "details": { - "description": "min=0.63, mean=0.63, max=0.63, sum=1.26 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.17, mean=0.17, max=0.17, sum=0.34 (2)", - "tab": "Efficiency", - "score": 0.16991474628448486 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=582.46, mean=582.46, max=582.46, sum=1164.92 (2)", - "tab": "General information", - "score": 582.46 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - 
} - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.186, mean=0.186, max=0.186, sum=0.371 (2)", - "tab": "Efficiency", - "score": 0.1857448289979179 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=433.181, mean=433.181, max=433.181, sum=866.362 (2)", - "tab": "General information", - "score": 433.1811320754717 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.528, - "details": { - "description": "min=0.528, mean=0.528, max=0.528, sum=1.055 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.146, mean=0.146, max=0.146, sum=0.293 (2)", - "tab": "Efficiency", - "score": 0.14639884360293123 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=322.511, mean=322.511, max=322.511, sum=645.021 (2)", - "tab": "General information", - "score": 322.51063829787233 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593, - "details": { - "description": "min=0.593, mean=0.593, max=0.593, sum=1.186 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.329 (2)", - "tab": "Efficiency", - "score": 0.16444927248461494 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=494.648, mean=494.648, max=494.648, sum=989.297 (2)", - "tab": "General information", - "score": 494.64827586206894 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437, - "details": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.873 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.342 (2)", - "tab": "Efficiency", - "score": 0.17102001079175838 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=609.537, mean=609.537, max=609.537, sum=1219.074 (2)", - "tab": "General information", - "score": 609.5370370370371 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405, - "details": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.81 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.178, mean=0.178, max=0.178, sum=0.357 (2)", - "tab": "Efficiency", - "score": 0.17840472289494105 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=630.992, mean=630.992, max=630.992, sum=1261.984 (2)", - "tab": "General information", - "score": 630.9920634920635 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.168, mean=0.168, max=0.168, sum=0.337 (2)", - "tab": "Efficiency", - "score": 0.16835398827829667 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.341 (2)", - "tab": "Efficiency", - "score": 0.17066421649726154 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.183, mean=0.183, max=0.183, sum=0.367 (2)", - "tab": "Efficiency", - "score": 0.1834348964691162 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.577 (2)", - "tab": "Efficiency", - "score": 0.28851397543242485 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.155, mean=0.155, max=0.155, sum=0.31 (2)", - "tab": "Efficiency", - "score": 0.15488721987213752 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.169, mean=0.169, max=0.169, sum=0.338 (2)", - "tab": "Efficiency", - "score": 0.16877420331530002 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.159, mean=0.159, max=0.159, sum=0.318 (2)", - "tab": "Efficiency", - "score": 0.1590262247965886 - }, - "High School 
Mathematics - Observed inference time (s)": { - "description": "min=0.169, mean=0.169, max=0.169, sum=0.337 (2)", - "tab": "Efficiency", - "score": 0.1685257187596074 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.156, mean=0.156, max=0.156, sum=0.312 (2)", - "tab": "Efficiency", - "score": 0.1562105868043018 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.165, mean=0.165, max=0.165, sum=0.33 (2)", - "tab": "Efficiency", - "score": 0.16475912277272206 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.169, mean=0.169, max=0.169, sum=0.339 (2)", - "tab": "Efficiency", - "score": 0.16945652830491373 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.184, mean=0.184, max=0.184, sum=0.368 (2)", - "tab": "Efficiency", - "score": 0.18419962348761382 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.571 (2)", - "tab": "Efficiency", - "score": 0.28542132938609405 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.216, mean=0.216, max=0.216, sum=0.433 (2)", - "tab": "Efficiency", - "score": 0.21634829698232658 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=527.213, mean=527.213, max=527.213, sum=1054.426 (2)", - "tab": "General information", - "score": 527.2129032258065 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=530.635, mean=530.635, max=530.635, sum=1061.271 (2)", - "tab": "General information", - "score": 530.6354679802955 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=932.02, mean=932.02, max=932.02, sum=1864.04 (2)", - "tab": "General information", - "score": 932.02 - }, - "High School Computer Science - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2823.042, mean=2823.042, max=2823.042, sum=5646.085 (2)", - "tab": "General information", - "score": 2823.042424242424 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=407.818, mean=407.818, max=407.818, sum=815.636 (2)", - "tab": "General information", - "score": 407.8181818181818 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=489.155, mean=489.155, max=489.155, sum=978.311 (2)", - "tab": "General information", - "score": 489.1554404145078 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=407.654, mean=407.654, max=407.654, sum=815.308 (2)", - "tab": "General information", - "score": 407.65384615384613 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": 
{ - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=589.774, mean=589.774, max=589.774, sum=1179.548 (2)", - "tab": "General information", - "score": 589.7740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=428.403, mean=428.403, max=428.403, sum=856.807 (2)", - "tab": "General information", - "score": 428.4033613445378 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=604.272, mean=604.272, max=604.272, sum=1208.543 (2)", - "tab": "General information", - "score": 604.2715231788079 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=516.004, mean=516.004, max=516.004, sum=1032.007 (2)", - "tab": "General information", - "score": 516.0036697247706 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=871.264, mean=871.264, max=871.264, sum=1742.528 (2)", - "tab": "General 
information", - "score": 871.2638888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2240.358, mean=2240.358, max=2240.358, sum=4480.716 (2)", - "tab": "General information", - "score": 2240.357843137255 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1443.321, mean=1443.321, max=1443.321, sum=2886.641 (2)", - "tab": "General information", - "score": 1443.3206751054852 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763, - "details": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.527 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.154, mean=0.154, max=0.154, sum=0.308 (2)", - "tab": "Efficiency", - "score": 0.15405324649383134 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.159, mean=0.159, max=0.159, sum=0.318 (2)", - "tab": "Efficiency", - "score": 0.15911357275402274 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=336.09, mean=336.09, max=336.09, sum=672.179 (2)", - "tab": "General information", - "score": 336.0896860986547 - }, - "Human Aging - # 
output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=367.16, mean=367.16, max=367.16, sum=734.321 (2)", - "tab": "General information", - "score": 367.1603053435114 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.603 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.347 (2)", - "tab": "Efficiency", - "score": 0.1736255066453918 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=653.612, mean=653.612, max=653.612, sum=1307.223 (2)", - "tab": "General information", - "score": 653.6115702479339 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "details": { - "description": "min=0.798, mean=0.798, max=0.798, sum=1.595 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.327 (2)", - "tab": "Efficiency", - "score": 0.16361909117435386 - }, 
- "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=463.773, mean=463.773, max=463.773, sum=927.546 (2)", - "tab": "General information", - "score": 463.7730061349693 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446, - "details": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.893 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.176, mean=0.176, max=0.176, sum=0.352 (2)", - "tab": "Efficiency", - "score": 0.17583884937422617 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=716.438, mean=716.438, max=716.438, sum=1432.875 (2)", - "tab": "General information", - "score": 716.4375 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.156, mean=0.156, max=0.156, sum=0.312 (2)", - "tab": "Efficiency", - "score": 0.15610716875317027 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - 
"Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=294.456, mean=294.456, max=294.456, sum=588.913 (2)", - "tab": "General information", - "score": 294.45631067961165 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.872, mean=0.872, max=0.872, sum=1.744 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.161, mean=0.161, max=0.161, sum=0.321 (2)", - "tab": "Efficiency", - "score": 0.16073521895286363 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=446.855, mean=446.855, max=446.855, sum=893.709 (2)", - "tab": "General information", - "score": 446.85470085470087 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.16, mean=0.16, max=0.16, sum=0.319 (2)", - "tab": "Efficiency", - "score": 0.15951916217803955 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=357.02, mean=357.02, max=357.02, sum=714.04 (2)", - "tab": "General information", - "score": 357.02 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.153, mean=0.153, max=0.153, sum=0.307 (2)", - "tab": "Efficiency", - "score": 0.15332558511317462 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=325.76, mean=325.76, max=325.76, sum=651.52 (2)", - "tab": "General information", - "score": 325.75989782886336 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451, - "details": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.903 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.17, mean=0.17, max=0.17, sum=0.339 (2)", - "tab": "Efficiency", - "score": 0.16953640452699165 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.176, mean=0.176, max=0.176, sum=0.351 (2)", - "tab": "Efficiency", - "score": 0.1756493640345568 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=506.78, mean=506.78, max=506.78, sum=1013.561 (2)", - "tab": "General information", - "score": 506.78034682080926 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=699.344, mean=699.344, max=699.344, sum=1398.688 (2)", - "tab": "General information", - "score": 699.3441340782123 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703, - "details": { - "description": "min=0.703, mean=0.703, max=0.703, sum=1.405 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.342 (2)", - "tab": "Efficiency", - "score": 0.17089871020098918 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=618.402, mean=618.402, max=618.402, sum=1236.804 (2)", - "tab": "General information", - "score": 618.4019607843137 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.728, mean=0.728, max=0.728, sum=1.457 (2)", - "tab": "Accuracy", - "Prehistory - 
Observed inference time (s)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.333 (2)", - "tab": "Efficiency", - "score": 0.16663335429297554 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=549.235, mean=549.235, max=549.235, sum=1098.469 (2)", - "tab": "General information", - "score": 549.2345679012345 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.17, mean=0.17, max=0.17, sum=0.341 (2)", - "tab": "Efficiency", - "score": 0.17039124532179398 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=434.682, mean=434.682, max=434.682, sum=869.364 (2)", - "tab": "General information", - "score": 434.6818181818182 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714, - "details": { - "description": "min=0.714, mean=0.714, max=0.714, sum=1.429 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.203, mean=0.203, max=0.203, sum=0.405 (2)", - "tab": "Efficiency", - "score": 0.20251671927315848 - }, - "Security Studies 
- # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1207.494, mean=1207.494, max=1207.494, sum=2414.988 (2)", - "tab": "General information", - "score": 1207.4938775510204 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.866, - "details": { - "description": "min=0.866, mean=0.866, max=0.866, sum=1.731 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.327 (2)", - "tab": "Efficiency", - "score": 0.16369761163322485 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=467.343, mean=467.343, max=467.343, sum=934.687 (2)", - "tab": "General information", - "score": 467.34328358208955 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542, - "details": { - "description": "min=0.542, mean=0.542, max=0.542, sum=1.084 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.158, mean=0.158, max=0.158, sum=0.316 (2)", - "tab": "Efficiency", - "score": 0.15811713919582138 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=352.861, mean=352.861, max=352.861, sum=705.723 (2)", - "tab": "General information", - "score": 352.8614457831325 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "description": "min=0.813, mean=0.813, max=0.813, sum=1.626 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.145, mean=0.145, max=0.145, sum=0.291 (2)", - "tab": "Efficiency", - "score": 0.1452833434991669 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=277.047, mean=277.047, max=277.047, sum=554.094 (2)", - "tab": "General information", - "score": 277.046783625731 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.959, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json b/data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json deleted file mode 100644 index d5f73b61f..000000000 --- a/data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/databricks_dbrx-instruct/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - 
"source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DBRX Instruct", - "id": "databricks/dbrx-instruct", - "developer": "databricks", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.34, mean=0.741, max=0.953, sum=84.475 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.356, mean=0.459, max=1.347, sum=52.272 (114)", - "tab": "Efficiency", - "score": 0.4585284510595002 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.561, mean=607.852, max=2791.073, sum=69295.086 (114)", - "tab": "General information", - "score": 607.851634217556 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - 
"mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34, - "details": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.863 (2)", - "tab": "Efficiency", - "score": 0.4316913342475891 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=366.44, mean=366.44, max=366.44, sum=732.88 (2)", - "tab": "General information", - "score": 366.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.771 (2)", - "tab": "Efficiency", - "score": 0.38546188672383624 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)", - "tab": "General information", - "score": 346.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539, - "details": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.078 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.789 (2)", - "tab": "Efficiency", - "score": 0.39454248666763303 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.781 (2)", - "tab": "Efficiency", - "score": 0.3906625145011478 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.877 (2)", - "tab": "Efficiency", - "score": 0.438518271446228 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.792 (2)", - "tab": "Efficiency", - "score": 0.3961342000961304 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.784 (2)", - "tab": "Efficiency", - "score": 0.39187397708782573 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.813 (2)", - "tab": "Efficiency", - "score": 0.4062807746962005 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=542.4, mean=542.4, max=542.4, sum=1084.8 (2)", - "tab": "General information", - "score": 542.4 - }, - "College Chemistry - # output tokens": 
{ - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=466.917, mean=466.917, max=466.917, sum=933.833 (2)", - "tab": "General information", - "score": 466.9166666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=821.39, mean=821.39, max=821.39, sum=1642.78 (2)", - "tab": "General information", - "score": 821.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=587.52, mean=587.52, max=587.52, sum=1175.04 (2)", - "tab": "General information", - "score": 587.52 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=495.728, mean=495.728, max=495.728, sum=991.457 (2)", - "tab": "General information", - "score": 495.728323699422 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", 
- "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=496.608, mean=496.608, max=496.608, sum=993.216 (2)", - "tab": "General information", - "score": 496.6078431372549 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.4148012113571167 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=371.54, mean=371.54, max=371.54, sum=743.08 (2)", - "tab": "General information", - "score": 371.54 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605, - "details": { - "description": "min=0.605, mean=0.605, max=0.605, sum=1.211 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.863 (2)", - "tab": "Efficiency", - "score": 0.43133983904855294 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=607.43, mean=607.43, max=607.43, sum=1214.86 (2)", - "tab": "General information", - 
"score": 607.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.857 (2)", - "tab": "Efficiency", - "score": 0.4286450815200806 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=392.71, mean=392.71, max=392.71, sum=785.42 (2)", - "tab": "General information", - "score": 392.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.685 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.873 (2)", - "tab": "Efficiency", - "score": 0.43625413488458703 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.639, mean=387.639, max=387.639, sum=775.278 (2)", - "tab": "General information", - "score": 387.6388888888889 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.4079643100787589 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.084, mean=322.084, max=322.084, sum=644.167 (2)", - "tab": "General information", - "score": 322.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "details": { - "description": "min=0.801, mean=0.801, max=0.801, sum=1.601 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Efficiency", - "score": 0.4699658164206673 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.791 (2)", - "tab": "Efficiency", - "score": 0.39532034532398197 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.556, mean=0.556, max=0.556, sum=1.113 (2)", - "tab": "Efficiency", - "score": 0.5564531824579451 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.776 (2)", - "tab": "Efficiency", - "score": 0.3879917279567594 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1087.585, mean=1087.585, max=1087.585, sum=2175.169 (2)", - "tab": "General information", - "score": 1087.5845588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=651.592, mean=651.592, max=651.592, sum=1303.184 (2)", - "tab": "General information", - "score": 651.5921985815603 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1630.787, mean=1630.787, max=1630.787, sum=3261.574 (2)", - "tab": "General information", - "score": 1630.7868318122555 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=568.114, mean=568.114, max=568.114, sum=1136.229 (2)", - "tab": "General information", - "score": 568.1143790849674 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed 
inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Efficiency", - "score": 0.3899818444252014 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=415.79, mean=415.79, max=415.79, sum=831.58 (2)", - "tab": "General information", - "score": 415.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.671 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.857 (2)", - "tab": "Efficiency", - "score": 0.42830287625915126 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=572.691, mean=572.691, max=572.691, sum=1145.382 (2)", - "tab": "General information", - "score": 572.6907894736842 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.891 (2)", - "tab": "Efficiency", - "score": 0.44572278976440427 - }, - "Business Ethics - # eval": { - "description": "min=100, 
mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)", - "tab": "General information", - "score": 562.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.577 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.404, mean=0.404, max=0.404, sum=0.807 (2)", - "tab": "Efficiency", - "score": 0.4037102978184538 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=390.947, mean=390.947, max=390.947, sum=781.894 (2)", - "tab": "General information", - "score": 390.94716981132075 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.481 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.758 (2)", - "tab": "Efficiency", - "score": 0.3791612523667356 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": 
{ - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=297.838, mean=297.838, max=297.838, sum=595.677 (2)", - "tab": "General information", - "score": 297.83829787234043 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.421 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.767 (2)", - "tab": "Efficiency", - "score": 0.3837302882095863 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=433.641, mean=433.641, max=433.641, sum=867.283 (2)", - "tab": "General information", - "score": 433.6413793103448 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.563, - "details": { - "description": "min=0.563, mean=0.563, max=0.563, sum=1.127 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.783 (2)", - "tab": "Efficiency", - "score": 0.3916708092210154 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, 
mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=524.862, mean=524.862, max=524.862, sum=1049.725 (2)", - "tab": "General information", - "score": 524.8624338624338 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.563, - "details": { - "description": "min=0.563, mean=0.563, max=0.563, sum=1.127 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.419, mean=0.419, max=0.419, sum=0.837 (2)", - "tab": "Efficiency", - "score": 0.41872944339873297 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=599.762, mean=599.762, max=599.762, sum=1199.524 (2)", - "tab": "General information", - "score": 599.7619047619048 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.775 (2)", - "tab": "Efficiency", - "score": 0.3873311073549332 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.712 (2)", - "tab": "Efficiency", - "score": 0.356056117071894 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.832 (2)", 
- "tab": "Efficiency", - "score": 0.4159617280960083 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.784, mean=0.784, max=0.784, sum=1.569 (2)", - "tab": "Efficiency", - "score": 0.7843083367203221 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.573, mean=0.573, max=0.573, sum=1.146 (2)", - "tab": "Efficiency", - "score": 0.573177902385442 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.522, mean=0.522, max=0.522, sum=1.043 (2)", - "tab": "Efficiency", - "score": 0.5217143093366079 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.013, mean=1.013, max=1.013, sum=2.025 (2)", - "tab": "Efficiency", - "score": 1.0127322582098155 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=1.347, mean=1.347, max=1.347, sum=2.694 (2)", - "tab": "Efficiency", - "score": 1.346758367397167 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.81 (2)", - "tab": "Efficiency", - "score": 0.40513940819171296 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.915 (2)", - "tab": "Efficiency", - "score": 0.45727316433230775 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.855 (2)", - "tab": "Efficiency", - "score": 0.42725621625917765 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.465, mean=0.465, max=0.465, sum=0.93 (2)", - "tab": "Efficiency", - "score": 0.4648557923458241 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.701, mean=0.701, max=0.701, sum=1.401 (2)", - "tab": "Efficiency", - "score": 0.7005175001481 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.543, mean=0.543, max=0.543, sum=1.085 (2)", - "tab": "Efficiency", - "score": 0.5426257642512583 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.677, mean=506.677, max=506.677, sum=1013.355 (2)", - "tab": "General information", - "score": 506.6774193548387 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=489.714, mean=489.714, max=489.714, sum=979.429 (2)", - "tab": "General information", - "score": 489.7142857142857 - }, - "High School Chemistry 
- # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)", - "tab": "General information", - "score": 860.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2791.073, mean=2791.073, max=2791.073, sum=5582.145 (2)", - "tab": "General information", - "score": 2791.072727272727 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.045, mean=365.045, max=365.045, sum=730.091 (2)", - "tab": "General information", - "score": 365.04545454545456 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=458.824, mean=458.824, max=458.824, sum=917.648 (2)", - "tab": "General information", - "score": 458.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School 
Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=364.562, mean=364.562, max=364.562, sum=729.123 (2)", - "tab": "General information", - "score": 364.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=525.374, mean=525.374, max=525.374, sum=1050.748 (2)", - "tab": "General information", - "score": 525.3740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=392.025, mean=392.025, max=392.025, sum=784.05 (2)", - "tab": "General information", - "score": 392.02521008403363 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.464, mean=553.464, max=553.464, sum=1106.927 (2)", - "tab": "General information", - "score": 553.4635761589404 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.246, mean=488.246, max=488.246, 
sum=976.492 (2)", - "tab": "General information", - "score": 488.24587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=788.699, mean=788.699, max=788.699, sum=1577.398 (2)", - "tab": "General information", - "score": 788.699074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)", - "tab": "General information", - "score": 2210.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1421.27, mean=1421.27, max=1421.27, sum=2842.54 (2)", - "tab": "General information", - "score": 1421.2700421940929 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.756 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.819 (2)", - "tab": "Efficiency", - 
"score": 0.4093097753054358 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.819 (2)", - "tab": "Efficiency", - "score": 0.40955095072738995 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=312.906, mean=312.906, max=312.906, sum=625.812 (2)", - "tab": "General information", - "score": 312.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.183, mean=334.183, max=334.183, sum=668.366 (2)", - "tab": "General information", - "score": 334.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.769 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.871 (2)", - "tab": "Efficiency", - "score": 0.43540735284159005 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=632.851, mean=632.851, max=632.851, sum=1265.702 (2)", - "tab": "General information", - "score": 632.8512396694215 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.693 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.836 (2)", - "tab": "Efficiency", - "score": 0.4178658789652257 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.595, mean=442.595, max=442.595, sum=885.19 (2)", - "tab": "General information", - "score": 442.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.625, mean=0.625, max=0.625, sum=1.25 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.884 (2)", - "tab": "Efficiency", - "score": 0.442230761051178 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)", - "tab": "General information", - "score": 661.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.709 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Efficiency", - "score": 0.42014194460748466 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.796, mean=276.796, max=276.796, sum=553.592 (2)", - "tab": "General information", - "score": 276.79611650485435 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.85 (2)", - "tab": "Efficiency", - "score": 0.4250037354281825 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.218, mean=397.218, max=397.218, sum=794.436 (2)", - "tab": "General information", - "score": 397.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": 
"EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.4227530717849731 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=334, mean=334, max=334, sum=668 (2)", - "tab": "General information", - "score": 334.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.821 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.734 (2)", - "tab": "Efficiency", - "score": 0.3670404892162649 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=292.925, mean=292.925, max=292.925, sum=585.849 (2)", - "tab": "General information", - "score": 292.92464878671774 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.465, - "details": { - 
"description": "min=0.465, mean=0.465, max=0.465, sum=0.93 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.766 (2)", - "tab": "Efficiency", - "score": 0.3832114066691757 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.801 (2)", - "tab": "Efficiency", - "score": 0.400396443478888 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.145, mean=469.145, max=469.145, sum=938.289 (2)", - "tab": "General information", - "score": 469.1445086705202 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)", - "tab": "General information", - "score": 649.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.814, - "details": { - "description": "min=0.814, mean=0.814, max=0.814, sum=1.627 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.795 (2)", - "tab": "Efficiency", - "score": 0.39725586947272806 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.817, mean=579.817, max=579.817, sum=1159.634 (2)", - "tab": "General information", - "score": 579.8169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.763 (2)", - "tab": "Efficiency", - "score": 0.3814176806697139 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=507.559, mean=507.559, max=507.559, sum=1015.117 (2)", - "tab": "General information", - "score": 507.55864197530866 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.382 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.782 (2)", - "tab": "Efficiency", - "score": 0.3911652868444269 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=398.318, mean=398.318, max=398.318, sum=796.636 (2)", - "tab": "General information", - "score": 398.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.928 (2)", - "tab": "Efficiency", - "score": 0.46417581013270787 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)", - "tab": "General information", - "score": 1157.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.896, - "details": { - "description": "min=0.896, mean=0.896, max=0.896, sum=1.791 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.801 (2)", - "tab": "Efficiency", - "score": 0.4007088568673205 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=438.522, mean=438.522, max=438.522, sum=877.045 (2)", - "tab": "General information", - "score": 438.5223880597015 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.771 (2)", - "tab": "Efficiency", - "score": 0.38554139022367545 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.09, mean=336.09, max=336.09, sum=672.181 (2)", - "tab": "General information", - "score": 336.0903614457831 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.765 (2)", - "tab": "Efficiency", - "score": 0.3823263380262587 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.561, mean=268.561, max=268.561, sum=537.123 (2)", - "tab": "General information", - "score": 268.56140350877195 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms 
on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -}
\ No newline at end of file
diff --git a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json b/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json
deleted file mode 100644
index 7ec071041..000000000
--- a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-llm-67b-chat/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek LLM Chat 67B", - "id": "deepseek-ai/deepseek-llm-67b-chat", - "developer": "deepseek-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "details": { - "description": "min=0.363, mean=0.725, max=0.964, sum=82.655 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.432, mean=0.591, max=1.828, sum=67.401 (114)", - "tab": "Efficiency", - "score": 0.5912370078072168 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=282.398, mean=644.941, max=2845.339, sum=73523.251 (114)", - "tab": "General information", - "score": 644.9407984438222 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", -
"high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44, - "details": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.97 (2)", - "tab": "Efficiency", - "score": 0.4850481009483337 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - 
"description": "min=382.07, mean=382.07, max=382.07, sum=764.14 (2)", - "tab": "General information", - "score": 382.07 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.903 (2)", - "tab": "Efficiency", - "score": 0.4513168688173647 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=363.059, mean=363.059, max=363.059, sum=726.119 (2)", - "tab": "General information", - "score": 363.05925925925925 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363, - "details": { - "description": "min=0.363, mean=0.363, max=0.363, sum=0.725 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.522, mean=0.522, max=0.522, sum=1.045 (2)", - "tab": "Efficiency", - "score": 0.5224089217185974 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.513, mean=0.513, max=0.513, sum=1.026 (2)", - "tab": "Efficiency", - "score": 0.5128465278281106 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=1.347 (2)", - "tab": "Efficiency", - "score": 0.6736601734161377 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.552, mean=0.552, max=0.552, sum=1.103 (2)", - "tab": "Efficiency", - "score": 0.5516978883743286 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.515, mean=0.515, max=0.515, sum=1.03 (2)", - "tab": "Efficiency", - 
"score": 0.5147825513960999 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.558, mean=0.558, max=0.558, sum=1.116 (2)", - "tab": "Efficiency", - "score": 0.5581503288418639 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=578.1, mean=578.1, max=578.1, sum=1156.2 (2)", - "tab": "General information", - "score": 578.1 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=502.611, mean=502.611, max=502.611, sum=1005.222 (2)", - "tab": "General information", - "score": 502.6111111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=864.55, mean=864.55, max=864.55, sum=1729.1 (2)", - "tab": "General information", - "score": 864.55 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=630.13, mean=630.13, max=630.13, sum=1260.26 (2)", - "tab": "General information", - "score": 630.13 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=538.613, mean=538.613, max=538.613, sum=1077.225 (2)", - "tab": "General information", - "score": 538.6127167630058 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=507.157, mean=507.157, max=507.157, sum=1014.314 (2)", - "tab": "General information", - "score": 507.15686274509807 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.96 (2)", - "tab": "Efficiency", - "score": 0.48023970127105714 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=394.36, mean=394.36, max=394.36, sum=788.72 (2)", - "tab": "General information", - "score": 394.36 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.553, - "details": { - "description": "min=0.553, mean=0.553, max=0.553, sum=1.105 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.551, mean=0.551, max=0.551, sum=1.102 (2)", - "tab": "Efficiency", - "score": 0.5508757557785302 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=646.667, mean=646.667, max=646.667, sum=1293.333 (2)", - "tab": "General information", - "score": 646.6666666666666 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=1.013 (2)", - "tab": "Efficiency", - "score": 0.5062541460990906 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=457.97, mean=457.97, max=457.97, sum=915.94 (2)", - "tab": "General information", - "score": 457.97 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, 
sum=0.954 (2)", - "tab": "Efficiency", - "score": 0.47704599963294136 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=415.861, mean=415.861, max=415.861, sum=831.722 (2)", - "tab": "General information", - "score": 415.8611111111111 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "details": { - "description": "min=0.801, mean=0.801, max=0.801, sum=1.601 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.864 (2)", - "tab": "Efficiency", - "score": 0.43181402736921404 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=347.907, mean=347.907, max=347.907, sum=695.814 (2)", - "tab": "General information", - "score": 347.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.683 (2)", - "tab": "Efficiency", - "score": 0.8414969829952016 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.641, mean=0.641, 
max=0.641, sum=1.282 (2)", - "tab": "Efficiency", - "score": 0.6411697010621957 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.161, mean=1.161, max=1.161, sum=2.323 (2)", - "tab": "Efficiency", - "score": 1.1613836899263763 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.532, mean=0.532, max=0.532, sum=1.064 (2)", - "tab": "Efficiency", - "score": 0.5318081830841264 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1166.062, mean=1166.062, max=1166.062, sum=2332.125 (2)", - "tab": "General information", - "score": 1166.0625 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=759.752, mean=759.752, max=759.752, sum=1519.504 (2)", - "tab": "General information", - "score": 759.7517730496454 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1711.27, mean=1711.27, max=1711.27, sum=3422.54 (2)", - "tab": "General information", - "score": 1711.2698826597132 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=599.475, mean=599.475, max=599.475, sum=1198.951 (2)", - "tab": "General information", - "score": 599.4754901960785 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.504, mean=0.504, max=0.504, sum=1.007 (2)", - "tab": "Efficiency", - "score": 0.5037446546554566 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=453.51, mean=453.51, max=453.51, sum=907.02 (2)", - "tab": "General information", - "score": 453.51 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.645 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.527, mean=0.527, max=0.527, sum=1.054 (2)", - "tab": "Efficiency", - "score": 0.5270162303196756 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=613.967, mean=613.967, max=613.967, sum=1227.934 (2)", - "tab": "General information", - "score": 613.9671052631579 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Efficiency", - "score": 0.5199160981178284 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=575.68, mean=575.68, max=575.68, sum=1151.36 (2)", - "tab": "General information", - "score": 575.68 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.57 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.979 (2)", - "tab": "Efficiency", - "score": 0.48968217777756023 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=436.902, mean=436.902, max=436.902, sum=873.804 (2)", - "tab": "General information", - "score": 436.90188679245284 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.723, mean=0.723, max=0.723, sum=1.447 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.883 (2)", - "tab": "Efficiency", - "score": 0.441747319444697 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=311.583, mean=311.583, max=311.583, sum=623.166 (2)", - "tab": "General information", - "score": 311.58297872340427 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669, - "details": { - "description": "min=0.669, mean=0.669, max=0.669, sum=1.338 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.498, mean=0.498, max=0.498, sum=0.995 (2)", - "tab": "Efficiency", - "score": 0.4975001285816061 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=476.407, mean=476.407, max=476.407, sum=952.814 (2)", - "tab": "General information", - "score": 476.4068965517241 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.095 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.534, mean=0.534, max=0.534, sum=1.068 (2)", - "tab": "Efficiency", - "score": 0.5340847508617179 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.267, mean=597.267, max=597.267, sum=1194.534 (2)", - "tab": "General information", - "score": 597.2671957671957 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.095 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.555, mean=0.555, max=0.555, sum=1.11 (2)", - "tab": "Efficiency", - "score": 0.5548424853218926 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=655.698, mean=655.698, max=655.698, sum=1311.397 (2)", - "tab": "General information", - "score": 655.6984126984127 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.823 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.507, mean=0.507, max=0.507, sum=1.014 (2)", - "tab": "Efficiency", - "score": 0.5071036392642606 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.511, mean=0.511, max=0.511, sum=1.023 (2)", - "tab": "Efficiency", - "score": 0.5113655968839899 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.707, mean=0.707, max=0.707, sum=1.415 (2)", - "tab": "Efficiency", - "score": 0.707279555797577 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.828, mean=1.828, max=1.828, sum=3.657 (2)", - "tab": "Efficiency", - "score": 1.8283701000791608 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.967 (2)", - "tab": "Efficiency", - "score": 0.48332409545628713 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.967 (2)", - "tab": "Efficiency", - "score": 0.48336509719413795 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.973 (2)", - "tab": "Efficiency", - "score": 0.4863407966418144 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.531, mean=0.531, max=0.531, sum=1.062 (2)", - "tab": "Efficiency", - "score": 0.5308889477341263 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.503, mean=0.503, max=0.503, sum=1.006 (2)", - "tab": "Efficiency", - "score": 0.50309332478948 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.533, mean=0.533, max=0.533, sum=1.066 (2)", - "tab": "Efficiency", - "score": 0.5327805051740432 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.039 (2)", - "tab": "Efficiency", - "score": 0.5194539997555794 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.662, mean=0.662, max=0.662, sum=1.323 (2)", - "tab": "Efficiency", - "score": 0.6615116441691363 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.442, mean=1.442, max=1.442, sum=2.885 (2)", - "tab": "Efficiency", - "score": 1.4423445556678025 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.033, mean=1.033, max=1.033, sum=2.067 (2)", - "tab": "Efficiency", - "score": 1.033272183897123 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=517.116, mean=517.116, max=517.116, sum=1034.232 (2)", - "tab": "General information", - "score": 517.1161290322581 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=527.305, mean=527.305, max=527.305, sum=1054.611 (2)", - "tab": "General information", - "score": 527.3054187192118 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=937.05, mean=937.05, max=937.05, sum=1874.1 (2)", - "tab": "General information", - "score": 937.05 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2845.339, mean=2845.339, max=2845.339, sum=5690.679 (2)", - "tab": "General information", - "score": 2845.339393939394 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=397.934, mean=397.934, max=397.934, sum=795.869 (2)", - "tab": "General information", - "score": 397.9343434343434 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=485.57, mean=485.57, max=485.57, sum=971.14 (2)", - "tab": "General information", - "score": 485.5699481865285 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.095, mean=396.095, max=396.095, sum=792.19 (2)", - "tab": "General information", - "score": 396.0948717948718 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=568.481, mean=568.481, max=568.481, sum=1136.963 (2)", - "tab": "General information", - "score": 568.4814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=416.857, mean=416.857, max=416.857, sum=833.714 (2)", - "tab": "General information", - "score": 416.85714285714283 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=590.212, mean=590.212, max=590.212, sum=1180.424 (2)", - "tab": "General information", - "score": 590.2119205298013 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=512.505, mean=512.505, max=512.505, sum=1025.009 (2)", - "tab": "General information", - "score": 512.5045871559633 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=861.625, mean=861.625, max=861.625, sum=1723.25 (2)", - "tab": "General information", - "score": 861.625 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2259.147, mean=2259.147, max=2259.147, sum=4518.294 (2)", - "tab": "General information", - "score": 2259.1470588235293 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1467.696, mean=1467.696, max=1467.696, sum=2935.392 (2)", - "tab": "General information", - "score": 1467.6962025316457 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Efficiency", - "score": 0.4340778626668614 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.499, mean=0.499, max=0.499, sum=0.999 (2)", - "tab": "Efficiency", - "score": 0.4992539391262841 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=327.825, mean=327.825, max=327.825, sum=655.65 (2)", - "tab": "General information", - "score": 327.82511210762334 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=366.824, mean=366.824, max=366.824, sum=733.649 (2)", - "tab": "General information", - "score": 366.82442748091603 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.571, mean=0.571, max=0.571, 
sum=1.142 (2)", - "tab": "Efficiency", - "score": 0.5709604842603699 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=652.669, mean=652.669, max=652.669, sum=1305.339 (2)", - "tab": "General information", - "score": 652.6694214876034 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.693 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Efficiency", - "score": 0.48975605028538616 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=452.098, mean=452.098, max=452.098, sum=904.196 (2)", - "tab": "General information", - "score": 452.09815950920245 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.125 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.632, mean=0.632, max=0.632, sum=1.264 (2)", - "tab": "Efficiency", - "score": 0.6320873349905014 - }, - "Machine Learning - # eval": { - 
"description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.054, mean=702.054, max=702.054, sum=1404.107 (2)", - "tab": "General information", - "score": 702.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.883 (2)", - "tab": "Efficiency", - "score": 0.4415167558540418 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=288.437, mean=288.437, max=288.437, sum=576.874 (2)", - "tab": "General information", - "score": 288.43689320388347 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923, - "details": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.846 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.979 (2)", - "tab": "Efficiency", - "score": 0.4894245363708235 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=435.603, mean=435.603, max=435.603, sum=871.205 (2)", - "tab": "General information", - "score": 435.6025641025641 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.947 (2)", - "tab": "Efficiency", - "score": 0.47359968423843385 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=357.07, mean=357.07, max=357.07, sum=714.14 (2)", - "tab": "General information", - "score": 357.07 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.904, - "details": { - "description": "min=0.904, mean=0.904, max=0.904, sum=1.808 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.871 (2)", - "tab": "Efficiency", - "score": 0.4352987403309361 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # 
prompt tokens": { - "description": "min=320.964, mean=320.964, max=320.964, sum=641.928 (2)", - "tab": "General information", - "score": 320.9642401021711 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.544, - "details": { - "description": "min=0.544, mean=0.544, max=0.544, sum=1.088 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.983 (2)", - "tab": "Efficiency", - "score": 0.49129951827098867 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.583, mean=0.583, max=0.583, sum=1.165 (2)", - "tab": "Efficiency", - "score": 0.5826290319751761 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=497.379, mean=497.379, max=497.379, sum=994.757 (2)", - "tab": "General information", - "score": 497.37861271676303 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=688.891, mean=688.891, max=688.891, sum=1377.781 (2)", - "tab": "General information", - "score": 688.890502793296 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.781, - "details": { - "description": "min=0.781, mean=0.781, max=0.781, sum=1.562 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.096 (2)", - "tab": "Efficiency", - "score": 0.5477774073095882 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=619.314, mean=619.314, max=619.314, sum=1238.627 (2)", - "tab": "General information", - "score": 619.3137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.858, - "details": { - "description": "min=0.858, mean=0.858, max=0.858, sum=1.716 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.521, mean=0.521, max=0.521, sum=1.042 (2)", - "tab": "Efficiency", - "score": 0.5209115015135871 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=554.775, mean=554.775, max=554.775, sum=1109.549 (2)", - "tab": "General information", - "score": 554.7746913580247 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.473, mean=0.473, max=0.473, 
sum=0.945 (2)", - "tab": "Efficiency", - "score": 0.4725117553364147 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=431.673, mean=431.673, max=431.673, sum=863.345 (2)", - "tab": "General information", - "score": 431.6727272727273 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.819 (2)", - "tab": "Efficiency", - "score": 0.9094535496770119 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1204.906, mean=1204.906, max=1204.906, sum=2409.812 (2)", - "tab": "General information", - "score": 1204.9061224489797 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.751 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.003 (2)", - "tab": "Efficiency", - "score": 0.5015075396542525 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, 
max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=457.751, mean=457.751, max=457.751, sum=915.502 (2)", - "tab": "General information", - "score": 457.7512437810945 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.108 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.582, mean=0.582, max=0.582, sum=1.165 (2)", - "tab": "Efficiency", - "score": 0.5824309874729938 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=351.434, mean=351.434, max=351.434, sum=702.867 (2)", - "tab": "General information", - "score": 351.43373493975906 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.87 (2)", - "tab": "Efficiency", - "score": 0.434985329533181 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=282.398, mean=282.398, max=282.398, sum=564.795 (2)", - "tab": "General information", - "score": 282.39766081871346 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json b/data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json deleted file mode 100644 index 200a6e19c..000000000 --- a/data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-v3/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek v3", - "id": "deepseek-ai/deepseek-v3", - "developer": "deepseek-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.596, mean=0.872, max=0.979, sum=99.412 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.495, mean=1.354, max=6.344, sum=154.309 (114)", - "tab": "Efficiency", - "score": 1.353587049503403 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.918, mean=607.861, max=2773.188, sum=69296.195 (114)", - "tab": "General information", - "score": 607.8613565650774 - }, - "MMLU All 
Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=1.171 (2)", - "tab": "Efficiency", - "score": 0.5853858423233033 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.01, mean=373.01, max=373.01, sum=746.02 (2)", - "tab": "General information", - "score": 373.01 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.867, - "details": { - "description": "min=0.867, mean=0.867, max=0.867, sum=1.733 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=1.804, mean=1.804, max=1.804, sum=3.607 (2)", - "tab": "Efficiency", - "score": 1.8037012683020697 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=332.119, mean=332.119, max=332.119, sum=664.237 (2)", - "tab": "General information", - "score": 332.1185185185185 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.814, - "details": { - "description": "min=0.814, mean=0.814, max=0.814, sum=1.627 
(2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.557, mean=0.557, max=0.557, sum=1.113 (2)", - "tab": "Efficiency", - "score": 0.5567307829856872 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.776, mean=0.776, max=0.776, sum=1.553 (2)", - "tab": "Efficiency", - "score": 0.7763584835661782 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.505, mean=0.505, max=0.505, sum=1.01 (2)", - "tab": "Efficiency", - "score": 0.5047655653953552 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.495, mean=0.495, max=0.495, sum=0.989 (2)", - "tab": "Efficiency", - "score": 0.4945454502105713 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=1.811, mean=1.811, max=1.811, sum=3.623 (2)", - "tab": "Efficiency", - "score": 1.8114735322191535 - }, - "College Physics - Observed inference time (s)": { - "description": "min=6.344, mean=6.344, max=6.344, sum=12.687 (2)", - "tab": "Efficiency", - "score": 6.343635446885052 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=541.32, mean=541.32, max=541.32, sum=1082.64 (2)", - "tab": "General information", - "score": 541.32 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=456.201, mean=456.201, max=456.201, sum=912.403 (2)", - "tab": "General information", - "score": 456.2013888888889 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.34, mean=828.34, max=828.34, sum=1656.68 (2)", - "tab": "General information", - "score": 828.34 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - 
"College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=592.74, mean=592.74, max=592.74, sum=1185.48 (2)", - "tab": "General information", - "score": 592.74 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=486.971, mean=486.971, max=486.971, sum=973.942 (2)", - "tab": "General information", - "score": 486.97109826589593 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=492.804, mean=492.804, max=492.804, sum=985.608 (2)", - "tab": "General information", - "score": 492.80392156862746 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.089 (2)", - "tab": "Efficiency", - "score": 0.5446710443496704 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - 
"description": "min=375.06, mean=375.06, max=375.06, sum=750.12 (2)", - "tab": "General information", - "score": 375.06 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=1.491 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.107 (2)", - "tab": "Efficiency", - "score": 0.5537264849010267 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=613.535, mean=613.535, max=613.535, sum=1227.07 (2)", - "tab": "General information", - "score": 613.5350877192982 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "details": { - "description": "min=0.68, mean=0.68, max=0.68, sum=1.36 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.978, mean=0.978, max=0.978, sum=1.955 (2)", - "tab": "Efficiency", - "score": 0.9775782990455627 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=398.63, mean=398.63, max=398.63, sum=797.26 (2)", - "tab": "General information", - "score": 398.63 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.668 (2)", - "tab": "Efficiency", - "score": 0.8338986083313271 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.694, mean=387.694, max=387.694, sum=775.389 (2)", - "tab": "General information", - "score": 387.69444444444446 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.673 (2)", - "tab": "Efficiency", - "score": 0.836391413710125 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=323.569, mean=323.569, max=323.569, sum=647.138 (2)", - "tab": "General information", - "score": 323.56913183279744 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.887, - "details": { - "description": "min=0.887, mean=0.887, max=0.887, sum=1.775 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.635, mean=0.635, max=0.635, sum=1.269 (2)", - "tab": "Efficiency", - "score": 0.6345776915550232 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=1.224, mean=1.224, max=1.224, sum=2.448 (2)", - "tab": "Efficiency", - "score": 1.2240875671941338 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.707, mean=0.707, max=0.707, sum=1.413 (2)", - "tab": "Efficiency", - "score": 0.7066206168941911 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.345 (2)", - "tab": "Efficiency", - "score": 0.6723053728053773 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1052.765, mean=1052.765, max=1052.765, sum=2105.529 (2)", - "tab": "General information", - "score": 1052.764705882353 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=659.613, mean=659.613, max=659.613, sum=1319.227 (2)", - "tab": "General information", - "score": 659.613475177305 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1629.421, mean=1629.421, max=1629.421, sum=3258.842 (2)", - "tab": "General information", - "score": 1629.4211212516298 - 
}, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=574.508, mean=574.508, max=574.508, sum=1149.016 (2)", - "tab": "General information", - "score": 574.5081699346405 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.156 (2)", - "tab": "Efficiency", - "score": 0.5778071475028992 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=426.43, mean=426.43, max=426.43, sum=852.86 (2)", - "tab": "General information", - "score": 426.43 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.921, - "details": { - "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.681, mean=0.681, max=0.681, sum=1.363 (2)", - "tab": 
"Efficiency", - "score": 0.6812541327978435 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=575.836, mean=575.836, max=575.836, sum=1151.671 (2)", - "tab": "General information", - "score": 575.8355263157895 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=4.691, mean=4.691, max=4.691, sum=9.381 (2)", - "tab": "Efficiency", - "score": 4.690641319751739 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=550.46, mean=550.46, max=550.46, sum=1100.92 (2)", - "tab": "General information", - "score": 550.46 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.812 (2)", - "tab": "Efficiency", - "score": 0.9061050837894655 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, 
- "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=387.449, mean=387.449, max=387.449, sum=774.898 (2)", - "tab": "General information", - "score": 387.4490566037736 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.881 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.253 (2)", - "tab": "Efficiency", - "score": 0.6267383788494354 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=300.591, mean=300.591, max=300.591, sum=601.183 (2)", - "tab": "General information", - "score": 300.59148936170214 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=1.738 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=2.459, mean=2.459, max=2.459, sum=4.918 (2)", - "tab": "Efficiency", - "score": 2.4591504623150002 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, 
max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=431.91, mean=431.91, max=431.91, sum=863.821 (2)", - "tab": "General information", - "score": 431.9103448275862 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.884 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=1.651, mean=1.651, max=1.651, sum=3.301 (2)", - "tab": "Efficiency", - "score": 1.650515148879359 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.693, mean=531.693, max=531.693, sum=1063.386 (2)", - "tab": "General information", - "score": 531.6931216931217 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.513, mean=0.513, max=0.513, sum=1.026 (2)", - "tab": "Efficiency", - "score": 0.5130742864003257 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 
5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=604.119, mean=604.119, max=604.119, sum=1208.238 (2)", - "tab": "General information", - "score": 604.1190476190476 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.857 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=2.647, mean=2.647, max=2.647, sum=5.294 (2)", - "tab": "Efficiency", - "score": 2.6472030393538937 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=3.847, mean=3.847, max=3.847, sum=7.695 (2)", - "tab": "Efficiency", - "score": 3.8474940337571018 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=2.761, mean=2.761, max=2.761, sum=5.523 (2)", - "tab": "Efficiency", - "score": 2.7613840389251707 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.944, mean=1.944, max=1.944, sum=3.888 (2)", - "tab": "Efficiency", - "score": 1.9442455436244155 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.607, mean=0.607, max=0.607, sum=1.215 (2)", - "tab": "Efficiency", - "score": 0.6073213755482375 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=2.403, mean=2.403, max=2.403, sum=4.805 (2)", - "tab": "Efficiency", - "score": 2.4025608480285485 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.654, mean=0.654, max=0.654, sum=1.308 (2)", - "tab": "Efficiency", - "score": 0.6539444972307255 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=2.285, mean=2.285, max=2.285, sum=4.57 (2)", - "tab": "Efficiency", - "score": 2.285083364557337 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=1.265, mean=1.265, max=1.265, sum=2.531 (2)", - "tab": "Efficiency", - "score": 1.2653034544792496 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=1.036, mean=1.036, max=1.036, sum=2.072 (2)", - "tab": "Efficiency", - "score": 1.0361600064283965 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=1.658, mean=1.658, max=1.658, sum=3.315 (2)", - "tab": "Efficiency", - "score": 1.6576398372650147 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.513, mean=0.513, max=0.513, sum=1.027 (2)", - "tab": "Efficiency", - "score": 0.5133153398831686 - }, - "High School US History - Observed 
inference time (s)": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Efficiency", - "score": 0.7908881224837958 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.65, mean=1.65, max=1.65, sum=3.301 (2)", - "tab": "Efficiency", - "score": 1.6504118030081318 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=505.561, mean=505.561, max=505.561, sum=1011.123 (2)", - "tab": "General information", - "score": 505.56129032258065 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=479.32, mean=479.32, max=479.32, sum=958.64 (2)", - "tab": "General information", - "score": 479.320197044335 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=871.42, mean=871.42, max=871.42, sum=1742.84 (2)", - "tab": "General information", - "score": 871.42 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2773.188, mean=2773.188, max=2773.188, sum=5546.376 (2)", - "tab": "General information", - "score": 2773.1878787878786 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - 
"description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=369.53, mean=369.53, max=369.53, sum=739.061 (2)", - "tab": "General information", - "score": 369.530303030303 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=463.767, mean=463.767, max=463.767, sum=927.534 (2)", - "tab": "General information", - "score": 463.76683937823833 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.418, mean=370.418, max=370.418, sum=740.836 (2)", - "tab": "General information", - "score": 370.4179487179487 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=520.57, mean=520.57, max=520.57, sum=1041.141 (2)", - "tab": "General information", - "score": 520.5703703703704 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.782, mean=399.782, max=399.782, sum=799.563 (2)", - "tab": "General information", - "score": 399.781512605042 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=559.967, mean=559.967, max=559.967, sum=1119.934 (2)", - "tab": "General information", - "score": 559.9668874172186 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=480.22, mean=480.22, max=480.22, sum=960.44 (2)", - "tab": "General information", - "score": 480.2201834862385 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=796.333, mean=796.333, max=796.333, sum=1592.667 (2)", - "tab": "General information", - "score": 796.3333333333334 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2202.103, mean=2202.103, max=2202.103, sum=4404.206 (2)", - "tab": "General information", - "score": 2202.1029411764707 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1403.051, mean=1403.051, max=1403.051, sum=2806.101 (2)", - "tab": "General information", - "score": 1403.0506329113923 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.847 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.553, mean=0.553, max=0.553, sum=1.106 (2)", - "tab": "Efficiency", - "score": 0.5531257503235821 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.511, mean=0.511, max=0.511, sum=1.022 (2)", - "tab": "Efficiency", - "score": 0.5109815524734613 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=315.26, mean=315.26, max=315.26, sum=630.52 (2)", - "tab": "General information", - "score": 315.26008968609864 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.29, mean=341.29, max=341.29, sum=682.58 (2)", - "tab": "General information", - "score": 341.29007633587787 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.901 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)", - "tab": "Efficiency", - "score": 0.8861682651456723 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.339, mean=639.339, max=639.339, sum=1278.678 (2)", - "tab": "General information", - "score": 639.3388429752066 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.828 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.919, mean=0.919, max=0.919, sum=1.838 (2)", - "tab": "Efficiency", - "score": 0.9191862732354849 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.239, mean=442.239, max=442.239, sum=884.479 (2)", - "tab": "General information", - "score": 442.23926380368096 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - 
"evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.571 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.036 (2)", - "tab": "Efficiency", - "score": 0.5179938631398338 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=666.277, mean=666.277, max=666.277, sum=1332.554 (2)", - "tab": "General information", - "score": 666.2767857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=4.248, mean=4.248, max=4.248, sum=8.497 (2)", - "tab": "Efficiency", - "score": 4.248399836345784 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=277.379, mean=277.379, max=277.379, sum=554.757 (2)", - "tab": "General information", - "score": 277.378640776699 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.897 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=1.645, mean=1.645, max=1.645, sum=3.29 (2)", - "tab": "Efficiency", - "score": 1.6448312304977677 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=398.675, mean=398.675, max=398.675, sum=797.35 (2)", - "tab": "General information", - "score": 398.6752136752137 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.527, mean=0.527, max=0.527, sum=1.054 (2)", - "tab": "Efficiency", - "score": 0.5272433400154114 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=328.48, mean=328.48, max=328.48, sum=656.96 (2)", - "tab": "General information", - "score": 328.48 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.898 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=2.642, mean=2.642, max=2.642, sum=5.284 (2)", - "tab": "Efficiency", - "score": 2.6419809954681006 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=296.626, mean=296.626, max=296.626, sum=593.252 (2)", - "tab": "General information", - "score": 296.6257982120051 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.616 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.637, mean=0.637, max=0.637, sum=1.275 (2)", - "tab": "Efficiency", - "score": 0.6374224183187319 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=1.247 (2)", - "tab": "Efficiency", - "score": 0.6235519771469372 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=477.78, mean=477.78, max=477.78, sum=955.561 (2)", - "tab": "General information", - "score": 477.78034682080926 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=662.517, mean=662.517, 
max=662.517, sum=1325.035 (2)", - "tab": "General information", - "score": 662.5173184357542 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.918, - "details": { - "description": "min=0.918, mean=0.918, max=0.918, sum=1.837 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=1.989, mean=1.989, max=1.989, sum=3.977 (2)", - "tab": "Efficiency", - "score": 1.9886824734070723 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=570.337, mean=570.337, max=570.337, sum=1140.673 (2)", - "tab": "General information", - "score": 570.3366013071895 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923, - "details": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.846 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.819 (2)", - "tab": "Efficiency", - "score": 0.9094557386857492 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=505.194, mean=505.194, max=505.194, sum=1010.389 (2)", - "tab": "General information", - "score": 505.19444444444446 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - 
} - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)", - "tab": "Efficiency", - "score": 0.8472580974752253 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=402.009, mean=402.009, max=402.009, sum=804.018 (2)", - "tab": "General information", - "score": 402.0090909090909 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.837, - "details": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.673 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.318 (2)", - "tab": "Efficiency", - "score": 0.6588058092156235 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1160.294, mean=1160.294, max=1160.294, sum=2320.588 (2)", - "tab": "General information", - "score": 1160.2938775510204 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.955, - "details": { - "description": "min=0.955, mean=0.955, max=0.955, sum=1.91 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=1.251, mean=1.251, max=1.251, sum=2.501 (2)", - "tab": "Efficiency", - "score": 1.2506972652169603 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=443.891, mean=443.891, max=443.891, sum=887.781 (2)", - "tab": "General information", - "score": 443.8905472636816 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.019 (2)", - "tab": "Efficiency", - "score": 0.5092598558908485 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=329.572, mean=329.572, max=329.572, sum=659.145 (2)", - "tab": "General information", - "score": 329.5722891566265 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.912, - "details": { - "description": "min=0.912, mean=0.912, max=0.912, sum=1.825 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=1.251, mean=1.251, max=1.251, sum=2.501 (2)", - "tab": "Efficiency", - "score": 1.2507223441586857 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.918, mean=268.918, max=268.918, sum=537.836 (2)", - "tab": "General information", - "score": 268.91812865497076 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.215, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json b/data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json deleted file mode 100644 index 86096274a..000000000 --- a/data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.0-pro-001/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.0 Pro 001", - "id": "google/gemini-1.0-pro-001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.7, - "details": { - "description": "min=0.333, mean=0.7, max=0.933, sum=79.795 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.291, mean=0.385, max=0.991, sum=43.868 (114)", - "tab": "Efficiency", - "score": 0.3848050244039386 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=260.164, mean=624.617, max=2789.424, sum=71206.345 (114)", - "tab": "General information", - "score": 624.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - 
"mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34, - "details": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.991, mean=0.991, max=0.991, sum=1.982 (2)", - "tab": "Efficiency", - "score": 0.9907678151130677 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=375.97, mean=375.97, max=375.97, sum=751.94 (2)", - "tab": "General information", - "score": 375.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.652, mean=0.652, max=0.652, sum=1.304 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.636 (2)", - "tab": "Efficiency", - "score": 0.3178748925526937 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=336.356, mean=336.356, 
max=336.356, sum=672.711 (2)", - "tab": "General information", - "score": 336.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333, - "details": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.667 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.377, mean=0.377, max=0.377, sum=0.754 (2)", - "tab": "Efficiency", - "score": 0.37708688735961915 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.2937609056631724 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.75 (2)", - "tab": "Efficiency", - "score": 0.37500447273254395 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.712 (2)", - "tab": "Efficiency", - "score": 0.35595274686813355 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.627 (2)", - "tab": "Efficiency", - "score": 0.31358790535458253 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.3357745151893765 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=562.02, mean=562.02, max=562.02, sum=1124.04 (2)", - "tab": "General information", - "score": 562.02 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=474.799, mean=474.799, max=474.799, sum=949.597 (2)", - "tab": "General information", - "score": 474.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=849.86, mean=849.86, max=849.86, sum=1699.72 (2)", - "tab": "General information", - "score": 849.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=618.69, mean=618.69, max=618.69, sum=1237.38 (2)", - "tab": "General information", - "score": 618.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=505.37, mean=505.37, max=505.37, sum=1010.74 (2)", - "tab": "General information", - "score": 505.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=499.471, mean=499.471, max=499.471, sum=998.941 (2)", - "tab": "General information", - "score": 499.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": 
{ - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.627 (2)", - "tab": "Efficiency", - "score": 0.31363418102264407 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=372.91, mean=372.91, max=372.91, sum=745.82 (2)", - "tab": "General information", - "score": 372.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553, - "details": { - "description": "min=0.553, mean=0.553, max=0.553, sum=1.105 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.377, mean=0.377, max=0.377, sum=0.754 (2)", - "tab": "Efficiency", - "score": 0.37716702620188397 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=626.553, mean=626.553, max=626.553, sum=1253.105 (2)", - "tab": "General information", - "score": 626.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.49, - "details": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.639 (2)", - "tab": "Efficiency", - "score": 0.3196276807785034 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=448.54, mean=448.54, max=448.54, sum=897.08 (2)", - "tab": "General information", - "score": 448.54 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.29897612112539784 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=399.87, mean=399.87, max=399.87, sum=799.741 (2)", - "tab": "General information", - "score": 399.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.762, mean=0.762, max=0.762, sum=1.524 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, 
sum=0.636 (2)", - "tab": "Efficiency", - "score": 0.31779951221306607 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=332.907, mean=332.907, max=332.907, sum=665.814 (2)", - "tab": "General information", - "score": 332.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.503 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.498, mean=0.498, max=0.498, sum=0.997 (2)", - "tab": "Efficiency", - "score": 0.49840929939298173 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.3838615434389588 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.019 (2)", - "tab": "Efficiency", - "score": 0.5094701207541172 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.775 (2)", - "tab": "Efficiency", - "score": 0.3877133719230953 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1105.092, mean=1105.092, max=1105.092, sum=2210.184 (2)", - "tab": "General information", - "score": 1105.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, 
- "Professional Accounting - # prompt tokens": { - "description": "min=747.418, mean=747.418, max=747.418, sum=1494.837 (2)", - "tab": "General information", - "score": 747.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1677.119, mean=1677.119, max=1677.119, sum=3354.239 (2)", - "tab": "General information", - "score": 1677.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=586.363, mean=586.363, max=586.363, sum=1172.725 (2)", - "tab": "General information", - "score": 586.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.611 (2)", - "tab": "Efficiency", - "score": 0.30568787574768064 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=430.2, mean=430.2, max=430.2, sum=860.4 (2)", - "tab": "General information", - "score": 430.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.035 (2)", - "tab": "Efficiency", - "score": 0.5173565070880087 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=594.421, mean=594.421, max=594.421, sum=1188.842 (2)", - "tab": "General information", - "score": 594.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.38599337100982667 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=544.87, mean=544.87, max=544.87, sum=1089.74 (2)", - "tab": "General information", - "score": 544.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.758, - "details": { - "description": "min=0.758, mean=0.758, max=0.758, sum=1.517 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.29948959980370865 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=394.592, mean=394.592, max=394.592, sum=789.185 (2)", - "tab": "General information", - "score": 394.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.413 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.29394423606547904 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=301.213, mean=301.213, max=301.213, sum=602.426 (2)", - "tab": "General information", - "score": 301.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical 
Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.379 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.581 (2)", - "tab": "Efficiency", - "score": 0.2906524740416428 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=466.786, mean=466.786, max=466.786, sum=933.572 (2)", - "tab": "General information", - "score": 466.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.476, - "details": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.786 (2)", - "tab": "Efficiency", - "score": 0.3928584957879687 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=589.341, mean=589.341, max=589.341, sum=1178.683 (2)", - "tab": "General information", - "score": 589.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.468, - "details": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.937 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.797 (2)", - "tab": "Efficiency", - "score": 0.39849274120633565 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=611.563, mean=611.563, max=611.563, sum=1223.127 (2)", - "tab": "General information", - "score": 611.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.3214967135460146 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.3413804282108551 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.756 (2)", - "tab": "Efficiency", - "score": 0.37822843074798584 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.672 (2)", - "tab": "Efficiency", - "score": 0.836203297701749 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 (2)", - "tab": "Efficiency", - "score": 0.3208902616693516 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Efficiency", - "score": 0.3069849088401992 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.641 (2)", - "tab": "Efficiency", - "score": 0.32043021275446965 - }, - 
"High School Mathematics - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.38611255663412586 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.631 (2)", - "tab": "Efficiency", - "score": 0.31541170993772877 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.767 (2)", - "tab": "Efficiency", - "score": 0.3833695673784673 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.668 (2)", - "tab": "Efficiency", - "score": 0.33389012427891 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Efficiency", - "score": 0.39985558611375316 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.655 (2)", - "tab": "Efficiency", - "score": 0.8272603574921104 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.035 (2)", - "tab": "Efficiency", - "score": 0.5172926987273784 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=492.958, mean=492.958, max=492.958, sum=985.916 (2)", - "tab": "General information", - "score": 492.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=505.064, mean=505.064, max=505.064, sum=1010.128 (2)", - "tab": "General information", - "score": 505.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=927.13, mean=927.13, max=927.13, sum=1854.26 (2)", - "tab": "General information", - "score": 927.13 - }, - "High School Computer Science - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2789.424, mean=2789.424, max=2789.424, sum=5578.848 (2)", - "tab": "General information", - "score": 2789.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=386.773, mean=386.773, max=386.773, sum=773.545 (2)", - "tab": "General information", - "score": 386.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=471.301, mean=471.301, max=471.301, sum=942.601 (2)", - "tab": "General information", - "score": 471.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=388.541, mean=388.541, max=388.541, sum=777.082 (2)", - "tab": "General information", - "score": 388.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=558.822, mean=558.822, max=558.822, sum=1117.644 (2)", - "tab": "General information", - "score": 558.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=407.954, mean=407.954, max=407.954, sum=815.908 (2)", - "tab": "General information", - "score": 407.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=583.715, mean=583.715, max=583.715, sum=1167.43 (2)", - "tab": "General information", - "score": 583.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=494.604, mean=494.604, max=494.604, sum=989.207 (2)", - "tab": "General information", - "score": 494.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=850.931, mean=850.931, max=850.931, sum=1701.861 (2)", - "tab": 
"General information", - "score": 850.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2197.583, mean=2197.583, max=2197.583, sum=4395.167 (2)", - "tab": "General information", - "score": 2197.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1418.544, mean=1418.544, max=1418.544, sum=2837.089 (2)", - "tab": "General information", - "score": 1418.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618, - "details": { - "description": "min=0.618, mean=0.618, max=0.618, sum=1.237 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.3080115040321521 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.593 (2)", - "tab": "Efficiency", - "score": 0.29670037984848024 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=313.587, mean=313.587, max=313.587, sum=627.175 (2)", - "tab": "General information", - "score": 313.58744394618833 - }, - "Human 
Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=339.183, mean=339.183, max=339.183, sum=678.366 (2)", - "tab": "General information", - "score": 339.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.752 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.761 (2)", - "tab": "Efficiency", - "score": 0.3803488971773258 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=636.165, mean=636.165, max=636.165, sum=1272.331 (2)", - "tab": "General information", - "score": 636.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.607 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 
0.30376981372482204 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.049, mean=442.049, max=442.049, sum=884.098 (2)", - "tab": "General information", - "score": 442.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.527, - "details": { - "description": "min=0.527, mean=0.527, max=0.527, sum=1.054 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.761 (2)", - "tab": "Efficiency", - "score": 0.3805731492383139 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=694.402, mean=694.402, max=694.402, sum=1388.804 (2)", - "tab": "General information", - "score": 694.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.689 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.603 (2)", - "tab": "Efficiency", - "score": 0.3013762247215197 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General 
information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=273.301, mean=273.301, max=273.301, sum=546.602 (2)", - "tab": "General information", - "score": 273.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.615 (2)", - "tab": "Efficiency", - "score": 0.30740204122331405 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=420.35, mean=420.35, max=420.35, sum=840.701 (2)", - "tab": "General information", - "score": 420.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.738 (2)", - "tab": "Efficiency", - "score": 0.36919414043426513 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=330.89, mean=330.89, max=330.89, sum=661.78 (2)", - "tab": "General information", - "score": 330.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.701 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.30495573064528814 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=306.669, mean=306.669, max=306.669, sum=613.338 (2)", - "tab": "General information", - "score": 306.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.921 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.3512327629706763 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Efficiency", - "score": 0.3902203835572113 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=487.003, mean=487.003, max=487.003, sum=974.006 (2)", - "tab": "General information", - "score": 487.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=682.542, mean=682.542, max=682.542, sum=1365.084 (2)", - "tab": "General information", - "score": 682.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.575 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.767 (2)", - "tab": "Efficiency", - "score": 0.3834058817695169 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=577.48, mean=577.48, max=577.48, sum=1154.961 (2)", - "tab": "General information", - "score": 577.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.605 (2)", - "tab": "Accuracy", - 
"Prehistory - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.845 (2)", - "tab": "Efficiency", - "score": 0.42272565026342135 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=532.198, mean=532.198, max=532.198, sum=1064.395 (2)", - "tab": "General information", - "score": 532.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.382 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.3049524025483565 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=418.655, mean=418.655, max=418.655, sum=837.309 (2)", - "tab": "General information", - "score": 418.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.523, mean=0.523, max=0.523, sum=1.046 (2)", - "tab": "Efficiency", - "score": 0.5228155525363222 
- }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1185.869, mean=1185.869, max=1185.869, sum=2371.739 (2)", - "tab": "General information", - "score": 1185.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.32126195395170754 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=448.274, mean=448.274, max=448.274, sum=896.547 (2)", - "tab": "General information", - "score": 448.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536, - "details": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.30881378018712424 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=328.753, mean=328.753, max=328.753, sum=657.506 (2)", - "tab": "General information", - "score": 328.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.719 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.673 (2)", - "tab": "Efficiency", - "score": 0.3363749897270872 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=260.164, mean=260.164, max=260.164, sum=520.327 (2)", - "tab": "General information", - "score": 260.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json b/data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json deleted file mode 100644 index 7aac2d734..000000000 --- a/data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-001/1770835937.459157", - "retrieved_timestamp": 
"1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Flash 001", - "id": "google/gemini-1.5-flash-001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.374, mean=0.779, max=0.974, sum=88.804 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.386, mean=0.487, max=0.665, sum=55.55 (114)", - "tab": "Efficiency", - "score": 0.4872786268013793 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.164, mean=632.617, max=2797.424, sum=72118.345 (114)", - "tab": "General information", - "score": 632.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - 
"mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.191 (2)", - "tab": "Efficiency", - "score": 0.595533971786499 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=383.97, mean=383.97, max=383.97, sum=767.94 (2)", - "tab": "General information", - "score": 383.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.071 (2)", - "tab": "Efficiency", - "score": 0.5356822949868661 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.356, mean=344.356, max=344.356, sum=688.711 (2)", - "tab": "General information", - "score": 344.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.62, mean=0.62, max=0.62, sum=1.24 (2)", - "tab": "Efficiency", - "score": 0.6201749587059021 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.995 (2)", - "tab": "Efficiency", - "score": 0.4974212066994773 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.571, mean=0.571, max=0.571, sum=1.143 (2)", - "tab": "Efficiency", - "score": 0.5714822864532471 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.543, mean=0.543, max=0.543, sum=1.085 (2)", - "tab": "Efficiency", - "score": 0.5425397109985352 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.975 (2)", - "tab": "Efficiency", - "score": 0.48738120056990253 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.608, mean=0.608, max=0.608, sum=1.215 (2)", - "tab": "Efficiency", - "score": 0.6076285418342141 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=570.02, mean=570.02, max=570.02, sum=1140.04 (2)", - "tab": "General information", - "score": 570.02 - }, - "College Chemistry - # output 
tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.799, mean=482.799, max=482.799, sum=965.597 (2)", - "tab": "General information", - "score": 482.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=857.86, mean=857.86, max=857.86, sum=1715.72 (2)", - "tab": "General information", - "score": 857.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=626.69, mean=626.69, max=626.69, sum=1253.38 (2)", - "tab": "General information", - "score": 626.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=513.37, mean=513.37, max=513.37, sum=1026.74 (2)", - "tab": "General information", - "score": 513.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=507.471, mean=507.471, max=507.471, sum=1014.941 (2)", - "tab": "General information", - "score": 507.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.538, mean=0.538, max=0.538, sum=1.075 (2)", - "tab": "Efficiency", - "score": 0.537526171207428 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=380.91, mean=380.91, max=380.91, sum=761.82 (2)", - "tab": "General information", - "score": 380.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.228 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.564, mean=0.564, max=0.564, sum=1.128 (2)", - "tab": "Efficiency", - "score": 0.5637641475911725 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=634.553, mean=634.553, max=634.553, sum=1269.105 (2)", - "tab": 
"General information", - "score": 634.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.549, mean=0.549, max=0.549, sum=1.097 (2)", - "tab": "Efficiency", - "score": 0.5487277007102966 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=456.54, mean=456.54, max=456.54, sum=913.08 (2)", - "tab": "General information", - "score": 456.54 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.501, mean=0.501, max=0.501, sum=1.002 (2)", - "tab": "Efficiency", - "score": 0.5009041649323923 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=407.87, mean=407.87, max=407.87, sum=815.741 (2)", - "tab": "General information", - "score": 407.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.96 (2)", - "tab": "Efficiency", - "score": 0.48008891700548373 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=340.907, mean=340.907, max=340.907, sum=681.814 (2)", - "tab": "General information", - "score": 340.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.657 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.955 (2)", - "tab": "Efficiency", - "score": 0.47726698907099085 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Efficiency", - "score": 0.4398383096600255 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.848 (2)", - "tab": "Efficiency", - "score": 0.42376324315969854 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.773 (2)", - "tab": "Efficiency", - "score": 0.3864205361981141 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", 
- "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1113.092, mean=1113.092, max=1113.092, sum=2226.184 (2)", - "tab": "General information", - "score": 1113.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=755.418, mean=755.418, max=755.418, sum=1510.837 (2)", - "tab": "General information", - "score": 755.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1685.119, mean=1685.119, max=1685.119, sum=3370.239 (2)", - "tab": "General information", - "score": 1685.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.363, mean=594.363, max=594.363, sum=1188.725 (2)", - "tab": "General information", - "score": 594.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign 
Policy - Observed inference time (s)": { - "description": "min=0.525, mean=0.525, max=0.525, sum=1.05 (2)", - "tab": "Efficiency", - "score": 0.5247626876831055 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=438.2, mean=438.2, max=438.2, sum=876.4 (2)", - "tab": "General information", - "score": 438.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.763 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.498, mean=0.498, max=0.498, sum=0.995 (2)", - "tab": "Efficiency", - "score": 0.49771531004654734 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=602.421, mean=602.421, max=602.421, sum=1204.842 (2)", - "tab": "General information", - "score": 602.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.608, mean=0.608, max=0.608, sum=1.216 (2)", - "tab": "Efficiency", - "score": 0.608082628250122 - }, - "Business Ethics - # eval": { - "description": 
"min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=552.87, mean=552.87, max=552.87, sum=1105.74 (2)", - "tab": "General information", - "score": 552.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.668 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.419, mean=0.419, max=0.419, sum=0.839 (2)", - "tab": "Efficiency", - "score": 0.41935023991566783 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=402.592, mean=402.592, max=402.592, sum=805.185 (2)", - "tab": "General information", - "score": 402.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.4506680082767568 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual 
Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.213, mean=309.213, max=309.213, sum=618.426 (2)", - "tab": "General information", - "score": 309.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Efficiency", - "score": 0.4601488047632678 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=474.786, mean=474.786, max=474.786, sum=949.572 (2)", - "tab": "General information", - "score": 474.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.754, - "details": { - "description": "min=0.754, mean=0.754, max=0.754, sum=1.508 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.819 (2)", - "tab": "Efficiency", - "score": 0.40957188984704396 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.341, mean=597.341, max=597.341, sum=1194.683 (2)", - "tab": "General information", - "score": 597.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627, - "details": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.254 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.524, mean=0.524, max=0.524, sum=1.047 (2)", - "tab": "Efficiency", - "score": 0.5235741989953178 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.563, mean=619.563, max=619.563, sum=1239.127 (2)", - "tab": "General information", - "score": 619.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=1.814 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.43886603309262184 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.937 (2)", - "tab": "Efficiency", - "score": 0.4683608938320517 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.511, mean=0.511, max=0.511, 
sum=1.022 (2)", - "tab": "Efficiency", - "score": 0.5109630298614501 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.665, mean=0.665, max=0.665, sum=1.33 (2)", - "tab": "Efficiency", - "score": 0.665167844656742 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.863 (2)", - "tab": "Efficiency", - "score": 0.43152768804569436 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.845 (2)", - "tab": "Efficiency", - "score": 0.4224596888290168 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.404, mean=0.404, max=0.404, sum=0.808 (2)", - "tab": "Efficiency", - "score": 0.4038744736940433 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.43474441987496837 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.832 (2)", - "tab": "Efficiency", - "score": 0.4159359881857864 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.985 (2)", - "tab": "Efficiency", - "score": 0.49265997772974685 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.835 (2)", - "tab": "Efficiency", - "score": 0.41751264342490363 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.993 (2)", - "tab": "Efficiency", - "score": 0.49666665218494555 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.606, mean=0.606, max=0.606, sum=1.213 (2)", - "tab": "Efficiency", - "score": 0.6064977821181802 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.469, mean=0.469, max=0.469, sum=0.939 (2)", - "tab": "Efficiency", - "score": 0.46946642569851776 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=500.958, mean=500.958, max=500.958, sum=1001.916 (2)", - "tab": "General information", - "score": 500.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.064039408867 - }, - 
"High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=935.13, mean=935.13, max=935.13, sum=1870.26 (2)", - "tab": "General information", - "score": 935.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.424, mean=2797.424, max=2797.424, sum=5594.848 (2)", - "tab": "General information", - "score": 2797.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=394.773, mean=394.773, max=394.773, sum=789.545 (2)", - "tab": "General information", - "score": 394.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=479.301, mean=479.301, max=479.301, sum=958.601 (2)", - "tab": "General information", - "score": 479.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 
390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.541, mean=396.541, max=396.541, sum=793.082 (2)", - "tab": "General information", - "score": 396.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=566.822, mean=566.822, max=566.822, sum=1133.644 (2)", - "tab": "General information", - "score": 566.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=415.954, mean=415.954, max=415.954, sum=831.908 (2)", - "tab": "General information", - "score": 415.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=591.715, mean=591.715, max=591.715, sum=1183.43 (2)", - "tab": "General information", - "score": 591.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=502.604, 
mean=502.604, max=502.604, sum=1005.207 (2)", - "tab": "General information", - "score": 502.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=858.931, mean=858.931, max=858.931, sum=1717.861 (2)", - "tab": "General information", - "score": 858.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2205.583, mean=2205.583, max=2205.583, sum=4411.167 (2)", - "tab": "General information", - "score": 2205.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1426.544, mean=1426.544, max=1426.544, sum=2853.089 (2)", - "tab": "General information", - "score": 1426.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374, - "details": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.748 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.901 
(2)", - "tab": "Efficiency", - "score": 0.45039264396701695 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.494, mean=0.494, max=0.494, sum=0.989 (2)", - "tab": "Efficiency", - "score": 0.494300215931262 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=321.587, mean=321.587, max=321.587, sum=643.175 (2)", - "tab": "General information", - "score": 321.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=347.183, mean=347.183, max=347.183, sum=694.366 (2)", - "tab": "General information", - "score": 347.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.543, mean=0.543, max=0.543, sum=1.086 (2)", - "tab": "Efficiency", - "score": 0.5427691305964446 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=644.165, mean=644.165, max=644.165, sum=1288.331 (2)", - "tab": "General information", - "score": 644.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.969 (2)", - "tab": "Efficiency", - "score": 0.48451554263296304 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=450.049, mean=450.049, max=450.049, sum=900.098 (2)", - "tab": "General information", - "score": 450.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.571, - "details": { - "description": "min=0.571, mean=0.571, max=0.571, sum=1.143 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.515, mean=0.515, max=0.515, sum=1.029 (2)", - "tab": "Efficiency", - "score": 0.5145284725087029 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.402, mean=702.402, max=702.402, sum=1404.804 (2)", - "tab": "General information", - "score": 702.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - 
"evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.139 (2)", - "tab": "Efficiency", - "score": 0.5696360532519886 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=281.301, mean=281.301, max=281.301, sum=562.602 (2)", - "tab": "General information", - "score": 281.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.876 (2)", - "tab": "Efficiency", - "score": 0.43808113204108345 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.35, mean=428.35, max=428.35, sum=856.701 (2)", - "tab": "General information", - "score": 428.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.514, mean=0.514, max=0.514, sum=1.029 (2)", - "tab": "Efficiency", - "score": 0.514304575920105 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=338.89, mean=338.89, max=338.89, sum=677.78 (2)", - "tab": "General information", - "score": 338.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.773 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.79 (2)", - "tab": "Efficiency", - "score": 0.3951411627870562 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=314.669, mean=314.669, max=314.669, sum=629.338 (2)", - "tab": "General information", - "score": 314.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.637, - "details": { - "description": "min=0.637, mean=0.637, max=0.637, sum=1.274 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.806 (2)", - "tab": "Efficiency", - "score": 0.4028203390646672 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.801 (2)", - "tab": "Efficiency", - "score": 0.4004550709633243 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=495.003, mean=495.003, max=495.003, sum=990.006 (2)", - "tab": "General information", - "score": 495.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=690.542, mean=690.542, max=690.542, sum=1381.084 (2)", - "tab": "General information", - "score": 690.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.641 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Efficiency", - "score": 0.4019969655018227 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=585.48, mean=585.48, max=585.48, sum=1170.961 (2)", - "tab": "General information", - "score": 585.4803921568628 - }, - "Nutrition - # output tokens": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.867, - "details": { - "description": "min=0.867, mean=0.867, max=0.867, sum=1.735 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.807 (2)", - "tab": "Efficiency", - "score": 0.40340044910525097 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=540.198, mean=540.198, max=540.198, sum=1080.395 (2)", - "tab": "General information", - "score": 540.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.764, - "details": { - "description": "min=0.764, mean=0.764, max=0.764, sum=1.527 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.109 (2)", - "tab": "Efficiency", - "score": 0.5543096672404896 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.655, mean=426.655, max=426.655, sum=853.309 (2)", - "tab": "General information", - "score": 426.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.616 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.913 (2)", - "tab": "Efficiency", - "score": 0.45644889948319417 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1193.869, mean=1193.869, max=1193.869, sum=2387.739 (2)", - "tab": "General information", - "score": 1193.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "details": { - "description": "min=0.915, mean=0.915, max=0.915, sum=1.831 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.916 (2)", - "tab": "Efficiency", - "score": 0.4581311152349064 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=456.274, mean=456.274, max=456.274, sum=912.547 (2)", - "tab": "General information", - "score": 456.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.899 (2)", - "tab": "Efficiency", - "score": 0.44963935197117816 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.753, mean=336.753, max=336.753, sum=673.506 (2)", - "tab": "General information", - "score": 336.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.766 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.919 (2)", - "tab": "Efficiency", - "score": 0.45928927890041416 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.164, mean=268.164, max=268.164, sum=536.327 (2)", - "tab": "General information", - "score": 268.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How 
many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json b/data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json deleted file mode 100644 index a87c94c3b..000000000 --- a/data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-002/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Flash 002", - "id": "google/gemini-1.5-flash-002", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.27, mean=0.739, max=0.959, sum=84.201 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.262, mean=0.315, max=0.767, sum=35.937 (114)", - "tab": "Efficiency", - "score": 0.3152340762781926 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.164, mean=632.617, max=2797.424, sum=72118.345 (114)", - "tab": "General information", - "score": 632.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - 
"high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "details": { - "description": "min=0.63, mean=0.63, max=0.63, sum=1.26 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.29806760787963865 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - 
"description": "min=383.97, mean=383.97, max=383.97, sum=767.94 (2)", - "tab": "General information", - "score": 383.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.585 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.583 (2)", - "tab": "Efficiency", - "score": 0.29152930047776965 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.356, mean=344.356, max=344.356, sum=688.711 (2)", - "tab": "General information", - "score": 344.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "description": "min=0.637, mean=0.637, max=0.637, sum=1.275 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.2988364624977112 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.29801897870169747 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.597 (2)", - "tab": "Efficiency", - "score": 0.2985741686820984 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.576 (2)", - "tab": "Efficiency", - "score": 0.28819103717803957 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.586 (2)", - "tab": "Efficiency", - 
"score": 0.29290392495304174 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.582 (2)", - "tab": "Efficiency", - "score": 0.29088794483857994 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=570.02, mean=570.02, max=570.02, sum=1140.04 (2)", - "tab": "General information", - "score": 570.02 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.799, mean=482.799, max=482.799, sum=965.597 (2)", - "tab": "General information", - "score": 482.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=857.86, mean=857.86, max=857.86, sum=1715.72 (2)", - "tab": "General information", - "score": 857.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=626.69, mean=626.69, max=626.69, sum=1253.38 (2)", - "tab": "General information", - "score": 626.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College 
Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=513.37, mean=513.37, max=513.37, sum=1026.74 (2)", - "tab": "General information", - "score": 513.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=507.471, mean=507.471, max=507.471, sum=1014.941 (2)", - "tab": "General information", - "score": 507.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.44 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.2992409729957581 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=380.91, mean=380.91, max=380.91, sum=761.82 (2)", - "tab": "General information", - "score": 380.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.351 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.59 (2)", - "tab": "Efficiency", - "score": 0.295004924138387 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=634.553, mean=634.553, max=634.553, sum=1269.105 (2)", - "tab": "General information", - "score": 634.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47, - "details": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.3007749605178833 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=456.54, mean=456.54, max=456.54, sum=913.08 (2)", - "tab": "General information", - "score": 456.54 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.299, 
mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.2988583313094245 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=407.87, mean=407.87, max=407.87, sum=815.741 (2)", - "tab": "General information", - "score": 407.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.797, - "details": { - "description": "min=0.797, mean=0.797, max=0.797, sum=1.595 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.578 (2)", - "tab": "Efficiency", - "score": 0.2892080227278436 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=340.907, mean=340.907, max=340.907, sum=681.814 (2)", - "tab": "General information", - "score": 340.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.611 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.3027217843953301 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.318, 
mean=0.318, max=0.318, sum=0.636 (2)", - "tab": "Efficiency", - "score": 0.318213385893098 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.34364056462881 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.732 (2)", - "tab": "Efficiency", - "score": 0.3660228084894567 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1113.092, mean=1113.092, max=1113.092, sum=2226.184 (2)", - "tab": "General information", - "score": 1113.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=755.418, mean=755.418, max=755.418, sum=1510.837 (2)", - "tab": "General information", - "score": 755.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1685.119, mean=1685.119, max=1685.119, sum=3370.239 (2)", - "tab": "General information", - "score": 1685.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.363, mean=594.363, max=594.363, sum=1188.725 (2)", - "tab": "General information", - "score": 594.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.582 (2)", - "tab": "Efficiency", - "score": 0.291001398563385 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=438.2, mean=438.2, max=438.2, sum=876.4 (2)", - "tab": "General information", - "score": 438.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.895, - "details": { - "description": "min=0.895, mean=0.895, max=0.895, sum=1.789 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.584 (2)", - "tab": "Efficiency", - "score": 0.2922459558436745 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=602.421, mean=602.421, max=602.421, sum=1204.842 (2)", - "tab": "General information", - "score": 602.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": "Efficiency", - "score": 0.29986772060394284 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=552.87, mean=552.87, max=552.87, sum=1105.74 (2)", - "tab": "General information", - "score": 552.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.792, - "details": { - "description": "min=0.792, mean=0.792, max=0.792, sum=1.585 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.601 (2)", - "tab": "Efficiency", - "score": 0.3003354540411031 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=402.592, mean=402.592, max=402.592, sum=805.185 (2)", - "tab": "General information", - "score": 402.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.28759900458315585 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.213, mean=309.213, max=309.213, sum=618.426 (2)", - "tab": "General information", - "score": 309.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.545 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.2938007436949631 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=474.786, mean=474.786, max=474.786, sum=949.572 (2)", - "tab": "General information", - "score": 474.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.704, mean=0.704, max=0.704, sum=1.407 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.59 (2)", - "tab": "Efficiency", - "score": 0.29476307119641987 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.341, mean=597.341, max=597.341, sum=1194.683 (2)", - "tab": "General information", - "score": 597.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595, - "details": { - "description": "min=0.595, mean=0.595, max=0.595, sum=1.19 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.283, mean=0.283, max=0.283, sum=0.567 (2)", - "tab": "Efficiency", - "score": 0.28335455107310464 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.563, mean=619.563, max=619.563, sum=1239.127 (2)", - "tab": "General information", - "score": 619.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=1.738 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Efficiency", - "score": 0.2898174070542858 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.553 (2)", - "tab": "Efficiency", - "score": 0.27643810704423877 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.579 (2)", - "tab": "Efficiency", - "score": 0.28958702087402344 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.739 (2)", - "tab": "Efficiency", - "score": 0.369471347693241 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.278, mean=0.278, max=0.278, sum=0.556 (2)", - "tab": "Efficiency", - "score": 0.2780994249112678 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.538 (2)", - "tab": "Efficiency", - "score": 0.26881929382759057 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Efficiency", - "score": 0.2700315811695197 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.534 (2)", - "tab": "Efficiency", - "score": 0.2672289636400011 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.262, mean=0.262, max=0.262, sum=0.525 (2)", - "tab": "Efficiency", - "score": 0.2623477593189528 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.583 (2)", - "tab": "Efficiency", - "score": 0.2917157135262395 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.537 (2)", - "tab": "Efficiency", - "score": 0.2685232871169344 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.004 (2)", - "tab": "Efficiency", - "score": 0.5018655392858717 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.873 (2)", - "tab": "Efficiency", - "score": 0.4363996000850902 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.3298424698632478 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - 
"description": "min=500.958, mean=500.958, max=500.958, sum=1001.916 (2)", - "tab": "General information", - "score": 500.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=935.13, mean=935.13, max=935.13, sum=1870.26 (2)", - "tab": "General information", - "score": 935.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.424, mean=2797.424, max=2797.424, sum=5594.848 (2)", - "tab": "General information", - "score": 2797.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=394.773, mean=394.773, max=394.773, sum=789.545 (2)", - "tab": "General information", - "score": 394.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, 
mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=479.301, mean=479.301, max=479.301, sum=958.601 (2)", - "tab": "General information", - "score": 479.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.541, mean=396.541, max=396.541, sum=793.082 (2)", - "tab": "General information", - "score": 396.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=566.822, mean=566.822, max=566.822, sum=1133.644 (2)", - "tab": "General information", - "score": 566.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=415.954, mean=415.954, max=415.954, sum=831.908 (2)", - "tab": "General information", - "score": 415.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=591.715, mean=591.715, max=591.715, sum=1183.43 (2)", - "tab": "General information", - "score": 591.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=502.604, mean=502.604, max=502.604, sum=1005.207 (2)", - "tab": "General information", - "score": 502.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=858.931, mean=858.931, max=858.931, sum=1717.861 (2)", - "tab": "General information", - "score": 858.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2205.583, mean=2205.583, max=2205.583, sum=4411.167 (2)", - "tab": "General information", - "score": 2205.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1426.544, mean=1426.544, max=1426.544, sum=2853.089 (2)", - "tab": "General information", - "score": 1426.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.278, mean=0.278, max=0.278, sum=0.555 (2)", - "tab": "Efficiency", - "score": 0.2775634660849122 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.832 (2)", - "tab": "Efficiency", - "score": 0.41606709793323776 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=321.587, mean=321.587, max=321.587, sum=643.175 (2)", - "tab": "General information", - "score": 321.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=347.183, mean=347.183, max=347.183, sum=694.366 (2)", - "tab": "General information", - "score": 347.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.504 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.268, mean=0.268, max=0.268, sum=0.535 (2)", - 
"tab": "Efficiency", - "score": 0.267673009683278 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=644.165, mean=644.165, max=644.165, sum=1288.331 (2)", - "tab": "General information", - "score": 644.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.718 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.268, mean=0.268, max=0.268, sum=0.535 (2)", - "tab": "Efficiency", - "score": 0.2676804094958159 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=450.049, mean=450.049, max=450.049, sum=900.098 (2)", - "tab": "General information", - "score": 450.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - "details": { - "description": "min=0.616, mean=0.616, max=0.616, sum=1.232 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.539 (2)", - "tab": "Efficiency", - "score": 0.2695028483867645 - }, - "Machine Learning - # eval": { - "description": 
"min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.402, mean=702.402, max=702.402, sum=1404.804 (2)", - "tab": "General information", - "score": 702.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3324842568740104 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=281.301, mean=281.301, max=281.301, sum=562.602 (2)", - "tab": "General information", - "score": 281.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.953, - "details": { - "description": "min=0.953, mean=0.953, max=0.953, sum=1.906 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.767, mean=0.767, max=0.767, sum=1.533 (2)", - "tab": "Efficiency", - "score": 0.7665768270818596 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.35, mean=428.35, max=428.35, sum=856.701 (2)", - "tab": "General information", - "score": 428.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.2972432613372803 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=338.89, mean=338.89, max=338.89, sum=677.78 (2)", - "tab": "General information", - "score": 338.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.849 (2)", - "tab": "Efficiency", - "score": 0.4247035331652996 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - 
"description": "min=314.669, mean=314.669, max=314.669, sum=629.338 (2)", - "tab": "General information", - "score": 314.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.676, - "details": { - "description": "min=0.676, mean=0.676, max=0.676, sum=1.352 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.593 (2)", - "tab": "Efficiency", - "score": 0.2965996671963289 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.593 (2)", - "tab": "Efficiency", - "score": 0.29666628491279134 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=495.003, mean=495.003, max=495.003, sum=990.006 (2)", - "tab": "General information", - "score": 495.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=690.542, mean=690.542, max=690.542, sum=1381.084 (2)", - "tab": "General information", - "score": 690.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.588, - "details": { - "description": "min=0.588, mean=0.588, max=0.588, sum=1.176 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.2876783258774701 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=585.48, mean=585.48, max=585.48, sum=1170.961 (2)", - "tab": "General information", - "score": 585.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.762, mean=0.762, max=0.762, sum=1.525 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": "Efficiency", - "score": 0.3001174411655944 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=540.198, mean=540.198, max=540.198, sum=1080.395 (2)", - "tab": "General information", - "score": 540.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.572 (2)", - "tab": 
"Efficiency", - "score": 0.2860603137449785 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.655, mean=426.655, max=426.655, sum=853.309 (2)", - "tab": "General information", - "score": 426.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547, - "details": { - "description": "min=0.547, mean=0.547, max=0.547, sum=1.094 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.795 (2)", - "tab": "Efficiency", - "score": 0.3977492381115349 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1193.869, mean=1193.869, max=1193.869, sum=2387.739 (2)", - "tab": "General information", - "score": 1193.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.701 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.59 (2)", - "tab": "Efficiency", - "score": 0.29507939969722313 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": 
"General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=456.274, mean=456.274, max=456.274, sum=912.547 (2)", - "tab": "General information", - "score": 456.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "description": "min=0.524, mean=0.524, max=0.524, sum=1.048 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.574 (2)", - "tab": "Efficiency", - "score": 0.28698748852833206 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.753, mean=336.753, max=336.753, sum=673.506 (2)", - "tab": "General information", - "score": 336.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.576 (2)", - "tab": "Efficiency", - "score": 0.2880588832654451 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.164, mean=268.164, max=268.164, sum=536.327 (2)", - "tab": "General information", - "score": 268.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json b/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json deleted file mode 100644 index b8d59d877..000000000 --- a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-preview-0514/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Flash 0514 preview", - "id": "google/gemini-1.5-flash-preview-0514", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.374, mean=0.778, max=0.969, sum=88.647 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.323, mean=0.348, max=0.49, sum=39.671 (114)", - "tab": "Efficiency", - "score": 0.3479928578252291 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.164, mean=632.617, max=2797.424, sum=72118.345 (114)", - "tab": "General information", - 
"score": 632.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.828 (2)", - "tab": "Efficiency", - "score": 0.4139195799827576 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=383.97, mean=383.97, max=383.97, sum=767.94 (2)", - "tab": "General information", - "score": 383.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.615 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.662 (2)", - "tab": "Efficiency", - "score": 0.33077726717348455 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.356, mean=344.356, max=344.356, sum=688.711 (2)", - "tab": "General information", - "score": 344.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - 
"description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.3412753510475159 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.662 (2)", - "tab": "Efficiency", - "score": 0.33089664578437805 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.715 (2)", - "tab": "Efficiency", - "score": 0.35753655195236206 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.688 (2)", - "tab": "Efficiency", - "score": 0.3440544652938843 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.33949112616522464 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.678 (2)", - "tab": "Efficiency", - "score": 0.33893728957456704 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=570.02, mean=570.02, max=570.02, sum=1140.04 (2)", - "tab": "General information", - "score": 570.02 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.799, mean=482.799, max=482.799, sum=965.597 (2)", - "tab": "General information", - "score": 482.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=857.86, mean=857.86, max=857.86, sum=1715.72 (2)", - "tab": "General information", - "score": 857.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 
(2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=626.69, mean=626.69, max=626.69, sum=1253.38 (2)", - "tab": "General information", - "score": 626.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=513.37, mean=513.37, max=513.37, sum=1026.74 (2)", - "tab": "General information", - "score": 513.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=507.471, mean=507.471, max=507.471, sum=1014.941 (2)", - "tab": "General information", - "score": 507.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.657 (2)", - "tab": "Efficiency", - "score": 0.3285136580467224 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - 
}, - "Computer Security - # prompt tokens": { - "description": "min=380.91, mean=380.91, max=380.91, sum=761.82 (2)", - "tab": "General information", - "score": 380.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.281 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.33929300726505746 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=634.553, mean=634.553, max=634.553, sum=1269.105 (2)", - "tab": "General information", - "score": 634.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=1.1 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.65 (2)", - "tab": "Efficiency", - "score": 0.32497448682785035 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=456.54, mean=456.54, max=456.54, sum=913.08 (2)", - "tab": "General information", - "score": 456.54 - }, - "Global Facts - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)", - "tab": "Efficiency", - "score": 0.3270833028687371 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=407.87, mean=407.87, max=407.87, sum=815.741 (2)", - "tab": "General information", - "score": 407.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.704 (2)", - "tab": "Efficiency", - "score": 0.3517766727128596 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=340.907, mean=340.907, max=340.907, sum=681.814 (2)", - "tab": "General information", - "score": 340.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.65 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.707 (2)", - "tab": "Efficiency", - "score": 0.3533606018967294 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.707 (2)", - "tab": "Efficiency", - "score": 0.35356061509315 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.752 (2)", - "tab": "Efficiency", - "score": 0.37605549059613214 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.707 (2)", - "tab": "Efficiency", - "score": 0.3533070875625861 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1113.092, mean=1113.092, max=1113.092, sum=2226.184 (2)", - "tab": "General information", - "score": 1113.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=755.418, mean=755.418, max=755.418, sum=1510.837 (2)", - "tab": "General information", - "score": 755.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1685.119, mean=1685.119, max=1685.119, sum=3370.239 (2)", - 
"tab": "General information", - "score": 1685.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.363, mean=594.363, max=594.363, sum=1188.725 (2)", - "tab": "General information", - "score": 594.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.3394037842750549 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=438.2, mean=438.2, max=438.2, sum=876.4 (2)", - "tab": "General information", - "score": 438.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.737 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": 
"min=0.379, mean=0.379, max=0.379, sum=0.758 (2)", - "tab": "Efficiency", - "score": 0.3787926027649327 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=602.421, mean=602.421, max=602.421, sum=1204.842 (2)", - "tab": "General information", - "score": 602.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.704 (2)", - "tab": "Efficiency", - "score": 0.3517553758621216 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=552.87, mean=552.87, max=552.87, sum=1105.74 (2)", - "tab": "General information", - "score": 552.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=1.675 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.649 (2)", - "tab": "Efficiency", - "score": 0.3246132454782162 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 
(2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=402.592, mean=402.592, max=402.592, sum=805.185 (2)", - "tab": "General information", - "score": 402.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.711 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.32754647579598933 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.213, mean=309.213, max=309.213, sum=618.426 (2)", - "tab": "General information", - "score": 309.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.814, - "details": { - "description": "min=0.814, mean=0.814, max=0.814, sum=1.628 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.656 (2)", - "tab": "Efficiency", - "score": 0.3282040464467016 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - 
"Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=474.786, mean=474.786, max=474.786, sum=949.572 (2)", - "tab": "General information", - "score": 474.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.33972583182905086 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.341, mean=597.341, max=597.341, sum=1194.683 (2)", - "tab": "General information", - "score": 597.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "description": "min=0.611, mean=0.611, max=0.611, sum=1.222 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.34669986982194206 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.563, mean=619.563, max=619.563, sum=1239.127 (2)", - "tab": "General information", - "score": 619.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=1.814 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.362, mean=0.362, max=0.362, sum=0.725 (2)", - "tab": "Efficiency", - "score": 0.36248803600188223 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.3359241544319491 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.709 (2)", - "tab": "Efficiency", - "score": 0.35430107831954955 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Efficiency", - "score": 0.4900842637726755 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.673 (2)", - "tab": "Efficiency", - "score": 0.33633674395204793 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.669 (2)", - "tab": "Efficiency", - "score": 0.3347120445627005 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.661 (2)", - "tab": "Efficiency", - "score": 0.33047562073438597 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.3431409650378757 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.328948572904122 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.3431161413129592 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.728 (2)", - "tab": "Efficiency", - "score": 0.3637816064498004 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.701 (2)", - "tab": "Efficiency", - 
"score": 0.35072638701509545 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.967 (2)", - "tab": "Efficiency", - "score": 0.48351573476604387 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.753 (2)", - "tab": "Efficiency", - "score": 0.3762651908246777 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=500.958, mean=500.958, max=500.958, sum=1001.916 (2)", - "tab": "General information", - "score": 500.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=935.13, mean=935.13, max=935.13, sum=1870.26 (2)", - "tab": "General information", - "score": 935.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.424, mean=2797.424, max=2797.424, sum=5594.848 (2)", - "tab": "General information", - "score": 2797.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=394.773, mean=394.773, max=394.773, sum=789.545 (2)", - "tab": "General information", - "score": 394.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=479.301, mean=479.301, max=479.301, sum=958.601 (2)", - "tab": "General information", - "score": 479.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.541, mean=396.541, max=396.541, sum=793.082 (2)", - "tab": "General information", - "score": 396.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=566.822, mean=566.822, max=566.822, sum=1133.644 (2)", - "tab": "General information", - "score": 566.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=415.954, mean=415.954, max=415.954, sum=831.908 (2)", - "tab": "General information", - "score": 415.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=591.715, mean=591.715, max=591.715, sum=1183.43 (2)", - "tab": "General information", - "score": 591.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=502.604, mean=502.604, max=502.604, sum=1005.207 (2)", - "tab": "General information", - "score": 502.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=858.931, mean=858.931, max=858.931, sum=1717.861 (2)", - "tab": "General information", - "score": 858.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2205.583, mean=2205.583, max=2205.583, sum=4411.167 (2)", - "tab": "General information", - "score": 2205.5833333333335 - }, - "High School US History - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1426.544, mean=1426.544, max=1426.544, sum=2853.089 (2)", - "tab": "General information", - "score": 1426.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374, - "details": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.748 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.3287716788561355 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.32337414208105053 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=321.587, mean=321.587, max=321.587, sum=643.175 (2)", - "tab": "General information", - "score": 321.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=347.183, mean=347.183, max=347.183, sum=694.366 (2)", - "tab": "General information", - "score": 347.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.752 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.698 (2)", - "tab": "Efficiency", - "score": 0.34882096219653924 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=644.165, mean=644.165, max=644.165, sum=1288.331 (2)", - "tab": "General information", - "score": 644.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.32894283277125447 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=450.049, mean=450.049, max=450.049, sum=900.098 (2)", - "tab": "General information", - "score": 450.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.125 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.689 (2)", - "tab": "Efficiency", - "score": 0.3445145934820175 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.402, mean=702.402, max=702.402, sum=1404.804 (2)", - "tab": "General information", - "score": 702.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.709 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.652 (2)", - "tab": "Efficiency", - "score": 0.32611215461805027 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=281.301, mean=281.301, max=281.301, sum=562.602 (2)", - "tab": "General information", - "score": 281.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.936, - "details": { - "description": "min=0.936, mean=0.936, max=0.936, sum=1.872 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.663 (2)", - "tab": "Efficiency", - "score": 0.3313393389057909 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.35, mean=428.35, max=428.35, sum=856.701 (2)", - "tab": "General information", - "score": 428.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.667 (2)", - "tab": "Efficiency", - "score": 0.3336531209945679 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=338.89, mean=338.89, max=338.89, sum=677.78 (2)", - "tab": "General information", - "score": 338.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.768 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.3299713125630814 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=314.669, mean=314.669, max=314.669, sum=629.338 (2)", - "tab": "General information", - "score": 314.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.631, - "details": { - "description": "min=0.631, mean=0.631, max=0.631, sum=1.263 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.33562634716863216 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.694 (2)", - "tab": "Efficiency", - "score": 0.34689992780224144 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=495.003, mean=495.003, max=495.003, sum=990.006 (2)", - "tab": "General information", - "score": 495.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - 
"description": "min=690.542, mean=690.542, max=690.542, sum=1381.084 (2)", - "tab": "General information", - "score": 690.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "details": { - "description": "min=0.801, mean=0.801, max=0.801, sum=1.601 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.695 (2)", - "tab": "Efficiency", - "score": 0.3477346959456899 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=585.48, mean=585.48, max=585.48, sum=1170.961 (2)", - "tab": "General information", - "score": 585.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.867, - "details": { - "description": "min=0.867, mean=0.867, max=0.867, sum=1.735 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.694 (2)", - "tab": "Efficiency", - "score": 0.34701154850147387 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=540.198, mean=540.198, max=540.198, sum=1080.395 (2)", - "tab": "General information", - "score": 540.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=1.545 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.664 (2)", - "tab": "Efficiency", - "score": 0.3317977645180442 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.655, mean=426.655, max=426.655, sum=853.309 (2)", - "tab": "General information", - "score": 426.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.624 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.74 (2)", - "tab": "Efficiency", - "score": 0.3700062508485755 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1193.869, mean=1193.869, max=1193.869, sum=2387.739 (2)", - "tab": "General information", - "score": 1193.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - 
"method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.33022794794680466 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=456.274, mean=456.274, max=456.274, sum=912.547 (2)", - "tab": "General information", - "score": 456.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.3290767310613609 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.753, mean=336.753, max=336.753, sum=673.506 (2)", - "tab": "General information", - "score": 336.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.653 (2)", - "tab": "Efficiency", - "score": 0.3263405735729731 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.164, mean=268.164, max=268.164, sum=536.327 (2)", - "tab": "General information", - "score": 268.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json b/data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json deleted file mode 100644 index 0632aee68..000000000 --- a/data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-001/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Pro 001", - "id": "google/gemini-1.5-pro-001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.827, - "details": { - "description": "min=0.374, mean=0.827, max=0.974, sum=94.288 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.519, mean=0.618, max=0.799, sum=70.445 (114)", - "tab": "Efficiency", - "score": 0.6179386045856378 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.164, mean=632.617, max=2797.424, sum=72118.345 (114)", - "tab": "General information", - "score": 632.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - 
"mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.318 (2)", - "tab": "Efficiency", - "score": 0.6589885497093201 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=383.97, mean=383.97, max=383.97, sum=767.94 (2)", - "tab": "General information", - "score": 383.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.659 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.342 (2)", - "tab": "Efficiency", - "score": 0.6710023721059163 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.356, mean=344.356, 
max=344.356, sum=688.711 (2)", - "tab": "General information", - "score": 344.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.49 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.527 (2)", - "tab": "Efficiency", - "score": 0.7634538197517395 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.622, mean=0.622, max=0.622, sum=1.244 (2)", - "tab": "Efficiency", - "score": 0.6218778673145506 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.664, mean=0.664, max=0.664, sum=1.328 (2)", - "tab": "Efficiency", - "score": 0.6641578316688538 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.389 (2)", - "tab": "Efficiency", - "score": 0.6943222141265869 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.586, mean=0.586, max=0.586, sum=1.172 (2)", - "tab": "Efficiency", - "score": 0.5860298300065057 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.799, mean=0.799, max=0.799, sum=1.597 (2)", - "tab": "Efficiency", - "score": 0.7986945521597769 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=570.02, mean=570.02, max=570.02, sum=1140.04 (2)", - "tab": "General information", - "score": 570.02 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.799, mean=482.799, max=482.799, sum=965.597 (2)", - "tab": "General information", - "score": 482.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=857.86, mean=857.86, max=857.86, sum=1715.72 (2)", - "tab": "General information", - "score": 857.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=626.69, mean=626.69, max=626.69, sum=1253.38 (2)", - "tab": "General information", - "score": 626.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=513.37, mean=513.37, max=513.37, sum=1026.74 (2)", - "tab": "General information", - "score": 513.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=507.471, mean=507.471, max=507.471, sum=1014.941 (2)", - "tab": "General information", - "score": 507.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": 
{ - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Efficiency", - "score": 0.7018922233581543 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=380.91, mean=380.91, max=380.91, sum=761.82 (2)", - "tab": "General information", - "score": 380.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.728, mean=0.728, max=0.728, sum=1.456 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", - "tab": "Efficiency", - "score": 0.6497656546140972 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=634.553, mean=634.553, max=634.553, sum=1269.105 (2)", - "tab": "General information", - "score": 634.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.34 (2)", - "tab": "Efficiency", - "score": 0.6698257994651794 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=456.54, mean=456.54, max=456.54, sum=913.08 (2)", - "tab": "General information", - "score": 456.54 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=1.248 (2)", - "tab": "Efficiency", - "score": 0.6239932885876408 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=407.87, mean=407.87, max=407.87, sum=815.741 (2)", - "tab": "General information", - "score": 407.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - 
"tab": "Efficiency", - "score": 0.5198829174041748 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=340.907, mean=340.907, max=340.907, sum=681.814 (2)", - "tab": "General information", - "score": 340.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.788 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.601, mean=0.601, max=0.601, sum=1.202 (2)", - "tab": "Efficiency", - "score": 0.6008452876467546 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.079 (2)", - "tab": "Efficiency", - "score": 0.5394198826864256 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.564, mean=0.564, max=0.564, sum=1.128 (2)", - "tab": "Efficiency", - "score": 0.5641645779784438 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.544, mean=0.544, max=0.544, sum=1.088 (2)", - "tab": "Efficiency", - "score": 0.5440043469792918 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1113.092, mean=1113.092, max=1113.092, sum=2226.184 (2)", - "tab": "General information", - "score": 1113.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional 
Accounting - # prompt tokens": { - "description": "min=755.418, mean=755.418, max=755.418, sum=1510.837 (2)", - "tab": "General information", - "score": 755.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1685.119, mean=1685.119, max=1685.119, sum=3370.239 (2)", - "tab": "General information", - "score": 1685.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.363, mean=594.363, max=594.363, sum=1188.725 (2)", - "tab": "General information", - "score": 594.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.677, mean=0.677, max=0.677, sum=1.354 (2)", - "tab": "Efficiency", - "score": 0.6769772005081177 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=438.2, mean=438.2, max=438.2, sum=876.4 (2)", - "tab": "General information", - "score": 438.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.829 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.298 (2)", - "tab": "Efficiency", - "score": 0.6491834003674356 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=602.421, mean=602.421, max=602.421, sum=1204.842 (2)", - "tab": "General information", - "score": 602.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.394 (2)", - "tab": "Efficiency", - "score": 0.697232437133789 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=552.87, mean=552.87, max=552.87, sum=1105.74 (2)", - "tab": "General information", - "score": 552.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.091 (2)", - "tab": "Efficiency", - "score": 0.545333849708989 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=402.592, mean=402.592, max=402.592, sum=805.185 (2)", - "tab": "General information", - "score": 402.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.898 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.573, mean=0.573, max=0.573, sum=1.146 (2)", - "tab": "Efficiency", - "score": 0.5729408700415428 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.213, mean=309.213, max=309.213, sum=618.426 (2)", - "tab": "General information", - "score": 309.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", 
- "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.49 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.622, mean=0.622, max=0.622, sum=1.244 (2)", - "tab": "Efficiency", - "score": 0.6219884050303492 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=474.786, mean=474.786, max=474.786, sum=949.572 (2)", - "tab": "General information", - "score": 474.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.939, - "details": { - "description": "min=0.939, mean=0.939, max=0.939, sum=1.878 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.097 (2)", - "tab": "Efficiency", - "score": 0.5484477596938926 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.341, mean=597.341, max=597.341, sum=1194.683 (2)", - "tab": "General information", - "score": 597.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.413 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.668, mean=0.668, max=0.668, sum=1.336 (2)", - "tab": "Efficiency", - "score": 0.6678630435277545 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.563, mean=619.563, max=619.563, sum=1239.127 (2)", - "tab": "General information", - "score": 619.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.848 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.55, mean=0.55, max=0.55, sum=1.1 (2)", - "tab": "Efficiency", - "score": 0.5502124647940358 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.156 (2)", - "tab": "Efficiency", - "score": 0.5780763097584541 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Efficiency", - "score": 0.6602028679847717 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.775, mean=0.775, max=0.775, sum=1.55 (2)", - "tab": "Efficiency", - "score": 0.7751016385627515 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.571, mean=0.571, max=0.571, sum=1.141 (2)", - "tab": "Efficiency", - "score": 0.5705801778369479 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.582, mean=0.582, max=0.582, sum=1.163 (2)", - "tab": "Efficiency", - "score": 0.5816669402344857 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.081 (2)", - "tab": "Efficiency", - "score": 0.5402819168873322 - }, - "High School 
Mathematics - Observed inference time (s)": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.168 (2)", - "tab": "Efficiency", - "score": 0.5841257324925175 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.556, mean=0.556, max=0.556, sum=1.113 (2)", - "tab": "Efficiency", - "score": 0.556499927985568 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.632, mean=0.632, max=0.632, sum=1.264 (2)", - "tab": "Efficiency", - "score": 0.6318649550936869 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.08 (2)", - "tab": "Efficiency", - "score": 0.5397529965814423 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.603, mean=0.603, max=0.603, sum=1.205 (2)", - "tab": "Efficiency", - "score": 0.6027307720096023 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.762, mean=0.762, max=0.762, sum=1.524 (2)", - "tab": "Efficiency", - "score": 0.7618554059196921 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.626, mean=0.626, max=0.626, sum=1.252 (2)", - "tab": "Efficiency", - "score": 0.6258294099493872 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=500.958, mean=500.958, max=500.958, sum=1001.916 (2)", - "tab": "General information", - "score": 500.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=935.13, mean=935.13, max=935.13, sum=1870.26 (2)", - "tab": "General information", - "score": 935.13 - }, - "High School Computer Science - # output tokens": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.424, mean=2797.424, max=2797.424, sum=5594.848 (2)", - "tab": "General information", - "score": 2797.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=394.773, mean=394.773, max=394.773, sum=789.545 (2)", - "tab": "General information", - "score": 394.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=479.301, mean=479.301, max=479.301, sum=958.601 (2)", - "tab": "General information", - "score": 479.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.541, mean=396.541, max=396.541, sum=793.082 (2)", - "tab": "General information", - "score": 396.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=566.822, mean=566.822, max=566.822, sum=1133.644 (2)", - "tab": "General information", - "score": 566.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=415.954, mean=415.954, max=415.954, sum=831.908 (2)", - "tab": "General information", - "score": 415.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=591.715, mean=591.715, max=591.715, sum=1183.43 (2)", - "tab": "General information", - "score": 591.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=502.604, mean=502.604, max=502.604, sum=1005.207 (2)", - "tab": "General information", - "score": 502.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=858.931, mean=858.931, max=858.931, sum=1717.861 (2)", - "tab": "General 
information", - "score": 858.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2205.583, mean=2205.583, max=2205.583, sum=4411.167 (2)", - "tab": "General information", - "score": 2205.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1426.544, mean=1426.544, max=1426.544, sum=2853.089 (2)", - "tab": "General information", - "score": 1426.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374, - "details": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.748 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.563, mean=0.563, max=0.563, sum=1.127 (2)", - "tab": "Efficiency", - "score": 0.5634646939589838 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.616, mean=0.616, max=0.616, sum=1.231 (2)", - "tab": "Efficiency", - "score": 0.6156448550143484 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=321.587, mean=321.587, max=321.587, sum=643.175 (2)", - "tab": "General information", - "score": 321.58744394618833 - }, - "Human Aging - # 
output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=347.183, mean=347.183, max=347.183, sum=694.366 (2)", - "tab": "General information", - "score": 347.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.346 (2)", - "tab": "Efficiency", - "score": 0.672865920815586 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=644.165, mean=644.165, max=644.165, sum=1288.331 (2)", - "tab": "General information", - "score": 644.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.896, - "details": { - "description": "min=0.896, mean=0.896, max=0.896, sum=1.791 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.616, mean=0.616, max=0.616, sum=1.233 (2)", - "tab": "Efficiency", - "score": 0.6164792593271454 - 
}, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=450.049, mean=450.049, max=450.049, sum=900.098 (2)", - "tab": "General information", - "score": 450.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.652, mean=0.652, max=0.652, sum=1.304 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.638, mean=0.638, max=0.638, sum=1.276 (2)", - "tab": "Efficiency", - "score": 0.6377767409597125 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.402, mean=702.402, max=702.402, sum=1404.804 (2)", - "tab": "General information", - "score": 702.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=1.845 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.669, mean=0.669, max=0.669, sum=1.338 (2)", - "tab": "Efficiency", - "score": 0.6690320089025404 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 
103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=281.301, mean=281.301, max=281.301, sum=562.602 (2)", - "tab": "General information", - "score": 281.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.863 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.107 (2)", - "tab": "Efficiency", - "score": 0.5537131362491183 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.35, mean=428.35, max=428.35, sum=856.701 (2)", - "tab": "General information", - "score": 428.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.678, mean=0.678, max=0.678, sum=1.356 (2)", - "tab": "Efficiency", - "score": 0.678006865978241 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=338.89, mean=338.89, max=338.89, sum=677.78 (2)", - "tab": "General information", - "score": 338.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.958, - "details": { - "description": "min=0.958, mean=0.958, max=0.958, sum=1.916 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.038 (2)", - "tab": "Efficiency", - "score": 0.519028120113972 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=314.669, mean=314.669, max=314.669, sum=629.338 (2)", - "tab": "General information", - "score": 314.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.739, mean=0.739, max=0.739, sum=1.477 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.546, mean=0.546, max=0.546, sum=1.092 (2)", - "tab": "Efficiency", - "score": 0.5461560525755952 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", - "tab": "Efficiency", - "score": 0.5358252359053416 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=495.003, mean=495.003, max=495.003, sum=990.006 (2)", - "tab": "General information", - "score": 495.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=690.542, mean=690.542, max=690.542, sum=1381.084 (2)", - "tab": "General information", - "score": 690.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.879, - "details": { - "description": "min=0.879, mean=0.879, max=0.879, sum=1.758 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.569, mean=0.569, max=0.569, sum=1.139 (2)", - "tab": "Efficiency", - "score": 0.5694240697848252 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=585.48, mean=585.48, max=585.48, sum=1170.961 (2)", - "tab": "General information", - "score": 585.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Prehistory - 
Observed inference time (s)": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.141 (2)", - "tab": "Efficiency", - "score": 0.5704048761615047 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=540.198, mean=540.198, max=540.198, sum=1080.395 (2)", - "tab": "General information", - "score": 540.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.818, - "details": { - "description": "min=0.818, mean=0.818, max=0.818, sum=1.636 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.403 (2)", - "tab": "Efficiency", - "score": 0.7017486507242376 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.655, mean=426.655, max=426.655, sum=853.309 (2)", - "tab": "General information", - "score": 426.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.873, - "details": { - "description": "min=0.873, mean=0.873, max=0.873, sum=1.747 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Efficiency", - "score": 0.6002200584022366 - }, - "Security Studies 
- # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1193.869, mean=1193.869, max=1193.869, sum=2387.739 (2)", - "tab": "General information", - "score": 1193.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.841 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.603, mean=0.603, max=0.603, sum=1.206 (2)", - "tab": "Efficiency", - "score": 0.6029752119263606 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=456.274, mean=456.274, max=456.274, sum=912.547 (2)", - "tab": "General information", - "score": 456.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.108 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.181 (2)", - "tab": "Efficiency", - "score": 0.5903763368905309 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.753, mean=336.753, max=336.753, sum=673.506 (2)", - "tab": "General information", - "score": 336.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.708 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.568, mean=0.568, max=0.568, sum=1.137 (2)", - "tab": "Efficiency", - "score": 0.5682888700250994 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.164, mean=268.164, max=268.164, sum=536.327 (2)", - "tab": "General information", - "score": 268.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json b/data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json deleted file mode 100644 index d6a3ba87a..000000000 --- a/data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-002/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { 
- "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Pro 002", - "id": "google/gemini-1.5-pro-002", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.566, mean=0.869, max=0.99, sum=99.042 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.42, mean=0.696, max=1.671, sum=79.296 (114)", - "tab": "Efficiency", - "score": 0.695582110070124 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.164, mean=632.617, max=2797.424, sum=72118.345 (114)", - "tab": "General information", - "score": 632.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - 
"mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=1.671, mean=1.671, max=1.671, sum=3.341 (2)", - "tab": "Efficiency", - "score": 1.6706047868728637 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=383.97, mean=383.97, max=383.97, sum=767.94 (2)", - "tab": "General information", - "score": 383.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.659 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.653, mean=0.653, max=0.653, sum=1.306 (2)", - "tab": "Efficiency", - "score": 0.652814730891475 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.356, mean=344.356, max=344.356, sum=688.711 (2)", - "tab": "General information", - "score": 344.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=1.725 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=1.16, mean=1.16, max=1.16, sum=2.319 (2)", - "tab": "Efficiency", - "score": 1.1597088170051575 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.511, mean=0.511, max=0.511, sum=1.022 (2)", - "tab": "Efficiency", - "score": 0.5110265033112632 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)", - "tab": "Efficiency", - "score": 0.8800347399711609 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.955 (2)", - "tab": "Efficiency", - "score": 0.477603075504303 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Efficiency", - "score": 0.8651723158841877 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.593, mean=0.593, max=0.593, sum=1.186 (2)", - "tab": "Efficiency", - "score": 0.5927850522247016 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=570.02, mean=570.02, max=570.02, sum=1140.04 (2)", - "tab": "General information", - "score": 570.02 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.799, mean=482.799, max=482.799, sum=965.597 (2)", - "tab": "General information", - "score": 482.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=857.86, mean=857.86, max=857.86, sum=1715.72 (2)", - "tab": "General information", - "score": 857.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=626.69, mean=626.69, max=626.69, sum=1253.38 (2)", - "tab": "General information", - "score": 626.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=513.37, mean=513.37, max=513.37, sum=1026.74 (2)", - "tab": "General information", - "score": 513.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt 
tokens": { - "description": "min=507.471, mean=507.471, max=507.471, sum=1014.941 (2)", - "tab": "General information", - "score": 507.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.905 (2)", - "tab": "Efficiency", - "score": 0.45262243270874025 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=380.91, mean=380.91, max=380.91, sum=761.82 (2)", - "tab": "General information", - "score": 380.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.386 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=1.068, mean=1.068, max=1.068, sum=2.135 (2)", - "tab": "Efficiency", - "score": 1.067676763785513 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=634.553, mean=634.553, max=634.553, sum=1269.105 (2)", - "tab": "General information", - "score": 634.5526315789474 - }, - 
"Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.792, mean=0.792, max=0.792, sum=1.584 (2)", - "tab": "Efficiency", - "score": 0.7918326926231384 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=456.54, mean=456.54, max=456.54, sum=913.08 (2)", - "tab": "General information", - "score": 456.54 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Efficiency", - "score": 0.7597615586386787 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=407.87, mean=407.87, max=407.87, sum=815.741 (2)", - "tab": "General information", - "score": 407.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.887, - "details": { - "description": "min=0.887, mean=0.887, max=0.887, sum=1.775 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.907 (2)", - "tab": "Efficiency", - "score": 0.45336360793405023 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=340.907, mean=340.907, max=340.907, sum=681.814 (2)", - "tab": "General information", - "score": 340.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.912, - "details": { - "description": "min=0.912, mean=0.912, max=0.912, sum=1.824 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.609 (2)", - "tab": "Efficiency", - "score": 0.8043198874768089 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.738, mean=0.738, max=0.738, sum=1.476 (2)", - "tab": "Efficiency", - "score": 0.7378175072636165 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.546, mean=0.546, max=0.546, sum=1.091 (2)", - "tab": "Efficiency", - "score": 0.5455011718431694 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Efficiency", - "score": 0.47001955400105394 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"Professional Medicine - # prompt tokens": { - "description": "min=1113.092, mean=1113.092, max=1113.092, sum=2226.184 (2)", - "tab": "General information", - "score": 1113.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=755.418, mean=755.418, max=755.418, sum=1510.837 (2)", - "tab": "General information", - "score": 755.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1685.119, mean=1685.119, max=1685.119, sum=3370.239 (2)", - "tab": "General information", - "score": 1685.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.363, mean=594.363, max=594.363, sum=1188.725 (2)", - "tab": "General information", - "score": 594.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": 
"min=0.533, mean=0.533, max=0.533, sum=1.065 (2)", - "tab": "Efficiency", - "score": 0.5325308299064636 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=438.2, mean=438.2, max=438.2, sum=876.4 (2)", - "tab": "General information", - "score": 438.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=1.036, mean=1.036, max=1.036, sum=2.071 (2)", - "tab": "Efficiency", - "score": 1.03554652239147 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=602.421, mean=602.421, max=602.421, sum=1204.842 (2)", - "tab": "General information", - "score": 602.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.112, mean=1.112, max=1.112, sum=2.223 (2)", - "tab": "Efficiency", - "score": 1.1116365933418273 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General 
information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=552.87, mean=552.87, max=552.87, sum=1105.74 (2)", - "tab": "General information", - "score": 552.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.906, - "details": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.811 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.469, mean=0.469, max=0.469, sum=0.937 (2)", - "tab": "Efficiency", - "score": 0.4685829783385655 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=402.592, mean=402.592, max=402.592, sum=805.185 (2)", - "tab": "General information", - "score": 402.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "details": { - "description": "min=0.945, mean=0.945, max=0.945, sum=1.889 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.317 (2)", - "tab": "Efficiency", - "score": 0.6586567797559373 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.213, mean=309.213, max=309.213, sum=618.426 (2)", - "tab": "General information", - "score": 309.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.71 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.948 (2)", - "tab": "Efficiency", - "score": 0.4739974646732725 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=474.786, mean=474.786, max=474.786, sum=949.572 (2)", - "tab": "General information", - "score": 474.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.884 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", - "tab": "Efficiency", - "score": 0.5800282936247568 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.341, mean=597.341, max=597.341, sum=1194.683 (2)", - "tab": "General information", - "score": 597.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.754, - "details": { - "description": "min=0.754, mean=0.754, max=0.754, sum=1.508 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)", - "tab": "Efficiency", - "score": 0.9259536947522845 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.563, mean=619.563, max=619.563, sum=1239.127 (2)", - "tab": "General information", - "score": 619.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.937, - "details": { - "description": "min=0.937, mean=0.937, max=0.937, sum=1.873 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.916 (2)", - "tab": "Efficiency", - "score": 0.4579133049134285 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.587, mean=0.587, max=0.587, sum=1.175 (2)", - "tab": "Efficiency", - "score": 0.5872501540066574 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.987 (2)", - "tab": "Efficiency", - "score": 
0.49327227354049685 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Efficiency", - "score": 0.8402222113175826 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=1.349 (2)", - "tab": "Efficiency", - "score": 0.6743082650984177 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.494, mean=0.494, max=0.494, sum=0.988 (2)", - "tab": "Efficiency", - "score": 0.4939905238275083 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.501 (2)", - "tab": "Efficiency", - "score": 0.750414514541626 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Efficiency", - "score": 0.8088616865652579 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.423 (2)", - "tab": "Efficiency", - "score": 0.711490568994474 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.832, mean=0.832, max=0.832, sum=1.664 (2)", - "tab": "Efficiency", - "score": 0.8320141549141992 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.654, mean=0.654, max=0.654, sum=1.309 (2)", - "tab": "Efficiency", - "score": 0.6543280317149031 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.377 (2)", - "tab": "Efficiency", - "score": 0.6883480460555466 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.695 (2)", - "tab": "Efficiency", - "score": 0.8477429151535034 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.641, mean=0.641, max=0.641, sum=1.282 (2)", - "tab": "Efficiency", - "score": 0.6409383886474095 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=500.958, mean=500.958, max=500.958, sum=1001.916 (2)", - "tab": "General information", - "score": 500.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=935.13, mean=935.13, max=935.13, sum=1870.26 (2)", - "tab": "General information", - "score": 935.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.424, mean=2797.424, max=2797.424, sum=5594.848 (2)", - "tab": "General information", - "score": 2797.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=394.773, mean=394.773, max=394.773, sum=789.545 (2)", - "tab": "General information", - "score": 394.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=479.301, mean=479.301, max=479.301, sum=958.601 (2)", - "tab": "General information", - "score": 479.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.541, mean=396.541, max=396.541, sum=793.082 (2)", - "tab": "General information", - "score": 396.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=566.822, mean=566.822, max=566.822, sum=1133.644 (2)", - "tab": "General information", - "score": 566.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=415.954, mean=415.954, max=415.954, sum=831.908 (2)", - "tab": "General information", - "score": 415.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=591.715, mean=591.715, max=591.715, sum=1183.43 (2)", - "tab": "General information", - "score": 591.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=502.604, mean=502.604, max=502.604, sum=1005.207 (2)", - "tab": 
"General information", - "score": 502.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=858.931, mean=858.931, max=858.931, sum=1717.861 (2)", - "tab": "General information", - "score": 858.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2205.583, mean=2205.583, max=2205.583, sum=4411.167 (2)", - "tab": "General information", - "score": 2205.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1426.544, mean=1426.544, max=1426.544, sum=2853.089 (2)", - "tab": "General information", - "score": 1426.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.756 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.651 (2)", - "tab": "Efficiency", - "score": 
0.8252711541984113 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.378 (2)", - "tab": "Efficiency", - "score": 0.689175573014121 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=321.587, mean=321.587, max=321.587, sum=643.175 (2)", - "tab": "General information", - "score": 321.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=347.183, mean=347.183, max=347.183, sum=694.366 (2)", - "tab": "General information", - "score": 347.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.544, mean=0.544, max=0.544, sum=1.089 (2)", - "tab": "Efficiency", - "score": 0.5443926212216211 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=644.165, mean=644.165, max=644.165, sum=1288.331 (2)", - "tab": "General information", - "score": 644.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.804 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.412 (2)", - "tab": "Efficiency", - "score": 0.7058728443332977 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=450.049, mean=450.049, max=450.049, sum=900.098 (2)", - "tab": "General information", - "score": 450.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.661 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Efficiency", - "score": 0.47608799380915506 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.402, mean=702.402, max=702.402, sum=1404.804 (2)", - "tab": "General information", - "score": 702.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": "Efficiency", - "score": 0.5099537488326286 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=281.301, mean=281.301, max=281.301, sum=562.602 (2)", - "tab": "General information", - "score": 281.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=1.923 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.843 (2)", - "tab": "Efficiency", - "score": 0.42154710415082103 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.35, mean=428.35, max=428.35, sum=856.701 (2)", - "tab": "General information", - "score": 428.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical 
Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.321 (2)", - "tab": "Efficiency", - "score": 0.6604956579208374 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=338.89, mean=338.89, max=338.89, sum=677.78 (2)", - "tab": "General information", - "score": 338.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.959, - "details": { - "description": "min=0.959, mean=0.959, max=0.959, sum=1.918 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.564, mean=0.564, max=0.564, sum=1.128 (2)", - "tab": "Efficiency", - "score": 0.5638943230055301 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=314.669, mean=314.669, max=314.669, sum=629.338 (2)", - "tab": "General information", - "score": 314.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.792, - "details": { - 
"description": "min=0.792, mean=0.792, max=0.792, sum=1.584 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=1.245, mean=1.245, max=1.245, sum=2.49 (2)", - "tab": "Efficiency", - "score": 1.244819999430221 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=1.526, mean=1.526, max=1.526, sum=3.052 (2)", - "tab": "Efficiency", - "score": 1.5260936177642652 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=495.003, mean=495.003, max=495.003, sum=990.006 (2)", - "tab": "General information", - "score": 495.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=690.542, mean=690.542, max=690.542, sum=1381.084 (2)", - "tab": "General information", - "score": 690.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.771 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.629, mean=0.629, max=0.629, sum=1.259 (2)", - "tab": "Efficiency", - "score": 0.6292609475017373 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=585.48, mean=585.48, max=585.48, sum=1170.961 (2)", - "tab": "General information", - "score": 585.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.08 (2)", - "tab": "Efficiency", - "score": 0.5400909362015901 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=540.198, mean=540.198, max=540.198, sum=1080.395 (2)", - "tab": "General information", - "score": 540.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.884 (2)", - "tab": "Efficiency", - "score": 0.4420530059120872 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.655, mean=426.655, max=426.655, sum=853.309 (2)", - "tab": "General information", - "score": 426.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.714 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.886 (2)", - "tab": "Efficiency", - "score": 0.44290724871109943 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1193.869, mean=1193.869, max=1193.869, sum=2387.739 (2)", - "tab": "General information", - "score": 1193.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.4202856958208986 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=456.274, mean=456.274, max=456.274, sum=912.547 (2)", - "tab": "General information", - "score": 456.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.849 (2)", - "tab": "Efficiency", - "score": 0.4245123575968915 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.753, mean=336.753, max=336.753, sum=673.506 (2)", - "tab": "General information", - "score": 336.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Efficiency", - "score": 0.4207720505563836 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.164, mean=268.164, max=268.164, sum=536.327 (2)", - "tab": "General information", - "score": 268.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model 
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.334,
-        "details": {
-          "tab": "Efficiency"
-        }
-      },
-      "generation_config": {
-        "additional_details": {}
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json b/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json
deleted file mode 100644
index de3a77c03..000000000
--- a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-preview-0409/1770835937.459157",
-  "retrieved_timestamp": "1770835937.459157",
-  "source_metadata": {
-    "source_name": "helm_mmlu",
-    "source_type": "documentation",
-    "source_organization_name": "crfm",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Gemini 1.5 Pro 0409 preview",
-    "id": "google/gemini-1.5-pro-preview-0409",
-    "developer": "google",
-    "inference_platform": "unknown"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "MMLU All Subjects",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on MMLU All Subjects",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.81,
-        "details": {
-          "description": "min=0.397, mean=0.81, max=0.979, sum=92.284 (114)",
-          "tab": "Accuracy",
-          "MMLU All Subjects - Observed inference time (s)": {
-            "description": "min=0.877, mean=1.174, max=3.173, sum=133.815 (114)",
-            "tab": "Efficiency",
-            "score": 1.1738183835156866
-          },
-          "MMLU All Subjects - # eval": {
-            "description": "min=100, mean=246.351, max=1534, sum=28084 (114)",
-            "tab": "General information",
-            "score": 246.35087719298247
-          },
-          "MMLU All Subjects - # train": {
-            "description": "min=5, mean=5, max=5, sum=570 (114)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "MMLU All Subjects - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (114)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "MMLU All Subjects - # prompt tokens": {
-            "description": "min=268.164, mean=632.617, max=2797.424, sum=72118.345 (114)",
-            "tab": "General information",
-            "score": 632.6170571214202
-          },
-          "MMLU All Subjects - # output tokens": {
-            "description": "min=0, mean=0, max=0, sum=0 (114)",
-            "tab": "General information",
-            "score": 0.0
-          }
-        }
-      },
-      "generation_config": {
-        "additional_details": {
-          "subject": [
-            "abstract_algebra",
-            "anatomy",
-            "astronomy",
-            "business_ethics",
-            "clinical_knowledge",
-            "college_biology",
-            "college_chemistry",
-            "college_computer_science",
-            "college_mathematics",
-            "college_medicine",
-            "college_physics",
-            "computer_security",
-            "conceptual_physics",
-            "econometrics",
-            "electrical_engineering",
-            "elementary_mathematics",
-            "formal_logic",
-            "global_facts",
-            "high_school_biology",
-            "high_school_chemistry",
-            "high_school_computer_science",
-            "high_school_european_history",
-            "high_school_geography",
-            "high_school_government_and_politics",
-            "high_school_macroeconomics",
-            "high_school_mathematics",
-            "high_school_microeconomics",
"high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=1.767, mean=1.767, max=1.767, sum=3.533 (2)", - "tab": "Efficiency", - "score": 1.7665750813484191 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt 
tokens": { - "description": "min=383.97, mean=383.97, max=383.97, sum=767.94 (2)", - "tab": "General information", - "score": 383.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.541 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=3.173, mean=3.173, max=3.173, sum=6.346 (2)", - "tab": "Efficiency", - "score": 3.1730875386132134 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.356, mean=344.356, max=344.356, sum=688.711 (2)", - "tab": "General information", - "score": 344.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=1.054, mean=1.054, max=1.054, sum=2.107 (2)", - "tab": "Efficiency", - "score": 1.053539514541626 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.985, mean=0.985, max=0.985, sum=1.971 (2)", - "tab": "Efficiency", - "score": 0.9854124503003227 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.301, mean=1.301, max=1.301, sum=2.603 (2)", - "tab": "Efficiency", - "score": 1.3013164806365967 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=1.187, mean=1.187, max=1.187, sum=2.375 (2)", - "tab": "Efficiency", - "score": 1.1873565983772278 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=1.149, mean=1.149, max=1.149, sum=2.298 (2)", - "tab": 
"Efficiency", - "score": 1.1490558723493807 - }, - "College Physics - Observed inference time (s)": { - "description": "min=1.017, mean=1.017, max=1.017, sum=2.034 (2)", - "tab": "Efficiency", - "score": 1.0169454929875392 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=570.02, mean=570.02, max=570.02, sum=1140.04 (2)", - "tab": "General information", - "score": 570.02 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.799, mean=482.799, max=482.799, sum=965.597 (2)", - "tab": "General information", - "score": 482.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=857.86, mean=857.86, max=857.86, sum=1715.72 (2)", - "tab": "General information", - "score": 857.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=626.69, mean=626.69, max=626.69, sum=1253.38 (2)", - "tab": "General information", - "score": 626.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=513.37, mean=513.37, max=513.37, sum=1026.74 (2)", - "tab": "General information", - "score": 513.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=507.471, mean=507.471, max=507.471, sum=1014.941 (2)", - "tab": "General information", - "score": 507.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.26, mean=1.26, max=1.26, sum=2.52 (2)", - "tab": "Efficiency", - "score": 1.2601169872283935 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=380.91, mean=380.91, max=380.91, sum=761.82 (2)", - "tab": "General information", - "score": 380.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.737, - "details": { - "description": "min=0.737, mean=0.737, max=0.737, sum=1.474 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.968, mean=0.968, max=0.968, sum=1.936 (2)", - "tab": "Efficiency", - "score": 0.9679407843372279 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=634.553, mean=634.553, max=634.553, sum=1269.105 (2)", - "tab": "General information", - "score": 634.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=1.066, mean=1.066, max=1.066, sum=2.132 (2)", - "tab": "Efficiency", - "score": 1.065871012210846 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=456.54, mean=456.54, max=456.54, sum=913.08 (2)", - "tab": "General information", - "score": 456.54 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=1.079, 
-            "tab": "Efficiency",
-            "score": 1.0785565420433327
-          },
-          "Jurisprudence - # eval": {
-            "description": "min=108, mean=108, max=108, sum=216 (2)",
-            "tab": "General information",
-            "score": 108.0
-          },
-          "Jurisprudence - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Jurisprudence - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Jurisprudence - # prompt tokens": {
-            "description": "min=407.87, mean=407.87, max=407.87, sum=815.741 (2)",
-            "tab": "General information",
-            "score": 407.8703703703704
-          },
-          "Jurisprudence - # output tokens": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          }
-        }
-      },
-      "generation_config": {
-        "additional_details": {
-          "subject": "jurisprudence",
-          "method": "multiple_choice_joint",
-          "eval_split": "test",
-          "groups": "mmlu_jurisprudence"
-        }
-      }
-    },
-    {
-      "evaluation_name": "Philosophy",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on Philosophy",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.846,
-        "details": {
-          "description": "min=0.846, mean=0.846, max=0.846, sum=1.691 (2)",
-          "tab": "Accuracy",
-          "Philosophy - Observed inference time (s)": {
-            "description": "min=1.057, mean=1.057, max=1.057, sum=2.114 (2)",
-            "tab": "Efficiency",
-            "score": 1.0571237967328626
-          },
-          "Philosophy - # eval": {
-            "description": "min=311, mean=311, max=311, sum=622 (2)",
-            "tab": "General information",
-            "score": 311.0
-          },
-          "Philosophy - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Philosophy - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Philosophy - # prompt tokens": {
-            "description": "min=340.907, mean=340.907, max=340.907, sum=681.814 (2)",
-            "tab": "General information",
-            "score": 340.90675241157555
-          },
-          "Philosophy - # output tokens": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          }
-        }
-      },
-      "generation_config": {
-        "additional_details": {
-          "subject": "philosophy",
-          "method": "multiple_choice_joint",
-          "eval_split": "test",
-          "groups": "mmlu_philosophy"
-        }
-      }
-    },
-    {
-      "evaluation_name": "Professional Psychology",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on Professional Psychology",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.866,
-        "details": {
-          "description": "min=0.866, mean=0.866, max=0.866, sum=1.732 (2)",
-          "tab": "Accuracy",
-          "Professional Medicine - Observed inference time (s)": {
-            "description": "min=1.258, mean=1.258, max=1.258, sum=2.516 (2)",
-            "tab": "Efficiency",
-            "score": 1.2578288101182213
-          },
-          "Professional Accounting - Observed inference time (s)": {
-            "description": "min=1.179, mean=1.179, max=1.179, sum=2.359 (2)",
-            "tab": "Efficiency",
-            "score": 1.1793269350173625
-          },
-          "Professional Law - Observed inference time (s)": {
-            "description": "min=1.246, mean=1.246, max=1.246, sum=2.491 (2)",
-            "tab": "Efficiency",
-            "score": 1.2455504093494716
-          },
-          "Professional Psychology - Observed inference time (s)": {
-            "description": "min=1.181, mean=1.181, max=1.181, sum=2.362 (2)",
-            "tab": "Efficiency",
-            "score": 1.1811600880403268
-          },
-          "Professional Medicine - # eval": {
-            "description": "min=272, mean=272, max=272, sum=544 (2)",
-            "tab": "General information",
-            "score": 272.0
-          },
-          "Professional Medicine - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Professional Medicine - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Professional Medicine - # prompt tokens": {
-            "description": "min=1113.092, mean=1113.092, max=1113.092, sum=2226.184 (2)",
-            "tab": "General information",
-            "score": 1113.0919117647059
-          },
-          "Professional Medicine - # output tokens": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Professional Accounting - # eval": {
-            "description": "min=282, mean=282, max=282, sum=564 (2)",
-            "tab": "General information",
-            "score": 282.0
-          },
-          "Professional Accounting - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Professional Accounting - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Professional Accounting - # prompt tokens": {
-            "description": "min=755.418, mean=755.418, max=755.418, sum=1510.837 (2)",
-            "tab": "General information",
-            "score": 755.418439716312
-          },
-          "Professional Accounting - # output tokens": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Professional Law - # eval": {
-            "description": "min=1534, mean=1534, max=1534, sum=3068 (2)",
-            "tab": "General information",
-            "score": 1534.0
-          },
-          "Professional Law - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Professional Law - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Professional Law - # prompt tokens": {
-            "description": "min=1685.119, mean=1685.119, max=1685.119, sum=3370.239 (2)",
-            "tab": "General information",
-            "score": 1685.119295958279
-          },
-          "Professional Law - # output tokens": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Professional Psychology - # eval": {
-            "description": "min=612, mean=612, max=612, sum=1224 (2)",
-            "tab": "General information",
-            "score": 612.0
-          },
-          "Professional Psychology - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Professional Psychology - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Professional Psychology - # prompt tokens": {
-            "description": "min=594.363, mean=594.363, max=594.363, sum=1188.725 (2)",
-            "tab": "General information",
-            "score": 594.3627450980392
-          },
-          "Professional Psychology - # output tokens": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
"tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.969, mean=0.969, max=0.969, sum=1.938 (2)", - "tab": "Efficiency", - "score": 0.968876302242279 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=438.2, mean=438.2, max=438.2, sum=876.4 (2)", - "tab": "General information", - "score": 438.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.829 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Efficiency", - "score": 0.9198912256642392 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=602.421, mean=602.421, max=602.421, sum=1204.842 (2)", - "tab": "General information", - "score": 602.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.009, mean=1.009, max=1.009, sum=2.019 (2)", - "tab": "Efficiency", - "score": 1.0093300080299377 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=552.87, mean=552.87, max=552.87, sum=1105.74 (2)", - "tab": "General information", - "score": 552.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=1.079, mean=1.079, max=1.079, sum=2.157 (2)", - "tab": "Efficiency", - "score": 1.0787266893206902 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=402.592, mean=402.592, max=402.592, sum=805.185 (2)", - "tab": "General information", - "score": 402.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "details": { - "description": "min=0.915, mean=0.915, max=0.915, sum=1.83 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.962, mean=0.962, max=0.962, sum=1.925 (2)", - "tab": "Efficiency", - "score": 0.9624196154005984 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.213, mean=309.213, max=309.213, sum=618.426 (2)", - "tab": "General information", - "score": 309.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.545 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=1.272, mean=1.272, max=1.272, sum=2.544 (2)", - "tab": "Efficiency", - "score": 1.271799375270975 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=474.786, mean=474.786, max=474.786, sum=949.572 (2)", - "tab": "General information", - "score": 474.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.767 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=1.052, mean=1.052, max=1.052, sum=2.104 (2)", - "tab": "Efficiency", - "score": 1.0518414406549363 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.341, mean=597.341, max=597.341, sum=1194.683 (2)", - "tab": "General information", - "score": 597.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.075, mean=1.075, max=1.075, sum=2.151 (2)", - "tab": "Efficiency", - "score": 1.0754183095598977 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.563, mean=619.563, max=619.563, sum=1239.127 (2)", - "tab": "General information", - "score": 619.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.848 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=1.11, mean=1.11, max=1.11, sum=2.22 (2)", - "tab": "Efficiency", - "score": 1.1099017789286951 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=1.021, mean=1.021, max=1.021, sum=2.041 (2)", - "tab": "Efficiency", - "score": 1.0206051636211977 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=1.112, mean=1.112, max=1.112, sum=2.224 (2)", - "tab": "Efficiency", - "score": 1.1118335294723511 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.402, mean=1.402, max=1.402, sum=2.803 (2)", - "tab": "Efficiency", - "score": 1.4017024777152323 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.959, mean=0.959, max=0.959, sum=1.918 (2)", - "tab": "Efficiency", - "score": 0.9591333119556157 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=1.224, mean=1.224, max=1.224, sum=2.448 (2)", - "tab": "Efficiency", - "score": 1.2240539535957298 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.052, mean=1.052, max=1.052, sum=2.105 (2)", - "tab": "Efficiency", - "score": 1.052347583648486 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=1.167, mean=1.167, max=1.167, sum=2.335 (2)", - "tab": "Efficiency", - "score": 1.167454132327327 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.992, mean=0.992, max=0.992, sum=1.984 (2)", - "tab": "Efficiency", - "score": 0.991771269245308 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=1.275, mean=1.275, max=1.275, sum=2.549 (2)", - "tab": "Efficiency", - "score": 1.2746097031018593 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=1.143, mean=1.143, max=1.143, sum=2.286 (2)", - "tab": "Efficiency", - "score": 1.1432113459005075 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=1.417, mean=1.417, max=1.417, sum=2.834 (2)", - "tab": "Efficiency", - "score": 1.417081825159214 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.309, mean=1.309, max=1.309, sum=2.618 (2)", - "tab": "Efficiency", - "score": 1.3091707919158189 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.249, mean=1.249, max=1.249, sum=2.498 (2)", - "tab": "Efficiency", - "score": 1.2489153383150382 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=500.958, mean=500.958, max=500.958, sum=1001.916 (2)", - "tab": "General information", - "score": 500.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=935.13, mean=935.13, max=935.13, sum=1870.26 (2)", - "tab": "General information", - "score": 935.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.424, mean=2797.424, max=2797.424, sum=5594.848 (2)", - "tab": "General information", - "score": 2797.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=394.773, mean=394.773, max=394.773, sum=789.545 (2)", - "tab": "General information", - "score": 394.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=479.301, mean=479.301, max=479.301, sum=958.601 (2)", - "tab": "General information", - "score": 479.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.541, mean=396.541, max=396.541, sum=793.082 (2)", - "tab": "General information", - "score": 396.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=566.822, mean=566.822, max=566.822, sum=1133.644 (2)", - "tab": "General information", - "score": 566.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=415.954, mean=415.954, max=415.954, sum=831.908 (2)", - "tab": "General information", - "score": 415.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=591.715, mean=591.715, max=591.715, sum=1183.43 (2)", - "tab": "General information", - "score": 591.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=502.604, mean=502.604, max=502.604, sum=1005.207 (2)", - "tab": "General information", - "score": 502.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=858.931, mean=858.931, max=858.931, sum=1717.861 (2)", - "tab": "General information", - "score": 858.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2205.583, mean=2205.583, max=2205.583, sum=4411.167 (2)", - "tab": "General information", - "score": 2205.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1426.544, mean=1426.544, max=1426.544, sum=2853.089 (2)", - "tab": "General information", - "score": 1426.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397, - "details": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=1.295, mean=1.295, max=1.295, sum=2.59 (2)", - "tab": "Efficiency", - "score": 1.2951436652196362 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=1.699, mean=1.699, max=1.699, sum=3.399 (2)", - "tab": "Efficiency", - "score": 1.6993297884019756 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=321.587, mean=321.587, max=321.587, sum=643.175 (2)", - "tab": "General information", - "score": 321.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=347.183, mean=347.183, max=347.183, sum=694.366 (2)", - "tab": "General information", - "score": 347.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=1.151, mean=1.151, 
max=1.151, sum=2.303 (2)", - "tab": "Efficiency", - "score": 1.1514279527112472 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=644.165, mean=644.165, max=644.165, sum=1288.331 (2)", - "tab": "General information", - "score": 644.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.718 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=1.422, mean=1.422, max=1.422, sum=2.844 (2)", - "tab": "Efficiency", - "score": 1.4221880026390217 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=450.049, mean=450.049, max=450.049, sum=900.098 (2)", - "tab": "General information", - "score": 450.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.339 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.005, mean=1.005, max=1.005, sum=2.011 (2)", - "tab": "Efficiency", - "score": 1.005433154957635 - }, - "Machine Learning - # 
eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.402, mean=702.402, max=702.402, sum=1404.804 (2)", - "tab": "General information", - "score": 702.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.874, - "details": { - "description": "min=0.874, mean=0.874, max=0.874, sum=1.748 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.939, mean=0.939, max=0.939, sum=1.879 (2)", - "tab": "Efficiency", - "score": 0.9392627234597808 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=281.301, mean=281.301, max=281.301, sum=562.602 (2)", - "tab": "General information", - "score": 281.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.953, - "details": { - "description": "min=0.953, mean=0.953, max=0.953, sum=1.906 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=1.261, mean=1.261, max=1.261, sum=2.523 (2)", - "tab": "Efficiency", - "score": 1.2613265443051982 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.35, mean=428.35, max=428.35, sum=856.701 (2)", - "tab": "General information", - "score": 428.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.897, mean=0.897, max=0.897, sum=1.795 (2)", - "tab": "Efficiency", - "score": 0.8973554396629333 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=338.89, mean=338.89, max=338.89, sum=677.78 (2)", - "tab": "General information", - "score": 338.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.857 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=1.136, mean=1.136, max=1.136, sum=2.272 (2)", - "tab": "Efficiency", - "score": 1.1357932166882707 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous 
- # prompt tokens": { - "description": "min=314.669, mean=314.669, max=314.669, sum=629.338 (2)", - "tab": "General information", - "score": 314.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.966, mean=0.966, max=0.966, sum=1.933 (2)", - "tab": "Efficiency", - "score": 0.9664077420165573 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=1.0, mean=1.0, max=1.0, sum=1.999 (2)", - "tab": "Efficiency", - "score": 0.9996972816196952 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=495.003, mean=495.003, max=495.003, sum=990.006 (2)", - "tab": "General information", - "score": 495.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=690.542, mean=690.542, max=690.542, sum=1381.084 (2)", - "tab": "General information", - "score": 690.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.846, - "details": { - "description": "min=0.846, mean=0.846, max=0.846, sum=1.693 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=1.042, mean=1.042, max=1.042, sum=2.084 (2)", - "tab": "Efficiency", - "score": 1.04191489858565 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=585.48, mean=585.48, max=585.48, sum=1170.961 (2)", - "tab": "General information", - "score": 585.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.888, mean=0.888, max=0.888, sum=1.775 (2)", - "tab": "Efficiency", - "score": 0.8876422820267854 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=540.198, mean=540.198, max=540.198, sum=1080.395 (2)", - "tab": "General information", - "score": 540.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.992, mean=0.992, 
max=0.992, sum=1.984 (2)", - "tab": "Efficiency", - "score": 0.9922328862276945 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.655, mean=426.655, max=426.655, sum=853.309 (2)", - "tab": "General information", - "score": 426.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=1.117, mean=1.117, max=1.117, sum=2.234 (2)", - "tab": "Efficiency", - "score": 1.116919010512683 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1193.869, mean=1193.869, max=1193.869, sum=2387.739 (2)", - "tab": "General information", - "score": 1193.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "details": { - "description": "min=0.925, mean=0.925, max=0.925, sum=1.851 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=1.296, mean=1.296, max=1.296, sum=2.592 (2)", - "tab": "Efficiency", - "score": 1.29619625195935 - }, - "Sociology - # eval": { - "description": "min=201, 
mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=456.274, mean=456.274, max=456.274, sum=912.547 (2)", - "tab": "General information", - "score": 456.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Efficiency", - "score": 0.8771147684878614 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.753, mean=336.753, max=336.753, sum=673.506 (2)", - "tab": "General information", - "score": 336.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=1.225, mean=1.225, max=1.225, sum=2.451 (2)", - "tab": "Efficiency", - "score": 1.2254026856338769 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.164, mean=268.164, max=268.164, sum=536.327 (2)", - "tab": "General information", - "score": 268.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.118, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json b/data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json deleted file mode 100644 index 6b53de064..000000000 --- a/data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-2.0-flash-exp/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 2.0 Flash Experimental", - "id": "google/gemini-2.0-flash-exp", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.797, - "details": { - "description": "min=0.554, mean=0.797, max=0.969, sum=90.902 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.379, mean=0.422, max=0.926, sum=48.097 (114)", - "tab": "Efficiency", - "score": 0.4219020959728089 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.164, mean=632.617, max=2797.424, sum=72118.345 (114)", - "tab": "General information", 
- "score": 632.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.44 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.4077691292762756 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=383.97, mean=383.97, max=383.97, sum=767.94 (2)", - "tab": "General information", - "score": 383.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.615 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)", - "tab": "Efficiency", - "score": 0.9258230227011222 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.356, mean=344.356, max=344.356, sum=688.711 (2)", - "tab": "General information", - "score": 344.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - 
"description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.809 (2)", - "tab": "Efficiency", - "score": 0.4045387363433838 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.941 (2)", - "tab": "Efficiency", - "score": 0.4703653355439504 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.4358289122581482 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.827 (2)", - "tab": "Efficiency", - "score": 0.413386971950531 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.852 (2)", - "tab": "Efficiency", - "score": 0.4259330606184943 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.912 (2)", - "tab": "Efficiency", - "score": 0.4557511432498109 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=570.02, mean=570.02, max=570.02, sum=1140.04 (2)", - "tab": "General information", - "score": 570.02 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.799, mean=482.799, max=482.799, sum=965.597 (2)", - "tab": "General information", - "score": 482.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=857.86, mean=857.86, max=857.86, sum=1715.72 (2)", - "tab": "General information", - "score": 857.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - 
"tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=626.69, mean=626.69, max=626.69, sum=1253.38 (2)", - "tab": "General information", - "score": 626.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=513.37, mean=513.37, max=513.37, sum=1026.74 (2)", - "tab": "General information", - "score": 513.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=507.471, mean=507.471, max=507.471, sum=1014.941 (2)", - "tab": "General information", - "score": 507.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.813 (2)", - "tab": "Efficiency", - "score": 0.4065685248374939 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"Computer Security - # prompt tokens": { - "description": "min=380.91, mean=380.91, max=380.91, sum=761.82 (2)", - "tab": "General information", - "score": 380.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.386 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.819 (2)", - "tab": "Efficiency", - "score": 0.4097107544279935 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=634.553, mean=634.553, max=634.553, sum=1269.105 (2)", - "tab": "General information", - "score": 634.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.4148475766181946 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=456.54, mean=456.54, max=456.54, sum=913.08 (2)", - "tab": "General information", - "score": 456.54 - }, - "Global Facts - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.884 (2)", - "tab": "Efficiency", - "score": 0.4418119721942478 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=407.87, mean=407.87, max=407.87, sum=815.741 (2)", - "tab": "General information", - "score": 407.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.887, - "details": { - "description": "min=0.887, mean=0.887, max=0.887, sum=1.775 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.817 (2)", - "tab": "Efficiency", - "score": 0.40853408831875426 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=340.907, mean=340.907, max=340.907, sum=681.814 (2)", - "tab": "General information", - "score": 340.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.752 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.934 (2)", - "tab": "Efficiency", - "score": 0.46713243337238536 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.771 (2)", - "tab": "Efficiency", - "score": 0.38551004812227074 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.859 (2)", - "tab": "Efficiency", - "score": 0.4294954424886691 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.793 (2)", - "tab": "Efficiency", - "score": 0.39653347715053683 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1113.092, mean=1113.092, max=1113.092, sum=2226.184 (2)", - "tab": "General information", - "score": 1113.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=755.418, mean=755.418, max=755.418, sum=1510.837 (2)", - "tab": "General information", - "score": 755.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1685.119, mean=1685.119, max=1685.119, sum=3370.239 (2)", 
- "tab": "General information", - "score": 1685.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.363, mean=594.363, max=594.363, sum=1188.725 (2)", - "tab": "General information", - "score": 594.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.829 (2)", - "tab": "Efficiency", - "score": 0.4144425654411316 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=438.2, mean=438.2, max=438.2, sum=876.4 (2)", - "tab": "General information", - "score": 438.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.855 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": 
"min=0.432, mean=0.432, max=0.432, sum=0.864 (2)", - "tab": "Efficiency", - "score": 0.43207096739819173 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=602.421, mean=602.421, max=602.421, sum=1204.842 (2)", - "tab": "General information", - "score": 602.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.883 (2)", - "tab": "Efficiency", - "score": 0.441267569065094 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=552.87, mean=552.87, max=552.87, sum=1105.74 (2)", - "tab": "General information", - "score": 552.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.879, - "details": { - "description": "min=0.879, mean=0.879, max=0.879, sum=1.758 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.43878708245619286 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 
(2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=402.592, mean=402.592, max=402.592, sum=805.185 (2)", - "tab": "General information", - "score": 402.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "description": "min=0.813, mean=0.813, max=0.813, sum=1.626 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.796 (2)", - "tab": "Efficiency", - "score": 0.3981509147806371 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.213, mean=309.213, max=309.213, sum=618.426 (2)", - "tab": "General information", - "score": 309.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.669 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Efficiency", - "score": 0.47606519830637967 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - 
"Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=474.786, mean=474.786, max=474.786, sum=949.572 (2)", - "tab": "General information", - "score": 474.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.714 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.4077642039647178 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.341, mean=597.341, max=597.341, sum=1194.683 (2)", - "tab": "General information", - "score": 597.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.571, - "details": { - "description": "min=0.571, mean=0.571, max=0.571, sum=1.143 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Efficiency", - "score": 0.4018626610438029 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.563, mean=619.563, max=619.563, sum=1239.127 (2)", - "tab": "General information", - "score": 619.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.485 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.784 (2)", - "tab": "Efficiency", - "score": 0.39193403643946495 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.783 (2)", - "tab": "Efficiency", - "score": 0.3914114583302014 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.785 (2)", - "tab": "Efficiency", - "score": 0.3924300479888916 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.903 (2)", - "tab": "Efficiency", - "score": 0.451710438005852 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.773 (2)", - "tab": "Efficiency", - "score": 0.3862521937399199 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.817 (2)", - "tab": "Efficiency", - "score": 0.40865302950607063 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.771 (2)", - "tab": "Efficiency", - "score": 0.3853575364137307 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", - "tab": "Efficiency", - "score": 0.39334204550142643 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.38397373171413646 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.823 (2)", - "tab": "Efficiency", - "score": 0.4116018955281239 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.786 (2)", - "tab": "Efficiency", - "score": 0.3931623751964044 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.449, mean=0.449, max=0.449, sum=0.898 (2)", - "tab": 
"Efficiency", - "score": 0.44901008628032824 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.935 (2)", - "tab": "Efficiency", - "score": 0.46768493044610115 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.903 (2)", - "tab": "Efficiency", - "score": 0.451718654310653 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=500.958, mean=500.958, max=500.958, sum=1001.916 (2)", - "tab": "General information", - "score": 500.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=935.13, mean=935.13, max=935.13, sum=1870.26 (2)", - "tab": "General information", - "score": 935.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.424, mean=2797.424, max=2797.424, sum=5594.848 (2)", - "tab": "General information", - "score": 2797.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=394.773, mean=394.773, max=394.773, sum=789.545 (2)", - "tab": "General information", - "score": 394.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=479.301, mean=479.301, max=479.301, sum=958.601 (2)", - "tab": "General information", - "score": 479.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.541, mean=396.541, max=396.541, sum=793.082 (2)", - "tab": "General information", - "score": 396.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=566.822, mean=566.822, max=566.822, sum=1133.644 (2)", - "tab": "General information", - "score": 566.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=415.954, mean=415.954, max=415.954, sum=831.908 (2)", - "tab": "General information", - "score": 415.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=591.715, mean=591.715, max=591.715, sum=1183.43 (2)", - "tab": "General information", - "score": 591.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=502.604, mean=502.604, max=502.604, sum=1005.207 (2)", - "tab": "General information", - "score": 502.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=858.931, mean=858.931, max=858.931, sum=1717.861 (2)", - "tab": "General information", - "score": 858.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2205.583, mean=2205.583, max=2205.583, sum=4411.167 (2)", - "tab": "General information", - "score": 2205.5833333333335 - }, - "High School US History - # 
output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1426.544, mean=1426.544, max=1426.544, sum=2853.089 (2)", - "tab": "General information", - "score": 1426.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Efficiency", - "score": 0.3999073441253115 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.4203109868610178 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=321.587, mean=321.587, max=321.587, sum=643.175 (2)", - "tab": "General information", - "score": 321.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=347.183, mean=347.183, max=347.183, sum=694.366 (2)", - "tab": "General information", - "score": 347.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - 
}, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645, - "details": { - "description": "min=0.645, mean=0.645, max=0.645, sum=1.289 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.913 (2)", - "tab": "Efficiency", - "score": 0.45661053972795973 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=644.165, mean=644.165, max=644.165, sum=1288.331 (2)", - "tab": "General information", - "score": 644.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.828 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.823 (2)", - "tab": "Efficiency", - "score": 0.4113436125538832 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=450.049, mean=450.049, max=450.049, sum=900.098 (2)", - "tab": "General information", - "score": 450.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.518 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.833 (2)", - "tab": "Efficiency", - "score": 0.4165512855563845 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.402, mean=702.402, max=702.402, sum=1404.804 (2)", - "tab": "General information", - "score": 702.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.437 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.803 (2)", - "tab": "Efficiency", - "score": 0.4013508292077814 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=281.301, mean=281.301, max=281.301, sum=562.602 (2)", - "tab": "General information", - "score": 281.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.944, - "details": { - "description": "min=0.944, mean=0.944, max=0.944, sum=1.889 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.801 (2)", - "tab": "Efficiency", - "score": 0.4005699891310472 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.35, mean=428.35, max=428.35, sum=856.701 (2)", - "tab": "General information", - "score": 428.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.773 (2)", - "tab": "Efficiency", - "score": 0.38653050899505614 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=338.89, mean=338.89, max=338.89, sum=677.78 (2)", - "tab": "General information", - "score": 338.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.939, - "details": { - "description": "min=0.939, mean=0.939, max=0.939, sum=1.877 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.3861832460376647 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=314.669, mean=314.669, max=314.669, sum=629.338 (2)", - "tab": "General information", - "score": 314.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=1.629 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.3839988109004291 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.81 (2)", - "tab": "Efficiency", - "score": 0.4048716662316349 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=495.003, mean=495.003, max=495.003, sum=990.006 (2)", - "tab": "General information", - "score": 495.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - 
"description": "min=690.542, mean=690.542, max=690.542, sum=1381.084 (2)", - "tab": "General information", - "score": 690.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.856, - "details": { - "description": "min=0.856, mean=0.856, max=0.856, sum=1.712 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Efficiency", - "score": 0.39706431027331385 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=585.48, mean=585.48, max=585.48, sum=1170.961 (2)", - "tab": "General information", - "score": 585.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Efficiency", - "score": 0.3900022072556578 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=540.198, mean=540.198, max=540.198, sum=1080.395 (2)", - "tab": "General information", - "score": 540.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Efficiency", - "score": 0.37999111955816095 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.655, mean=426.655, max=426.655, sum=853.309 (2)", - "tab": "General information", - "score": 426.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.787 (2)", - "tab": "Efficiency", - "score": 0.3936534463142862 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1193.869, mean=1193.869, max=1193.869, sum=2387.739 (2)", - "tab": "General information", - "score": 1193.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - 
"method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.572 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.776 (2)", - "tab": "Efficiency", - "score": 0.3881402205471969 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=456.274, mean=456.274, max=456.274, sum=912.547 (2)", - "tab": "General information", - "score": 456.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.108 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.758 (2)", - "tab": "Efficiency", - "score": 0.3791351461985025 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.753, mean=336.753, max=336.753, sum=673.506 (2)", - "tab": "General information", - "score": 336.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.462 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.38400994964510377 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.164, mean=268.164, max=268.164, sum=536.327 (2)", - "tab": "General information", - "score": 268.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json b/data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json deleted file mode 100644 index 8720cc062..000000000 --- a/data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemma-2-27b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma 2 27B", - "id": "google/gemma-2-27b", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757, - "details": { - "description": 
"min=0.394, mean=0.757, max=0.979, sum=86.303 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=1.169, mean=2.744, max=12.207, sum=312.86 (114)", - "tab": "Efficiency", - "score": 2.7443855864562217 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=260.164, mean=624.617, max=2789.424, sum=71206.345 (114)", - "tab": "General information", - "score": 624.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - 
"mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=1.522, mean=1.522, max=1.522, sum=3.043 (2)", - "tab": "Efficiency", - "score": 1.5217395949363708 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=375.97, mean=375.97, max=375.97, sum=751.94 (2)", - "tab": "General information", - "score": 375.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.541 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=1.179, mean=1.179, max=1.179, sum=2.359 (2)", - "tab": "Efficiency", - "score": 1.1792643246827301 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=336.356, mean=336.356, max=336.356, sum=672.711 (2)", - 
"tab": "General information", - "score": 336.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=2.168, mean=2.168, max=2.168, sum=4.337 (2)", - "tab": "Efficiency", - "score": 2.168372049331665 - }, - "College Biology - Observed inference time (s)": { - "description": "min=1.995, mean=1.995, max=1.995, sum=3.99 (2)", - "tab": "Efficiency", - "score": 1.994903423719936 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=3.315, mean=3.315, max=3.315, sum=6.631 (2)", - "tab": "Efficiency", - "score": 3.315422866344452 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=2.323, mean=2.323, max=2.323, sum=4.647 (2)", - "tab": "Efficiency", - "score": 2.323271915912628 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=2.118, mean=2.118, max=2.118, sum=4.236 (2)", - "tab": "Efficiency", - "score": 2.117893081179933 - }, - "College Physics - Observed inference time (s)": { - "description": "min=1.982, mean=1.982, max=1.982, sum=3.964 (2)", - "tab": "Efficiency", - "score": 1.9819396874483894 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=562.02, mean=562.02, max=562.02, sum=1124.04 (2)", - "tab": "General information", - "score": 562.02 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=474.799, mean=474.799, max=474.799, sum=949.597 (2)", - "tab": "General information", - "score": 474.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer 
Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=849.86, mean=849.86, max=849.86, sum=1699.72 (2)", - "tab": "General information", - "score": 849.86 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=618.69, mean=618.69, max=618.69, sum=1237.38 (2)", - "tab": "General information", - "score": 618.69 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=505.37, mean=505.37, max=505.37, sum=1010.74 (2)", - "tab": "General information", - "score": 505.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=499.471, mean=499.471, max=499.471, sum=998.941 (2)", - "tab": "General information", - "score": 499.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer 
Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.314, mean=1.314, max=1.314, sum=2.628 (2)", - "tab": "Efficiency", - "score": 1.3139495277404785 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=372.91, mean=372.91, max=372.91, sum=745.82 (2)", - "tab": "General information", - "score": 372.91 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=2.14, mean=2.14, max=2.14, sum=4.28 (2)", - "tab": "Efficiency", - "score": 2.1398948138220266 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=626.553, mean=626.553, max=626.553, sum=1253.105 (2)", - "tab": "General information", - "score": 626.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43, - "details": { - "description": 
"min=0.43, mean=0.43, max=0.43, sum=0.86 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=1.452, mean=1.452, max=1.452, sum=2.905 (2)", - "tab": "Efficiency", - "score": 1.4524464893341065 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=448.54, mean=448.54, max=448.54, sum=897.08 (2)", - "tab": "General information", - "score": 448.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=1.421, mean=1.421, max=1.421, sum=2.841 (2)", - "tab": "Efficiency", - "score": 1.4206464577604223 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=399.87, mean=399.87, max=399.87, sum=799.741 (2)", - "tab": "General information", - "score": 399.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=1.169, mean=1.169, max=1.169, sum=2.337 (2)", - "tab": "Efficiency", - "score": 
1.168742698871821 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=332.907, mean=332.907, max=332.907, sum=665.814 (2)", - "tab": "General information", - "score": 332.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=4.2, mean=4.2, max=4.2, sum=8.399 (2)", - "tab": "Efficiency", - "score": 4.199711911818561 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=3.427, mean=3.427, max=3.427, sum=6.854 (2)", - "tab": "Efficiency", - "score": 3.4269232200392596 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=7.724, mean=7.724, max=7.724, sum=15.448 (2)", - "tab": "Efficiency", - "score": 7.723928280417581 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=2.721, mean=2.721, max=2.721, sum=5.442 (2)", - "tab": "Efficiency", - "score": 2.721013201997171 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1105.092, mean=1105.092, max=1105.092, sum=2210.184 (2)", - "tab": "General information", - "score": 1105.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": 
"min=747.418, mean=747.418, max=747.418, sum=1494.837 (2)", - "tab": "General information", - "score": 747.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1677.119, mean=1677.119, max=1677.119, sum=3354.239 (2)", - "tab": "General information", - "score": 1677.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=586.363, mean=586.363, max=586.363, sum=1172.725 (2)", - "tab": "General information", - "score": 586.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=1.555, mean=1.555, max=1.555, sum=3.109 (2)", - "tab": "Efficiency", - "score": 1.554630262851715 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=430.2, mean=430.2, max=430.2, sum=860.4 (2)", - "tab": "General information", - "score": 430.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.658 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=2.214, mean=2.214, max=2.214, sum=4.428 (2)", - "tab": "Efficiency", - "score": 2.214210780043351 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=594.421, mean=594.421, max=594.421, sum=1188.842 (2)", - "tab": "General information", - "score": 594.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=2.156, mean=2.156, max=2.156, sum=4.311 (2)", - "tab": "Efficiency", - "score": 2.1555044412612916 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=544.87, mean=544.87, max=544.87, sum=1089.74 (2)", - "tab": "General information", - "score": 544.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": 
"Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.615 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=1.81, mean=1.81, max=1.81, sum=3.619 (2)", - "tab": "Efficiency", - "score": 1.8096552030095514 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=394.592, mean=394.592, max=394.592, sum=789.185 (2)", - "tab": "General information", - "score": 394.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.668 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=1.424, mean=1.424, max=1.424, sum=2.848 (2)", - "tab": "Efficiency", - "score": 1.423792755857427 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=301.213, mean=301.213, max=301.213, sum=602.426 (2)", - "tab": "General information", - "score": 301.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.738, - "details": { - "description": "min=0.738, mean=0.738, max=0.738, sum=1.476 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=1.947, mean=1.947, max=1.947, sum=3.893 (2)", - "tab": "Efficiency", - "score": 1.9467107739941827 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=466.786, mean=466.786, max=466.786, sum=933.572 (2)", - "tab": "General information", - "score": 466.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.558, - "details": { - "description": "min=0.558, mean=0.558, max=0.558, sum=1.116 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=2.287, mean=2.287, max=2.287, sum=4.574 (2)", - "tab": "Efficiency", - "score": 2.286756881330379 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=589.341, mean=589.341, max=589.341, sum=1178.683 (2)", - "tab": "General information", - "score": 589.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.516, - "details": { - "description": "min=0.516, mean=0.516, max=0.516, sum=1.032 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=2.327, mean=2.327, max=2.327, sum=4.653 (2)", - "tab": "Efficiency", - "score": 2.3266589963246904 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=611.563, mean=611.563, max=611.563, sum=1223.127 (2)", - "tab": "General information", - "score": 611.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.781 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=2.021, mean=2.021, max=2.021, sum=4.043 (2)", - "tab": "Efficiency", - "score": 2.021439305428536 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=2.053, mean=2.053, max=2.053, sum=4.106 (2)", - "tab": "Efficiency", - "score": 2.0532372467623556 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=3.599, mean=3.599, max=3.599, sum=7.197 (2)", - "tab": "Efficiency", - "score": 3.5985250592231752 - }, - "High School European History - Observed inference time (s)": { - "description": "min=12.207, mean=12.207, max=12.207, sum=24.413 (2)", - "tab": "Efficiency", - "score": 12.20667136221221 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=1.952, mean=1.952, max=1.952, sum=3.903 (2)", - "tab": "Efficiency", - "score": 1.9516368020664563 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=2.276, mean=2.276, max=2.276, sum=4.552 (2)", - "tab": "Efficiency", - "score": 2.2759376226929184 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.97, mean=1.97, max=1.97, sum=3.94 (2)", - "tab": "Efficiency", - "score": 1.9697805410776383 - }, - "High School Mathematics - Observed inference time (s)": { - 
"description": "min=2.617, mean=2.617, max=2.617, sum=5.234 (2)", - "tab": "Efficiency", - "score": 2.616950834238971 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=2.123, mean=2.123, max=2.123, sum=4.245 (2)", - "tab": "Efficiency", - "score": 2.1225664866070786 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=2.697, mean=2.697, max=2.697, sum=5.394 (2)", - "tab": "Efficiency", - "score": 2.6972478115006 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=2.368, mean=2.368, max=2.368, sum=4.735 (2)", - "tab": "Efficiency", - "score": 2.3675809317772543 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=3.958, mean=3.958, max=3.958, sum=7.917 (2)", - "tab": "Efficiency", - "score": 3.9584906564818487 - }, - "High School US History - Observed inference time (s)": { - "description": "min=9.745, mean=9.745, max=9.745, sum=19.491 (2)", - "tab": "Efficiency", - "score": 9.745334922098646 - }, - "High School World History - Observed inference time (s)": { - "description": "min=6.489, mean=6.489, max=6.489, sum=12.977 (2)", - "tab": "Efficiency", - "score": 6.488561074944991 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=492.958, mean=492.958, max=492.958, sum=985.916 (2)", - "tab": "General information", - "score": 492.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=505.064, mean=505.064, max=505.064, sum=1010.128 (2)", - "tab": "General information", - "score": 505.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=927.13, mean=927.13, max=927.13, sum=1854.26 (2)", - "tab": "General information", - "score": 927.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2789.424, mean=2789.424, max=2789.424, sum=5578.848 (2)", - "tab": "General information", - "score": 2789.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=386.773, mean=386.773, max=386.773, sum=773.545 (2)", - "tab": "General information", - "score": 386.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=471.301, mean=471.301, max=471.301, sum=942.601 (2)", - "tab": "General information", - "score": 471.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=388.541, mean=388.541, max=388.541, sum=777.082 (2)", - "tab": "General information", - "score": 388.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=558.822, mean=558.822, max=558.822, sum=1117.644 (2)", - "tab": "General information", - "score": 558.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=407.954, mean=407.954, max=407.954, sum=815.908 (2)", - "tab": "General information", - "score": 407.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=583.715, mean=583.715, max=583.715, sum=1167.43 (2)", - "tab": "General information", - "score": 583.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=494.604, mean=494.604, max=494.604, sum=989.207 (2)", - "tab": "General information", - "score": 494.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=850.931, mean=850.931, max=850.931, sum=1701.861 (2)", - "tab": "General information", - "score": 850.9305555555555 - }, - "High 
School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2197.583, mean=2197.583, max=2197.583, sum=4395.167 (2)", - "tab": "General information", - "score": 2197.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1418.544, mean=1418.544, max=1418.544, sum=2837.089 (2)", - "tab": "General information", - "score": 1418.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=1.712, mean=1.712, max=1.712, sum=3.425 (2)", - "tab": "Efficiency", - "score": 1.7123107461116773 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=1.754, mean=1.754, max=1.754, sum=3.508 (2)", - "tab": "Efficiency", - "score": 1.7542339390470783 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=313.587, mean=313.587, max=313.587, sum=627.175 (2)", - "tab": "General information", - "score": 313.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=339.183, mean=339.183, max=339.183, sum=678.366 (2)", - "tab": "General information", - "score": 339.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=2.9, mean=2.9, max=2.9, sum=5.801 (2)", - "tab": "Efficiency", - "score": 2.9003868654739757 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=636.165, mean=636.165, max=636.165, sum=1272.331 (2)", - "tab": "General information", - "score": 636.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=2.154, mean=2.154, max=2.154, sum=4.308 (2)", - "tab": "Efficiency", - "score": 2.1537599431956473 - }, - "Logical Fallacies - # eval": { - "description": "min=163, 
mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.049, mean=442.049, max=442.049, sum=884.098 (2)", - "tab": "General information", - "score": 442.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.625, mean=0.625, max=0.625, sum=1.25 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=3.172, mean=3.172, max=3.172, sum=6.344 (2)", - "tab": "Efficiency", - "score": 3.172234045607703 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=694.402, mean=694.402, max=694.402, sum=1388.804 (2)", - "tab": "General information", - "score": 694.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=1.556, mean=1.556, max=1.556, sum=3.112 (2)", - "tab": "Efficiency", - "score": 1.5561023800118456 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, 
mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=273.301, mean=273.301, max=273.301, sum=546.602 (2)", - "tab": "General information", - "score": 273.3009708737864 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=2.165, mean=2.165, max=2.165, sum=4.331 (2)", - "tab": "Efficiency", - "score": 2.1654122140672474 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=420.35, mean=420.35, max=420.35, sum=840.701 (2)", - "tab": "General information", - "score": 420.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=1.719, mean=1.719, max=1.719, sum=3.438 (2)", - "tab": "Efficiency", - "score": 1.7190089011192322 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical 
Genetics - # prompt tokens": { - "description": "min=330.89, mean=330.89, max=330.89, sum=661.78 (2)", - "tab": "General information", - "score": 330.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.77 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=1.709, mean=1.709, max=1.709, sum=3.417 (2)", - "tab": "Efficiency", - "score": 1.7086633363141563 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=306.669, mean=306.669, max=306.669, sum=613.338 (2)", - "tab": "General information", - "score": 306.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394, - "details": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.789 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=2.315, mean=2.315, max=2.315, sum=4.631 (2)", - "tab": "Efficiency", - "score": 2.315398308583078 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=3.188, mean=3.188, max=3.188, sum=6.376 (2)", - "tab": "Efficiency", - "score": 3.187839964914588 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral 
Disputes - # prompt tokens": { - "description": "min=487.003, mean=487.003, max=487.003, sum=974.006 (2)", - "tab": "General information", - "score": 487.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=682.542, mean=682.542, max=682.542, sum=1365.084 (2)", - "tab": "General information", - "score": 682.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.647 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=2.692, mean=2.692, max=2.692, sum=5.383 (2)", - "tab": "Efficiency", - "score": 2.691618916255976 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=577.48, mean=577.48, max=577.48, sum=1154.961 (2)", - "tab": "General information", - "score": 577.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.753 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=2.537, 
mean=2.537, max=2.537, sum=5.075 (2)", - "tab": "Efficiency", - "score": 2.5372923561084417 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=532.198, mean=532.198, max=532.198, sum=1064.395 (2)", - "tab": "General information", - "score": 532.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=2.161, mean=2.161, max=2.161, sum=4.321 (2)", - "tab": "Efficiency", - "score": 2.160554786161943 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=418.655, mean=418.655, max=418.655, sum=837.309 (2)", - "tab": "General information", - "score": 418.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.616 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=5.336, mean=5.336, max=5.336, sum=10.672 (2)", - "tab": "Efficiency", - "score": 5.335982258465825 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, 
max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1185.869, mean=1185.869, max=1185.869, sum=2371.739 (2)", - "tab": "General information", - "score": 1185.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=2.204, mean=2.204, max=2.204, sum=4.409 (2)", - "tab": "Efficiency", - "score": 2.2043708201071515 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=448.274, mean=448.274, max=448.274, sum=896.547 (2)", - "tab": "General information", - "score": 448.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=1.75, mean=1.75, max=1.75, sum=3.499 (2)", - "tab": "Efficiency", - "score": 1.7496386393007026 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=328.753, mean=328.753, max=328.753, sum=657.506 (2)", - "tab": "General information", - "score": 328.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.848 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=1.443, mean=1.443, max=1.443, sum=2.886 (2)", - "tab": "Efficiency", - "score": 1.443225710015548 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=260.164, mean=260.164, max=260.164, sum=520.327 (2)", - "tab": "General information", - "score": 260.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.05, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json b/data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json deleted file mode 100644 index 2007b06df..000000000 --- a/data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemma-2-9b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": 
"crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma 2 9B", - "id": "google/gemma-2-9b", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.721, - "details": { - "description": "min=0.295, mean=0.721, max=0.953, sum=82.233 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.425, mean=0.901, max=3.986, sum=102.765 (114)", - "tab": "Efficiency", - "score": 0.9014510090022484 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=260.164, mean=624.617, max=2789.424, sum=71206.345 (114)", - "tab": "General information", - "score": 624.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - 
"mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", - "tab": "Efficiency", - "score": 0.6499301409721374 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=375.97, mean=375.97, max=375.97, sum=751.94 (2)", - "tab": "General information", - "score": 375.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.704, mean=0.704, max=0.704, sum=1.407 (2)", - 
"tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.984 (2)", - "tab": "Efficiency", - "score": 0.491805742405079 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=336.356, mean=336.356, max=336.356, sum=672.711 (2)", - "tab": "General information", - "score": 336.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.423 (2)", - "tab": "Efficiency", - "score": 0.7114056801795959 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=1.248 (2)", - "tab": "Efficiency", - "score": 0.6241771280765533 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.093, mean=1.093, max=1.093, sum=2.187 (2)", - "tab": "Efficiency", - "score": 1.0932785439491273 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.606 (2)", - "tab": "Efficiency", - "score": 0.8027684283256531 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=1.348 (2)", - "tab": "Efficiency", - "score": 0.6739495985769812 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.655, mean=0.655, max=0.655, sum=1.311 (2)", - "tab": "Efficiency", - "score": 0.6553734166949403 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=562.02, mean=562.02, max=562.02, sum=1124.04 (2)", - "tab": "General information", - "score": 562.02 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, 
max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=474.799, mean=474.799, max=474.799, sum=949.597 (2)", - "tab": "General information", - "score": 474.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=849.86, mean=849.86, max=849.86, sum=1699.72 (2)", - "tab": "General information", - "score": 849.86 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=618.69, mean=618.69, max=618.69, sum=1237.38 (2)", - "tab": "General information", - "score": 618.69 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=505.37, mean=505.37, max=505.37, sum=1010.74 (2)", - "tab": "General information", - "score": 505.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=499.471, mean=499.471, max=499.471, sum=998.941 (2)", - "tab": "General information", - 
"score": 499.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.928 (2)", - "tab": "Efficiency", - "score": 0.4640101146697998 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=372.91, mean=372.91, max=372.91, sum=745.82 (2)", - "tab": "General information", - "score": 372.91 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.158 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.531 (2)", - "tab": "Efficiency", - "score": 0.7655813254808125 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=626.553, mean=626.553, max=626.553, sum=1253.105 (2)", - "tab": "General information", - "score": 626.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 
1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.542, mean=0.542, max=0.542, sum=1.084 (2)", - "tab": "Efficiency", - "score": 0.5422105526924134 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=448.54, mean=448.54, max=448.54, sum=897.08 (2)", - "tab": "General information", - "score": 448.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.496, mean=0.496, max=0.496, sum=0.991 (2)", - "tab": "Efficiency", - "score": 0.4956528963866057 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=399.87, mean=399.87, max=399.87, sum=799.741 (2)", - "tab": "General information", - "score": 399.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - 
"evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.543 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.85 (2)", - "tab": "Efficiency", - "score": 0.4251678066621639 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=332.907, mean=332.907, max=332.907, sum=665.814 (2)", - "tab": "General information", - "score": 332.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.575 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.37, mean=1.37, max=1.37, sum=2.74 (2)", - "tab": "Efficiency", - "score": 1.3702202570789002 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=1.128, mean=1.128, max=1.128, sum=2.255 (2)", - "tab": "Efficiency", - "score": 1.1277324375531352 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=2.433, mean=2.433, max=2.433, sum=4.866 (2)", - "tab": "Efficiency", - "score": 2.433138657113564 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.818 (2)", - "tab": "Efficiency", - "score": 0.9092130824631336 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1105.092, mean=1105.092, max=1105.092, sum=2210.184 (2)", - "tab": 
"General information", - "score": 1105.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=747.418, mean=747.418, max=747.418, sum=1494.837 (2)", - "tab": "General information", - "score": 747.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1677.119, mean=1677.119, max=1677.119, sum=3354.239 (2)", - "tab": "General information", - "score": 1677.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=586.363, mean=586.363, max=586.363, sum=1172.725 (2)", - "tab": "General information", - "score": 586.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.544, mean=0.544, max=0.544, sum=1.088 (2)", - "tab": "Efficiency", - "score": 0.5438596844673157 - }, - "Us Foreign Policy - # eval": 
{ - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=430.2, mean=430.2, max=430.2, sum=860.4 (2)", - "tab": "General information", - "score": 430.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.533 (2)", - "tab": "Efficiency", - "score": 0.7662546744472102 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=594.421, mean=594.421, max=594.421, sum=1188.842 (2)", - "tab": "General information", - "score": 594.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.425 (2)", - "tab": "Efficiency", - "score": 0.7125983119010926 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=544.87, mean=544.87, max=544.87, sum=1089.74 (2)", - "tab": "General information", - "score": 544.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=1.555 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.121 (2)", - "tab": "Efficiency", - "score": 0.5606130177119992 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=394.592, mean=394.592, max=394.592, sum=789.185 (2)", - "tab": "General information", - "score": 394.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.732, - "details": { - "description": "min=0.732, mean=0.732, max=0.732, sum=1.464 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.879 (2)", - "tab": "Efficiency", - "score": 0.4395242579439853 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=301.213, mean=301.213, max=301.213, sum=602.426 (2)", - "tab": "General information", - "score": 301.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=1.448 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.621, mean=0.621, max=0.621, sum=1.242 (2)", - "tab": "Efficiency", - "score": 0.620852176074324 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=466.786, mean=466.786, max=466.786, sum=933.572 (2)", - "tab": "General information", - "score": 466.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.577, - "details": { - "description": "min=0.577, mean=0.577, max=0.577, sum=1.153 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.783, mean=0.783, max=0.783, sum=1.566 (2)", - "tab": "Efficiency", - "score": 0.7831445295343954 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=589.341, mean=589.341, max=589.341, sum=1178.683 (2)", - "tab": "General information", - "score": 589.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.492, - "details": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.984 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.805, mean=0.805, max=0.805, sum=1.61 (2)", - "tab": "Efficiency", - "score": 0.804882182015313 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=611.563, mean=611.563, max=611.563, sum=1223.127 (2)", - "tab": "General information", - "score": 611.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.302 (2)", - "tab": "Efficiency", - "score": 0.6510615141161027 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Efficiency", - "score": 0.6597568284114593 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=1.159, mean=1.159, max=1.159, sum=2.317 (2)", - "tab": "Efficiency", - "score": 1.1585216951370239 - }, - "High School European History - Observed inference time (s)": { - "description": "min=3.986, mean=3.986, max=3.986, sum=7.972 (2)", 
- "tab": "Efficiency", - "score": 3.9859177892858333 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.638, mean=0.638, max=0.638, sum=1.276 (2)", - "tab": "Efficiency", - "score": 0.6379079361154576 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.719, mean=0.719, max=0.719, sum=1.438 (2)", - "tab": "Efficiency", - "score": 0.7190980182410521 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.646, mean=0.646, max=0.646, sum=1.292 (2)", - "tab": "Efficiency", - "score": 0.6461667580482288 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Efficiency", - "score": 0.8891835009610212 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.364 (2)", - "tab": "Efficiency", - "score": 0.6818269651477077 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.81 (2)", - "tab": "Efficiency", - "score": 0.9050559808086875 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.746, mean=0.746, max=0.746, sum=1.491 (2)", - "tab": "Efficiency", - "score": 0.7455598682438561 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=1.279, mean=1.279, max=1.279, sum=2.558 (2)", - "tab": "Efficiency", - "score": 1.278907789124383 - }, - "High School US History - Observed inference time (s)": { - "description": "min=3.106, mean=3.106, max=3.106, sum=6.212 (2)", - "tab": "Efficiency", - "score": 3.1062067454936457 - }, - "High School World History - Observed inference time (s)": { - "description": "min=2.068, mean=2.068, max=2.068, sum=4.137 (2)", - "tab": "Efficiency", - "score": 2.0682604393375574 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=492.958, mean=492.958, max=492.958, sum=985.916 (2)", - "tab": "General information", - "score": 492.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=505.064, mean=505.064, max=505.064, sum=1010.128 (2)", - "tab": "General information", - "score": 505.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": 
"min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=927.13, mean=927.13, max=927.13, sum=1854.26 (2)", - "tab": "General information", - "score": 927.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2789.424, mean=2789.424, max=2789.424, sum=5578.848 (2)", - "tab": "General information", - "score": 2789.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=386.773, mean=386.773, max=386.773, sum=773.545 (2)", - "tab": "General information", - "score": 386.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=471.301, mean=471.301, max=471.301, sum=942.601 (2)", - "tab": "General information", - "score": 471.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=388.541, mean=388.541, max=388.541, sum=777.082 (2)", - "tab": "General information", - "score": 388.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=558.822, mean=558.822, max=558.822, sum=1117.644 (2)", - "tab": "General information", - "score": 558.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=407.954, mean=407.954, max=407.954, sum=815.908 (2)", - "tab": "General information", - "score": 407.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=583.715, mean=583.715, max=583.715, sum=1167.43 (2)", - "tab": "General information", - "score": 583.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=494.604, mean=494.604, max=494.604, sum=989.207 (2)", - "tab": "General information", - "score": 494.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 
(2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=850.931, mean=850.931, max=850.931, sum=1701.861 (2)", - "tab": "General information", - "score": 850.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2197.583, mean=2197.583, max=2197.583, sum=4395.167 (2)", - "tab": "General information", - "score": 2197.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1418.544, mean=1418.544, max=1418.544, sum=2837.089 (2)", - "tab": "General information", - "score": 1418.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.095 (2)", - "tab": "Efficiency", - "score": 0.5475642894950148 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.564, mean=0.564, max=0.564, sum=1.129 (2)", - "tab": "Efficiency", 
- "score": 0.5644530576604013 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=313.587, mean=313.587, max=313.587, sum=627.175 (2)", - "tab": "General information", - "score": 313.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=339.183, mean=339.183, max=339.183, sum=678.366 (2)", - "tab": "General information", - "score": 339.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.669 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.956, mean=0.956, max=0.956, sum=1.911 (2)", - "tab": "Efficiency", - "score": 0.9556485384948983 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=636.165, mean=636.165, max=636.165, sum=1272.331 (2)", - "tab": "General information", - "score": 636.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.632 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.699, mean=0.699, max=0.699, sum=1.398 (2)", - "tab": "Efficiency", - "score": 0.6992296397320332 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.049, mean=442.049, max=442.049, sum=884.098 (2)", - "tab": "General information", - "score": 442.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.018 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.048, mean=1.048, max=1.048, sum=2.096 (2)", - "tab": "Efficiency", - "score": 1.0480207417692458 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=694.402, mean=694.402, max=694.402, sum=1388.804 (2)", - "tab": "General information", - "score": 694.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.874, - "details": { - "description": "min=0.874, mean=0.874, max=0.874, sum=1.748 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.019 (2)", - "tab": "Efficiency", - "score": 0.5093999186765801 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=273.301, mean=273.301, max=273.301, sum=546.602 (2)", - "tab": "General information", - "score": 273.3009708737864 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.919, - "details": { - "description": "min=0.919, mean=0.919, max=0.919, sum=1.838 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.394 (2)", - "tab": "Efficiency", - "score": 0.6969545549816556 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=420.35, mean=420.35, max=420.35, sum=840.701 (2)", - "tab": "General information", - "score": 420.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.534, mean=0.534, max=0.534, sum=1.067 (2)", - "tab": "Efficiency", - "score": 0.5335883450508118 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=330.89, mean=330.89, max=330.89, sum=661.78 (2)", - "tab": "General information", - "score": 330.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.844, - "details": { - "description": "min=0.844, mean=0.844, max=0.844, sum=1.688 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.073 (2)", - "tab": "Efficiency", - "score": 0.5363688258832442 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=306.669, mean=306.669, max=306.669, sum=613.338 (2)", - "tab": "General information", - "score": 306.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295, - "details": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.59 (2)", 
- "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.734, mean=0.734, max=0.734, sum=1.468 (2)", - "tab": "Efficiency", - "score": 0.7340341696160377 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=1.057, mean=1.057, max=1.057, sum=2.114 (2)", - "tab": "Efficiency", - "score": 1.0570912433070176 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=487.003, mean=487.003, max=487.003, sum=974.006 (2)", - "tab": "General information", - "score": 487.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=682.542, mean=682.542, max=682.542, sum=1365.084 (2)", - "tab": "General information", - "score": 682.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.775, mean=0.775, max=0.775, sum=1.549 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.779 (2)", - "tab": "Efficiency", - "score": 0.8894402412028094 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=577.48, mean=577.48, max=577.48, sum=1154.961 (2)", - "tab": "General information", - "score": 577.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.623 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.846, mean=0.846, max=0.846, sum=1.691 (2)", - "tab": "Efficiency", - "score": 0.8456013467576768 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=532.198, mean=532.198, max=532.198, sum=1064.395 (2)", - "tab": "General information", - "score": 532.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.473 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.698, mean=0.698, max=0.698, sum=1.395 (2)", - "tab": "Efficiency", - "score": 0.6977464697577737 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=418.655, mean=418.655, max=418.655, sum=837.309 (2)", - "tab": "General information", - "score": 418.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - 
"evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.559 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=1.737, mean=1.737, max=1.737, sum=3.473 (2)", - "tab": "Efficiency", - "score": 1.7365190982818604 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1185.869, mean=1185.869, max=1185.869, sum=2371.739 (2)", - "tab": "General information", - "score": 1185.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.423 (2)", - "tab": "Efficiency", - "score": 0.7115461138350454 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=448.274, mean=448.274, max=448.274, sum=896.547 (2)", - "tab": "General information", - "score": 448.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.571, mean=0.571, max=0.571, sum=1.142 (2)", - "tab": "Efficiency", - "score": 0.571121395352375 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=328.753, mean=328.753, max=328.753, sum=657.506 (2)", - "tab": "General information", - "score": 328.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.719 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.895 (2)", - "tab": "Efficiency", - "score": 0.44760305142542073 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=260.164, mean=260.164, max=260.164, sum=520.327 (2)", - "tab": "General information", - "score": 260.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on 
average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.265, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json b/data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json deleted file mode 100644 index 963d13c9a..000000000 --- a/data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemma-7b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma 7B", - "id": "google/gemma-7b", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.28, mean=0.661, max=0.891, sum=75.376 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.238, mean=0.312, max=0.614, sum=35.566 (114)", - "tab": "Efficiency", - "score": 0.3119781121356026 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=260.164, mean=624.617, max=2789.424, sum=71206.345 (114)", - "tab": "General information", - "score": 624.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - 
"high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.28, - "details": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.56 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.543 (2)", - "tab": "Efficiency", - "score": 0.27131984949111937 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=375.97, mean=375.97, max=375.97, sum=751.94 (2)", - "tab": "General 
information", - "score": 375.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.563, - "details": { - "description": "min=0.563, mean=0.563, max=0.563, sum=1.126 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.587 (2)", - "tab": "Efficiency", - "score": 0.2935627672407362 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=336.356, mean=336.356, max=336.356, sum=672.711 (2)", - "tab": "General information", - "score": 336.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412, - "details": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.824 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.534 (2)", - "tab": "Efficiency", - "score": 0.26709758281707763 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.592 (2)", - "tab": "Efficiency", - "score": 0.2961096896065606 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Efficiency", - "score": 0.2900628304481506 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.298998281955719 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.255, mean=0.255, max=0.255, sum=0.51 (2)", - "tab": "Efficiency", - "score": 0.25478591119622906 - }, - "College Physics - Observed inference time (s)": { - 
"description": "min=0.614, mean=0.614, max=0.614, sum=1.229 (2)", - "tab": "Efficiency", - "score": 0.614474796781353 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=562.02, mean=562.02, max=562.02, sum=1124.04 (2)", - "tab": "General information", - "score": 562.02 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=474.799, mean=474.799, max=474.799, sum=949.597 (2)", - "tab": "General information", - "score": 474.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=849.86, mean=849.86, max=849.86, sum=1699.72 (2)", - "tab": "General information", - "score": 849.86 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=618.69, mean=618.69, max=618.69, sum=1237.38 (2)", - "tab": "General information", - "score": 618.69 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=505.37, mean=505.37, max=505.37, sum=1010.74 (2)", - "tab": "General information", - "score": 505.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=499.471, mean=499.471, max=499.471, sum=998.941 (2)", - "tab": "General information", - "score": 499.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.251, mean=0.251, max=0.251, sum=0.503 (2)", - "tab": "Efficiency", - "score": 0.2512932848930359 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=372.91, mean=372.91, max=372.91, sum=745.82 (2)", - "tab": "General information", - "score": 372.91 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474, - "details": { - "description": "min=0.474, mean=0.474, max=0.474, 
sum=0.947 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.569 (2)", - "tab": "Efficiency", - "score": 0.28468057565521776 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=626.553, mean=626.553, max=626.553, sum=1253.105 (2)", - "tab": "General information", - "score": 626.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42, - "details": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.591 (2)", - "tab": "Efficiency", - "score": 0.2956829309463501 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=448.54, mean=448.54, max=448.54, sum=897.08 (2)", - "tab": "General information", - "score": 448.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.537 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.521 (2)", - "tab": "Efficiency", - "score": 0.26035096910264754 - }, - 
"Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=399.87, mean=399.87, max=399.87, sum=799.741 (2)", - "tab": "General information", - "score": 399.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.453 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.552 (2)", - "tab": "Efficiency", - "score": 0.276187143141817 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=332.907, mean=332.907, max=332.907, sum=665.814 (2)", - "tab": "General information", - "score": 332.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.712, - "details": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.425 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.3106422327897128 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.583 (2)", - "tab": "Efficiency", - "score": 0.2916089237159026 - }, - 
"Professional Law - Observed inference time (s)": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.77 (2)", - "tab": "Efficiency", - "score": 0.38496507379812867 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.261, mean=0.261, max=0.261, sum=0.522 (2)", - "tab": "Efficiency", - "score": 0.26078930010203444 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1105.092, mean=1105.092, max=1105.092, sum=2210.184 (2)", - "tab": "General information", - "score": 1105.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=747.418, mean=747.418, max=747.418, sum=1494.837 (2)", - "tab": "General information", - "score": 747.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1677.119, mean=1677.119, max=1677.119, sum=3354.239 (2)", - "tab": "General information", - "score": 1677.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=586.363, mean=586.363, max=586.363, sum=1172.725 (2)", - "tab": "General information", - "score": 586.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.586 (2)", - "tab": "Efficiency", - "score": 0.29293906927108765 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=430.2, mean=430.2, max=430.2, sum=860.4 (2)", - "tab": "General information", - "score": 430.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.434 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Efficiency", - "score": 0.2697504366699018 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=594.421, mean=594.421, max=594.421, sum=1188.842 (2)", - "tab": "General information", - "score": 594.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": 
"Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.297854323387146 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=544.87, mean=544.87, max=544.87, sum=1089.74 (2)", - "tab": "General information", - "score": 544.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.698, mean=0.698, max=0.698, sum=1.396 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.515 (2)", - "tab": "Efficiency", - "score": 0.25743662816173624 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=394.592, mean=394.592, max=394.592, sum=789.185 (2)", - "tab": "General information", - "score": 394.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621, - "details": { - "description": "min=0.621, mean=0.621, max=0.621, sum=1.243 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.498 (2)", - "tab": "Efficiency", - "score": 0.24898753064744017 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=301.213, mean=301.213, max=301.213, sum=602.426 (2)", - "tab": "General information", - "score": 301.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.628, - "details": { - "description": "min=0.628, mean=0.628, max=0.628, sum=1.255 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.508 (2)", - "tab": "Efficiency", - "score": 0.25389171797653726 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=466.786, mean=466.786, max=466.786, sum=933.572 (2)", - "tab": "General information", - "score": 466.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.516, - "details": { - "description": "min=0.516, mean=0.516, max=0.516, sum=1.032 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.573 (2)", - "tab": "Efficiency", - "score": 0.28658196219691523 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=589.341, mean=589.341, max=589.341, sum=1178.683 (2)", - "tab": "General information", - "score": 589.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508, - "details": { - "description": "min=0.508, mean=0.508, max=0.508, sum=1.016 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.097 (2)", - "tab": "Efficiency", - "score": 0.5483344452721732 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=611.563, mean=611.563, max=611.563, sum=1223.127 (2)", - "tab": "General information", - "score": 611.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.713 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.251, mean=0.251, max=0.251, sum=0.502 (2)", - "tab": "Efficiency", - "score": 0.2509724578549785 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.584 (2)", - "tab": "Efficiency", - "score": 0.2920628909406991 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.3299814939498901 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.553, mean=0.553, max=0.553, sum=1.107 (2)", - "tab": "Efficiency", - "score": 0.5534277785908092 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.943 (2)", - "tab": "Efficiency", - "score": 0.47140675602537213 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.565 (2)", - "tab": "Efficiency", - "score": 0.28242908734731725 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.632 (2)", - "tab": "Efficiency", - "score": 0.3160711630796775 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.512 (2)", - "tab": "Efficiency", - "score": 0.25601085556877984 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.251, mean=0.251, max=0.251, sum=0.503 (2)", - "tab": "Efficiency", - "score": 0.25132194386810813 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.3394651823485924 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.3483087859022508 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.632 (2)", - "tab": "Efficiency", - "score": 0.31601137033215276 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.905 (2)", - "tab": "Efficiency", - "score": 0.4523548308540793 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.34174740565980033 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # 
prompt tokens": { - "description": "min=492.958, mean=492.958, max=492.958, sum=985.916 (2)", - "tab": "General information", - "score": 492.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=505.064, mean=505.064, max=505.064, sum=1010.128 (2)", - "tab": "General information", - "score": 505.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=927.13, mean=927.13, max=927.13, sum=1854.26 (2)", - "tab": "General information", - "score": 927.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2789.424, mean=2789.424, max=2789.424, sum=5578.848 (2)", - "tab": "General information", - "score": 2789.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=386.773, mean=386.773, max=386.773, sum=773.545 (2)", - "tab": "General information", - "score": 386.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=471.301, mean=471.301, max=471.301, sum=942.601 (2)", - "tab": "General information", - "score": 471.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=388.541, mean=388.541, max=388.541, sum=777.082 (2)", - "tab": "General information", - "score": 388.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=558.822, mean=558.822, max=558.822, sum=1117.644 (2)", - "tab": "General information", - "score": 558.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=407.954, mean=407.954, max=407.954, sum=815.908 (2)", - "tab": "General information", - "score": 407.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=583.715, mean=583.715, max=583.715, sum=1167.43 (2)", - "tab": "General information", - "score": 583.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=494.604, mean=494.604, max=494.604, sum=989.207 (2)", - "tab": "General information", - "score": 494.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=850.931, mean=850.931, max=850.931, sum=1701.861 (2)", - "tab": "General information", - "score": 850.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2197.583, mean=2197.583, max=2197.583, sum=4395.167 (2)", - "tab": "General information", - "score": 2197.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1418.544, mean=1418.544, max=1418.544, sum=2837.089 (2)", - "tab": "General information", - "score": 1418.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.812 (2)", - "tab": "Efficiency", - "score": 0.4062144061375092 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.476 (2)", - "tab": "Efficiency", - "score": 0.23785374910776852 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=313.587, mean=313.587, max=313.587, sum=627.175 (2)", - "tab": "General information", - "score": 313.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=339.183, mean=339.183, max=339.183, sum=678.366 (2)", - "tab": "General information", - "score": 339.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.669 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, 
max=0.292, sum=0.584 (2)", - "tab": "Efficiency", - "score": 0.2918710767730208 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=636.165, mean=636.165, max=636.165, sum=1272.331 (2)", - "tab": "General information", - "score": 636.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=1.485 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.954 (2)", - "tab": "Efficiency", - "score": 0.47711458089161507 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.049, mean=442.049, max=442.049, sum=884.098 (2)", - "tab": "General information", - "score": 442.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.107 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.265, mean=0.265, max=0.265, sum=0.529 (2)", - "tab": "Efficiency", - "score": 0.2645489977938788 - }, - "Machine Learning - 
# eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=694.402, mean=694.402, max=694.402, sum=1388.804 (2)", - "tab": "General information", - "score": 694.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.587 (2)", - "tab": "Efficiency", - "score": 0.293421483734279 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=273.301, mean=273.301, max=273.301, sum=546.602 (2)", - "tab": "General information", - "score": 273.3009708737864 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.769 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.507 (2)", - "tab": "Efficiency", - "score": 0.25355013211568195 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=420.35, mean=420.35, max=420.35, sum=840.701 (2)", - "tab": "General information", - "score": 420.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.262, mean=0.262, max=0.262, sum=0.524 (2)", - "tab": "Efficiency", - "score": 0.26187997102737426 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=330.89, mean=330.89, max=330.89, sum=661.78 (2)", - "tab": "General information", - "score": 330.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=1.676 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.245, mean=0.245, max=0.245, sum=0.49 (2)", - "tab": "Efficiency", - "score": 0.24482133348935103 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # 
prompt tokens": { - "description": "min=306.669, mean=306.669, max=306.669, sum=613.338 (2)", - "tab": "General information", - "score": 306.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.377, - "details": { - "description": "min=0.377, mean=0.377, max=0.377, sum=0.753 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.508 (2)", - "tab": "Efficiency", - "score": 0.2542355225954442 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.784 (2)", - "tab": "Efficiency", - "score": 0.39224682173915415 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=487.003, mean=487.003, max=487.003, sum=974.006 (2)", - "tab": "General information", - "score": 487.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=682.542, mean=682.542, max=682.542, sum=1365.084 (2)", - "tab": "General information", - "score": 682.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.3507605791091919 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=577.48, mean=577.48, max=577.48, sum=1154.961 (2)", - "tab": "General information", - "score": 577.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=1.512 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.509 (2)", - "tab": "Efficiency", - "score": 0.25446349014470604 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=532.198, mean=532.198, max=532.198, sum=1064.395 (2)", - "tab": "General information", - "score": 532.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.364 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.248, mean=0.248, 
max=0.248, sum=0.495 (2)", - "tab": "Efficiency", - "score": 0.24754605726762252 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=418.655, mean=418.655, max=418.655, sum=837.309 (2)", - "tab": "General information", - "score": 418.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=1.469 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": "Efficiency", - "score": 0.30012765806548447 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1185.869, mean=1185.869, max=1185.869, sum=2371.739 (2)", - "tab": "General information", - "score": 1185.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - "details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.682 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.586 (2)", - "tab": "Efficiency", - "score": 0.29275026487473826 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, 
max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=448.274, mean=448.274, max=448.274, sum=896.547 (2)", - "tab": "General information", - "score": 448.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.096 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.501 (2)", - "tab": "Efficiency", - "score": 0.2502512199332915 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=328.753, mean=328.753, max=328.753, sum=657.506 (2)", - "tab": "General information", - "score": 328.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.498 (2)", - "tab": "Efficiency", - "score": 0.24913478734200462 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=260.164, mean=260.164, max=260.164, sum=520.327 (2)", - "tab": "General information", - "score": 260.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json b/data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json deleted file mode 100644 index c0271bcb3..000000000 --- a/data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_text-bison@001/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PaLM-2 Bison", - "id": "google/text-bison@001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.692, - "details": { - "description": "min=0.331, mean=0.692, max=0.927, sum=78.899 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.619, mean=1.845, max=23.541, sum=210.314 (114)", - "tab": "Efficiency", - "score": 1.8448593983042894 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=270.187, mean=635.61, max=2823.23, sum=72459.527 (114)", - "tab": "General information", - "score": 635.6098850770794 - }, - "MMLU All Subjects - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract 
Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=1.017, mean=1.017, max=1.017, sum=2.033 (2)", - "tab": "Efficiency", - "score": 1.0166235256195069 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=387.12, mean=387.12, max=387.12, sum=774.24 (2)", - "tab": "General information", - "score": 387.12 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { - "description": "min=0.644, mean=0.644, max=0.644, sum=1.289 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.673 (2)", - "tab": "Efficiency", - "score": 0.836542272567749 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.089, mean=344.089, max=344.089, sum=688.178 (2)", - "tab": "General information", - "score": 344.0888888888889 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": 
"Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=1.352, mean=1.352, max=1.352, sum=2.704 (2)", - "tab": "Efficiency", - "score": 1.3518596124649047 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.862, mean=0.862, max=0.862, sum=1.724 (2)", - "tab": "Efficiency", - "score": 0.8619864102866914 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=23.541, mean=23.541, max=23.541, sum=47.082 (2)", - "tab": "Efficiency", - "score": 23.54095259666443 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.931, mean=0.931, max=0.931, sum=1.862 (2)", - "tab": "Efficiency", - "score": 0.9307789158821106 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.947, mean=0.947, max=0.947, sum=1.894 (2)", - "tab": "Efficiency", - "score": 0.9472322174579422 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.856 (2)", - "tab": "Efficiency", - "score": 0.9281005485385072 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=573.7, mean=573.7, max=573.7, sum=1147.4 (2)", - "tab": "General information", - "score": 573.7 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=480.875, mean=480.875, max=480.875, sum=961.75 (2)", - "tab": "General information", - "score": 480.875 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=861.96, mean=861.96, max=861.96, sum=1723.92 (2)", - "tab": "General information", - "score": 861.96 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": 
{ - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=636.94, mean=636.94, max=636.94, sum=1273.88 (2)", - "tab": "General information", - "score": 636.94 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=512.584, mean=512.584, max=512.584, sum=1025.168 (2)", - "tab": "General information", - "score": 512.5838150289018 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=513.647, mean=513.647, max=513.647, sum=1027.294 (2)", - "tab": "General information", - "score": 513.6470588235294 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.48 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.044, mean=1.044, max=1.044, sum=2.088 (2)", - "tab": "Efficiency", - "score": 1.0440657019615174 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=384.24, mean=384.24, 
max=384.24, sum=768.48 (2)", - "tab": "General information", - "score": 384.24 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518, - "details": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.035 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=1.047, mean=1.047, max=1.047, sum=2.094 (2)", - "tab": "Efficiency", - "score": 1.04721718921996 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=644.395, mean=644.395, max=644.395, sum=1288.789 (2)", - "tab": "General information", - "score": 644.3947368421053 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)", - "tab": "Efficiency", - "score": 0.9128784847259521 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=455.63, mean=455.63, max=455.63, sum=911.26 (2)", - "tab": "General information", - "score": 455.63 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.537 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.768 (2)", - "tab": "Efficiency", - "score": 0.8838474772594593 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=414.444, mean=414.444, max=414.444, sum=828.889 (2)", - "tab": "General information", - "score": 414.44444444444446 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.473 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.716, mean=0.716, max=0.716, sum=1.432 (2)", - "tab": "Efficiency", - "score": 0.7159656282406528 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=339.093, mean=339.093, max=339.093, sum=678.186 (2)", - "tab": "General information", - "score": 339.09324758842445 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - 
{ - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761, - "details": { - "description": "min=0.761, mean=0.761, max=0.761, sum=1.523 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=8.281, mean=8.281, max=8.281, sum=16.562 (2)", - "tab": "Efficiency", - "score": 8.280891868998022 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.624 (2)", - "tab": "Efficiency", - "score": 0.8122333144465237 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.634, mean=0.634, max=0.634, sum=1.268 (2)", - "tab": "Efficiency", - "score": 0.6340693978318335 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.394 (2)", - "tab": "Efficiency", - "score": 0.6971427946308859 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1104.614, mean=1104.614, max=1104.614, sum=2209.228 (2)", - "tab": "General information", - "score": 1104.6139705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=752.83, mean=752.83, max=752.83, sum=1505.66 (2)", - "tab": "General information", - "score": 752.8297872340426 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1701.909, mean=1701.909, max=1701.909, sum=3403.819 (2)", - "tab": "General information", - "score": 1701.9093872229466 - }, - "Professional Law - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.446, mean=594.446, max=594.446, sum=1188.892 (2)", - "tab": "General information", - "score": 594.4460784313726 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=1.101, mean=1.101, max=1.101, sum=2.202 (2)", - "tab": "Efficiency", - "score": 1.1012366461753844 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=440.48, mean=440.48, max=440.48, sum=880.96 (2)", - "tab": "General information", - "score": 440.48 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.803, - "details": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.605 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.715, mean=0.715, max=0.715, sum=1.43 (2)", - "tab": "Efficiency", - "score": 
0.7148221495904421 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=613.033, mean=613.033, max=613.033, sum=1226.066 (2)", - "tab": "General information", - "score": 613.0328947368421 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.785 (2)", - "tab": "Efficiency", - "score": 0.8926668572425842 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=559.31, mean=559.31, max=559.31, sum=1118.62 (2)", - "tab": "General information", - "score": 559.31 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "details": { - "description": "min=0.725, mean=0.725, max=0.725, sum=1.449 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.541 (2)", - "tab": "Efficiency", - "score": 0.7704581980435353 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=394.77, mean=394.77, max=394.77, sum=789.54 (2)", - "tab": "General information", - "score": 394.76981132075474 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.387 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.656 (2)", - "tab": "Efficiency", - "score": 0.8279458959051903 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.477, mean=309.477, max=309.477, sum=618.953 (2)", - "tab": "General information", - "score": 309.4765957446809 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.379 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=10.257, mean=10.257, max=10.257, sum=20.514 (2)", - "tab": "Efficiency", - "score": 10.257030944166512 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=480.524, mean=480.524, max=480.524, sum=961.048 (2)", - "tab": "General information", - "score": 480.5241379310345 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.487, - "details": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.974 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Efficiency", - "score": 0.7508898708555434 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=599.828, mean=599.828, max=599.828, sum=1199.656 (2)", - "tab": "General information", - "score": 599.8280423280423 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.525 (2)", - "tab": "Efficiency", - "score": 0.7626136711665562 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=623.508, mean=623.508, max=623.508, sum=1247.016 (2)", - "tab": "General information", - "score": 623.5079365079365 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=1.738 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.577 (2)", - "tab": "Efficiency", - "score": 0.7886250380546816 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.675 (2)", - "tab": "Efficiency", - "score": 0.8373666197208348 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.961, mean=0.961, max=0.961, sum=1.922 (2)", - "tab": "Efficiency", - "score": 0.9611564636230469 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.13, mean=1.13, max=1.13, sum=2.26 (2)", - "tab": "Efficiency", - "score": 1.129964493260239 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.754, mean=0.754, max=0.754, sum=1.508 (2)", - "tab": "Efficiency", - "score": 0.7538033362590906 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.375 (2)", - "tab": "Efficiency", - "score": 0.6876482963562012 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.437 (2)", - "tab": "Efficiency", - "score": 0.7183168649673461 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.782, mean=0.782, max=0.782, sum=1.564 (2)", - "tab": "Efficiency", - "score": 0.7819750944773356 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.603 (2)", - "tab": "Efficiency", - "score": 0.8016475258755082 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.721 (2)", - "tab": "Efficiency", - "score": 0.860422892286288 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.35 (2)", - "tab": "Efficiency", - "score": 0.6752404208577008 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=9.407, mean=9.407, max=9.407, sum=18.814 (2)", - "tab": "Efficiency", - "score": 9.407231820954216 - }, - "High School US History - Observed inference time (s)": { - "description": 
"min=1.054, mean=1.054, max=1.054, sum=2.109 (2)", - "tab": "Efficiency", - "score": 1.0542718312319588 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.695 (2)", - "tab": "Efficiency", - "score": 0.8476851751029743 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=501.255, mean=501.255, max=501.255, sum=1002.51 (2)", - "tab": "General information", - "score": 501.2548387096774 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=515.473, mean=515.473, max=515.473, sum=1030.946 (2)", - "tab": "General information", - "score": 515.4729064039409 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=954.08, mean=954.08, max=954.08, sum=1908.16 (2)", - "tab": "General information", - "score": 954.08 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2823.23, mean=2823.23, max=2823.23, sum=5646.461 (2)", - "tab": "General information", - "score": 2823.230303030303 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, 
sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=392.939, mean=392.939, max=392.939, sum=785.879 (2)", - "tab": "General information", - "score": 392.93939393939394 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=475.44, mean=475.44, max=475.44, sum=950.881 (2)", - "tab": "General information", - "score": 475.440414507772 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=395.962, mean=395.962, max=395.962, sum=791.923 (2)", - "tab": "General information", - "score": 395.96153846153845 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=580.393, mean=580.393, max=580.393, sum=1160.785 (2)", - "tab": "General information", - "score": 580.3925925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=414.361, mean=414.361, max=414.361, sum=828.723 (2)", - "tab": "General information", - "score": 414.3613445378151 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=592.252, mean=592.252, max=592.252, sum=1184.503 (2)", - "tab": "General information", - "score": 592.2516556291391 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=496.51, mean=496.51, max=496.51, sum=993.02 (2)", - "tab": "General information", - "score": 496.5100917431193 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=860.532, mean=860.532, max=860.532, sum=1721.065 (2)", - "tab": "General information", - "score": 860.5324074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2239.544, mean=2239.544, max=2239.544, sum=4479.088 (2)", - "tab": "General information", - "score": 2239.544117647059 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World 
History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1437.051, mean=1437.051, max=1437.051, sum=2874.101 (2)", - "tab": "General information", - "score": 1437.0506329113923 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.619, mean=0.619, max=0.619, sum=1.237 (2)", - "tab": "Efficiency", - "score": 0.6185014632785267 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Efficiency", - "score": 0.8510732850955642 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=323.906, mean=323.906, max=323.906, sum=647.812 (2)", - "tab": "General information", - "score": 323.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=338.74, mean=338.74, max=338.74, sum=677.481 (2)", - "tab": "General information", - "score": 338.74045801526717 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.669 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.859 (2)", - "tab": "Efficiency", - "score": 0.929545400556454 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=651.686, mean=651.686, max=651.686, sum=1303.372 (2)", - "tab": "General information", - "score": 651.6859504132232 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.813, mean=0.813, max=0.813, sum=1.627 (2)", - "tab": "Efficiency", - "score": 0.8133661293544652 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=443.969, mean=443.969, max=443.969, sum=887.939 (2)", - "tab": "General information", - "score": 443.96932515337426 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.125 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.916, mean=0.916, max=0.916, sum=1.832 (2)", - "tab": "Efficiency", - "score": 0.9159843921661377 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=705.973, mean=705.973, max=705.973, sum=1411.946 (2)", - "tab": "General information", - "score": 705.9732142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.817, mean=0.817, max=0.817, sum=1.633 (2)", - "tab": "Efficiency", - "score": 0.8166041281616804 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=284.68, mean=284.68, max=284.68, sum=569.359 (2)", - "tab": "General information", - "score": 284.6796116504854 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", - "tab": "Efficiency", - "score": 0.789409975720267 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.726, mean=428.726, max=428.726, sum=857.453 (2)", - "tab": "General information", - "score": 428.7264957264957 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.713 (2)", - "tab": "Efficiency", - "score": 0.8565307760238647 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=334.69, mean=334.69, max=334.69, sum=669.38 (2)", - "tab": "General information", - "score": 334.69 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.866, - "details": { - "description": "min=0.866, mean=0.866, max=0.866, sum=1.732 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=2.759, mean=2.759, max=2.759, sum=5.518 (2)", - "tab": "Efficiency", - "score": 2.7590373143991442 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=325.215, mean=325.215, max=325.215, sum=650.429 (2)", - "tab": "General information", - "score": 325.2145593869732 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.369, - "details": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.737 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.749, mean=0.749, max=0.749, sum=1.497 (2)", - "tab": "Efficiency", - "score": 0.7485969907286539 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.781, mean=0.781, max=0.781, sum=1.561 (2)", - "tab": "Efficiency", - "score": 0.7806768483955767 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=494.63, mean=494.63, max=494.63, sum=989.26 (2)", - "tab": "General information", - "score": 494.6300578034682 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=687.566, mean=687.566, max=687.566, 
sum=1375.133 (2)", - "tab": "General information", - "score": 687.5664804469274 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.621 (2)", - "tab": "Efficiency", - "score": 0.8104506489498163 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=589.663, mean=589.663, max=589.663, sum=1179.327 (2)", - "tab": "General information", - "score": 589.6633986928105 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.623 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.399 (2)", - "tab": "Efficiency", - "score": 0.6996216737193826 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=538.179, mean=538.179, max=538.179, sum=1076.358 (2)", - "tab": "General information", - "score": 538.179012345679 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.382 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.98, mean=0.98, max=0.98, sum=1.961 (2)", - "tab": "Efficiency", - "score": 0.980262413891879 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.982, mean=426.982, max=426.982, sum=853.964 (2)", - "tab": "General information", - "score": 426.9818181818182 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.624 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.713 (2)", - "tab": "Efficiency", - "score": 0.8567250339352355 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1185.8, mean=1185.8, max=1185.8, sum=2371.6 (2)", - "tab": "General information", - "score": 1185.8 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.841 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=7.515, mean=7.515, max=7.515, sum=15.029 (2)", - "tab": "Efficiency", - "score": 7.514506837028769 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=459.642, mean=459.642, max=459.642, sum=919.284 (2)", - "tab": "General information", - "score": 459.64179104477614 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494, - "details": { - "description": "min=0.494, mean=0.494, max=0.494, sum=0.988 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.577 (2)", - "tab": "Efficiency", - "score": 0.7884655989796282 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=337.06, mean=337.06, max=337.06, sum=674.12 (2)", - "tab": "General information", - "score": 337.06024096385545 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" 
- ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.766 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.604 (2)", - "tab": "Efficiency", - "score": 0.8022187299895704 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=270.187, mean=270.187, max=270.187, sum=540.374 (2)", - "tab": "General information", - "score": 270.187134502924 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.192, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json b/data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json deleted file mode 100644 index 42c5040aa..000000000 --- a/data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_text-unicorn@001/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PaLM-2 Unicorn", - "id": "google/text-unicorn@001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.493, mean=0.786, max=0.979, sum=89.606 (114)", - "tab": "Accuracy", - "MMLU All 
Subjects - Observed inference time (s)": { - "description": "min=0.743, mean=1.052, max=2.108, sum=119.953 (114)", - "tab": "Efficiency", - "score": 1.0522220782452074 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=270.187, mean=635.61, max=2823.23, sum=72459.527 (114)", - "tab": "General information", - "score": 635.6098850770794 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - 
"mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=1.277, mean=1.277, max=1.277, sum=2.555 (2)", - "tab": "Efficiency", - "score": 1.2773328518867493 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=387.12, mean=387.12, max=387.12, sum=774.24 (2)", - "tab": "General information", - "score": 387.12 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.467 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.957, mean=0.957, max=0.957, sum=1.914 (2)", - "tab": "Efficiency", - "score": 0.9569159172199391 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.089, mean=344.089, max=344.089, sum=688.178 (2)", - "tab": "General information", - "score": 344.0888888888889 - }, - "Anatomy - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549, - "details": { - "description": "min=0.549, mean=0.549, max=0.549, sum=1.098 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.869 (2)", - "tab": "Efficiency", - "score": 0.9343120718002319 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.873, mean=0.873, max=0.873, sum=1.746 (2)", - "tab": "Efficiency", - "score": 0.8729922622442245 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.165, mean=1.165, max=1.165, sum=2.33 (2)", - "tab": "Efficiency", - "score": 1.165095055103302 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=1.062, mean=1.062, max=1.062, sum=2.124 (2)", - "tab": "Efficiency", - "score": 1.0619186329841614 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.978, mean=0.978, max=0.978, sum=1.957 (2)", - "tab": "Efficiency", - "score": 0.978282785140021 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Efficiency", - "score": 0.8518095483966902 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=573.7, mean=573.7, max=573.7, sum=1147.4 (2)", - "tab": "General information", - "score": 573.7 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=480.875, mean=480.875, max=480.875, sum=961.75 (2)", - "tab": "General information", - "score": 480.875 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General 
information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=861.96, mean=861.96, max=861.96, sum=1723.92 (2)", - "tab": "General information", - "score": 861.96 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=636.94, mean=636.94, max=636.94, sum=1273.88 (2)", - "tab": "General information", - "score": 636.94 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=512.584, mean=512.584, max=512.584, sum=1025.168 (2)", - "tab": "General information", - "score": 512.5838150289018 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=513.647, mean=513.647, max=513.647, sum=1027.294 (2)", - "tab": "General information", - "score": 513.6470588235294 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.69 (2)", - "tab": "Efficiency", - "score": 0.8448482728004456 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=384.24, mean=384.24, max=384.24, sum=768.48 (2)", - "tab": "General information", - "score": 384.24 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.298 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Efficiency", - "score": 0.8522159112127203 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=644.395, mean=644.395, max=644.395, sum=1288.789 (2)", - "tab": "General information", - "score": 644.3947368421053 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference 
time (s)": { - "description": "min=0.888, mean=0.888, max=0.888, sum=1.775 (2)", - "tab": "Efficiency", - "score": 0.8876941871643066 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=455.63, mean=455.63, max=455.63, sum=911.26 (2)", - "tab": "General information", - "score": 455.63 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=1.017, mean=1.017, max=1.017, sum=2.034 (2)", - "tab": "Efficiency", - "score": 1.0168068651799802 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=414.444, mean=414.444, max=414.444, sum=828.889 (2)", - "tab": "General information", - "score": 414.44444444444446 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.672 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.895, mean=0.895, max=0.895, sum=1.79 (2)", - "tab": "Efficiency", - "score": 0.8949410808048064 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 
(2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=339.093, mean=339.093, max=339.093, sum=678.186 (2)", - "tab": "General information", - "score": 339.09324758842445 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.858, - "details": { - "description": "min=0.858, mean=0.858, max=0.858, sum=1.716 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.088, mean=1.088, max=1.088, sum=2.175 (2)", - "tab": "Efficiency", - "score": 1.0875138991019304 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.978, mean=0.978, max=0.978, sum=1.956 (2)", - "tab": "Efficiency", - "score": 0.9778145923682139 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.205, mean=1.205, max=1.205, sum=2.41 (2)", - "tab": "Efficiency", - "score": 1.204983455416743 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.578 (2)", - "tab": "Efficiency", - "score": 0.7891469753645604 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1104.614, mean=1104.614, max=1104.614, sum=2209.228 (2)", - "tab": "General information", - "score": 1104.6139705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=752.83, mean=752.83, max=752.83, sum=1505.66 (2)", - "tab": "General information", - 
"score": 752.8297872340426 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1701.909, mean=1701.909, max=1701.909, sum=3403.819 (2)", - "tab": "General information", - "score": 1701.9093872229466 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.446, mean=594.446, max=594.446, sum=1188.892 (2)", - "tab": "General information", - "score": 594.4460784313726 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.485 (2)", - "tab": "Efficiency", - "score": 0.7426803350448609 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=440.48, mean=440.48, max=440.48, sum=880.96 (2)", - "tab": "General information", - "score": 440.48 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - 
"method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=1.724 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", - "tab": "Efficiency", - "score": 0.8429784712038542 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=613.033, mean=613.033, max=613.033, sum=1226.066 (2)", - "tab": "General information", - "score": 613.0328947368421 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.018, mean=1.018, max=1.018, sum=2.035 (2)", - "tab": "Efficiency", - "score": 1.0176324987411498 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=559.31, mean=559.31, max=559.31, sum=1118.62 (2)", - "tab": "General information", - "score": 559.31 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=1.909 (2)", - "tab": "Efficiency", - "score": 0.9543584787620688 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=394.77, mean=394.77, max=394.77, sum=789.54 (2)", - "tab": "General information", - "score": 394.76981132075474 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.617 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.667 (2)", - "tab": "Efficiency", - "score": 0.8336589884250722 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.477, mean=309.477, max=309.477, sum=618.953 (2)", - "tab": "General information", - "score": 309.4765957446809 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.545 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=1.064, mean=1.064, max=1.064, sum=2.128 (2)", - "tab": "Efficiency", - "score": 1.0639554155283961 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=480.524, mean=480.524, max=480.524, sum=961.048 (2)", - "tab": "General information", - "score": 480.5241379310345 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.661, mean=0.661, max=0.661, sum=1.323 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=1.026, mean=1.026, max=1.026, sum=2.052 (2)", - "tab": "Efficiency", - "score": 1.0261994568759172 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=599.828, mean=599.828, max=599.828, sum=1199.656 (2)", - "tab": "General information", - "score": 599.8280423280423 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.317 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.016, mean=1.016, max=1.016, sum=2.032 (2)", - "tab": "Efficiency", - "score": 1.0157842484731523 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=623.508, mean=623.508, max=623.508, sum=1247.016 (2)", - "tab": "General information", - "score": 623.5079365079365 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.823 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=1.026, mean=1.026, max=1.026, sum=2.052 (2)", - "tab": "Efficiency", - "score": 1.026222055189071 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=1.054, mean=1.054, max=1.054, sum=2.109 (2)", - "tab": "Efficiency", - "score": 1.054317417990398 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=1.519, mean=1.519, max=1.519, sum=3.039 (2)", - "tab": "Efficiency", - "score": 1.519298493862152 - }, - "High School European History - Observed inference time (s)": { - "description": "min=2.108, mean=2.108, max=2.108, sum=4.215 (2)", - "tab": "Efficiency", - "score": 2.107529640197754 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=1.159, mean=1.159, max=1.159, sum=2.319 (2)", - "tab": "Efficiency", - "score": 1.1594982544581096 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=1.056, mean=1.056, max=1.056, sum=2.112 (2)", - "tab": "Efficiency", - "score": 1.0561638829621627 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.016, mean=1.016, max=1.016, sum=2.033 (2)", - "tab": "Efficiency", - "score": 1.0163854268880992 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=1.018, 
mean=1.018, max=1.018, sum=2.036 (2)", - "tab": "Efficiency", - "score": 1.0180342506479334 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.811 (2)", - "tab": "Efficiency", - "score": 0.9054926122937884 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=1.252, mean=1.252, max=1.252, sum=2.503 (2)", - "tab": "Efficiency", - "score": 1.2517439276966829 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=1.909 (2)", - "tab": "Efficiency", - "score": 0.9543260762450891 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=1.329, mean=1.329, max=1.329, sum=2.657 (2)", - "tab": "Efficiency", - "score": 1.3287169370386336 - }, - "High School US History - Observed inference time (s)": { - "description": "min=2.056, mean=2.056, max=2.056, sum=4.112 (2)", - "tab": "Efficiency", - "score": 2.0560385222528494 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.276, mean=1.276, max=1.276, sum=2.553 (2)", - "tab": "Efficiency", - "score": 1.2764891250224053 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=501.255, mean=501.255, max=501.255, sum=1002.51 (2)", - "tab": "General information", - "score": 501.2548387096774 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=515.473, mean=515.473, max=515.473, sum=1030.946 (2)", - "tab": "General information", - "score": 515.4729064039409 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=954.08, mean=954.08, max=954.08, sum=1908.16 (2)", - "tab": "General information", - "score": 954.08 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2823.23, mean=2823.23, max=2823.23, sum=5646.461 (2)", - "tab": "General information", - "score": 2823.230303030303 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=392.939, mean=392.939, max=392.939, sum=785.879 (2)", - "tab": "General information", - "score": 392.93939393939394 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=475.44, mean=475.44, max=475.44, sum=950.881 (2)", - "tab": "General information", - "score": 475.440414507772 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=395.962, mean=395.962, max=395.962, sum=791.923 (2)", - "tab": "General information", - "score": 395.96153846153845 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=580.393, mean=580.393, max=580.393, sum=1160.785 (2)", - "tab": "General information", - "score": 580.3925925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=414.361, mean=414.361, max=414.361, sum=828.723 (2)", - "tab": "General information", - "score": 414.3613445378151 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=592.252, mean=592.252, max=592.252, sum=1184.503 (2)", - "tab": "General information", - "score": 592.2516556291391 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=496.51, mean=496.51, max=496.51, sum=993.02 (2)", - "tab": "General information", - "score": 496.5100917431193 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=860.532, mean=860.532, max=860.532, sum=1721.065 (2)", - "tab": "General information", - "score": 860.5324074074074 - }, - "High School Statistics - # output tokens": { 
- "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2239.544, mean=2239.544, max=2239.544, sum=4479.088 (2)", - "tab": "General information", - "score": 2239.544117647059 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1437.051, mean=1437.051, max=1437.051, sum=2874.101 (2)", - "tab": "General information", - "score": 1437.0506329113923 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.847 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.768 (2)", - "tab": "Efficiency", - "score": 0.8839223662833996 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=1.095, mean=1.095, max=1.095, sum=2.191 (2)", - "tab": "Efficiency", - "score": 1.0953879956980699 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=323.906, mean=323.906, max=323.906, sum=647.812 (2)", - "tab": "General information", - "score": 323.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=338.74, mean=338.74, max=338.74, sum=677.481 (2)", - "tab": "General information", - "score": 338.74045801526717 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.909, - "details": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.818 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=1.104, mean=1.104, max=1.104, sum=2.208 (2)", - "tab": "Efficiency", - "score": 1.1039516984923812 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=651.686, mean=651.686, max=651.686, sum=1303.372 (2)", - "tab": "General information", - "score": 651.6859504132232 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.755 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=1.094, mean=1.094, max=1.094, sum=2.188 (2)", - "tab": "Efficiency", - "score": 1.0941538839983793 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 
(2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=443.969, mean=443.969, max=443.969, sum=887.939 (2)", - "tab": "General information", - "score": 443.96932515337426 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.625, mean=0.625, max=0.625, sum=1.25 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.11, mean=1.11, max=1.11, sum=2.22 (2)", - "tab": "Efficiency", - "score": 1.110024324485234 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=705.973, mean=705.973, max=705.973, sum=1411.946 (2)", - "tab": "General information", - "score": 705.9732142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=1.154, mean=1.154, max=1.154, sum=2.308 (2)", - "tab": "Efficiency", - "score": 1.153875772235463 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=284.68, mean=284.68, max=284.68, sum=569.359 (2)", - "tab": "General information", - "score": 284.6796116504854 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=1.031, mean=1.031, max=1.031, sum=2.063 (2)", - "tab": "Efficiency", - "score": 1.0312827428181965 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.726, mean=428.726, max=428.726, sum=857.453 (2)", - "tab": "General information", - "score": 428.7264957264957 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=1.068, mean=1.068, max=1.068, sum=2.136 (2)", - "tab": "Efficiency", - "score": 1.0681284523010255 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - 
"description": "min=334.69, mean=334.69, max=334.69, sum=669.38 (2)", - "tab": "General information", - "score": 334.69 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.788 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.788 (2)", - "tab": "Efficiency", - "score": 0.8939257733818824 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=325.215, mean=325.215, max=325.215, sum=650.429 (2)", - "tab": "General information", - "score": 325.2145593869732 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.124 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.988, mean=0.988, max=0.988, sum=1.976 (2)", - "tab": "Efficiency", - "score": 0.9880901995421834 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.968, mean=0.968, max=0.968, sum=1.935 (2)", - "tab": "Efficiency", - "score": 0.9677273009742439 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - 
"description": "min=494.63, mean=494.63, max=494.63, sum=989.26 (2)", - "tab": "General information", - "score": 494.6300578034682 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=687.566, mean=687.566, max=687.566, sum=1375.133 (2)", - "tab": "General information", - "score": 687.5664804469274 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.856, - "details": { - "description": "min=0.856, mean=0.856, max=0.856, sum=1.712 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.912, mean=0.912, max=0.912, sum=1.824 (2)", - "tab": "Efficiency", - "score": 0.9120152238147711 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=589.663, mean=589.663, max=589.663, sum=1179.327 (2)", - "tab": "General information", - "score": 589.6633986928105 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.916, mean=0.916, max=0.916, sum=1.831 (2)", 
- "tab": "Efficiency", - "score": 0.9155398577819636 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=538.179, mean=538.179, max=538.179, sum=1076.358 (2)", - "tab": "General information", - "score": 538.179012345679 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=1.545 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.579 (2)", - "tab": "Efficiency", - "score": 0.7896393559195779 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.982, mean=426.982, max=426.982, sum=853.964 (2)", - "tab": "General information", - "score": 426.9818181818182 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.657 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=1.254, mean=1.254, max=1.254, sum=2.508 (2)", - "tab": "Efficiency", - "score": 1.2542338507516044 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General 
information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1185.8, mean=1185.8, max=1185.8, sum=2371.6 (2)", - "tab": "General information", - "score": 1185.8 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.681 (2)", - "tab": "Efficiency", - "score": 0.8403987184685854 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=459.642, mean=459.642, max=459.642, sum=919.284 (2)", - "tab": "General information", - "score": 459.64179104477614 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - "details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=1.029, mean=1.029, max=1.029, sum=2.059 (2)", - "tab": "Efficiency", - "score": 1.0293473134557884 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=337.06, mean=337.06, max=337.06, sum=674.12 (2)", - "tab": "General information", - "score": 337.06024096385545 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.963, mean=0.963, max=0.963, sum=1.926 (2)", - "tab": "Efficiency", - "score": 0.9628847495854249 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=270.187, mean=270.187, max=270.187, sum=540.374 (2)", - "tab": "General information", - "score": 270.187134502924 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.142, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json b/data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json deleted file mode 100644 index 453cd8b3a..000000000 --- a/data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-2-13b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Llama 2 13B", - "id": "meta/llama-2-13b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.235, mean=0.554, max=0.83, sum=63.174 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.323, mean=0.492, max=1.697, sum=56.065 (114)", - "tab": "Efficiency", - "score": 0.49179914059061297 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=304.474, mean=706.682, max=3159.636, sum=80561.749 (114)", - "tab": "General information", - "score": 706.6820126388612 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - 
"mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.361, mean=0.361, max=0.361, sum=0.722 (2)", - "tab": "Efficiency", - "score": 0.3610322856903076 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.65, mean=397.65, max=397.65, sum=795.3 (2)", - "tab": "General information", - "score": 397.65 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.496, - "details": { - "description": "min=0.496, mean=0.496, max=0.496, sum=0.993 (2)", - "tab": "Accuracy", - "Anatomy - Observed 
inference time (s)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.715 (2)", - "tab": "Efficiency", - "score": 0.35744349868209274 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=409.133, mean=409.133, max=409.133, sum=818.267 (2)", - "tab": "General information", - "score": 409.1333333333333 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.235, - "details": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.471 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.449, mean=0.449, max=0.449, sum=0.897 (2)", - "tab": "Efficiency", - "score": 0.44854954242706296 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.802 (2)", - "tab": "Efficiency", - "score": 0.40112912986013627 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.534, mean=0.534, max=0.534, sum=1.069 (2)", - "tab": "Efficiency", - "score": 0.5343992376327514 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.454, mean=0.454, max=0.454, sum=0.909 (2)", - "tab": "Efficiency", - "score": 0.45426050424575803 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.905 (2)", - "tab": "Efficiency", - "score": 0.4522962446157643 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.826 (2)", - "tab": "Efficiency", - "score": 0.4130270574607101 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=622.43, mean=622.43, max=622.43, sum=1244.86 (2)", - "tab": "General information", - "score": 622.43 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - 
"tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=553.632, mean=553.632, max=553.632, sum=1107.264 (2)", - "tab": "General information", - "score": 553.6319444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=901.14, mean=901.14, max=901.14, sum=1802.28 (2)", - "tab": "General information", - "score": 901.14 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=646.96, mean=646.96, max=646.96, sum=1293.92 (2)", - "tab": "General information", - "score": 646.96 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=608.671, mean=608.671, max=608.671, sum=1217.341 (2)", - "tab": "General information", - "score": 608.6705202312139 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=551.873, mean=551.873, max=551.873, sum=1103.745 (2)", - "tab": "General information", - "score": 
551.8725490196078 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.355, mean=0.355, max=0.355, sum=0.71 (2)", - "tab": "Efficiency", - "score": 0.3552073335647583 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=428.17, mean=428.17, max=428.17, sum=856.34 (2)", - "tab": "General information", - "score": 428.17 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307, - "details": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.91 (2)", - "tab": "Efficiency", - "score": 0.45517582014987346 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=684.675, mean=684.675, max=684.675, sum=1369.351 (2)", - "tab": "General information", - "score": 684.6754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } 
- }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.781 (2)", - "tab": "Efficiency", - "score": 0.3903778100013733 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=484.54, mean=484.54, max=484.54, sum=969.08 (2)", - "tab": "General information", - "score": 484.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.704, mean=0.704, max=0.704, sum=1.407 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.718 (2)", - "tab": "Efficiency", - "score": 0.35898366460093745 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=449.898, mean=449.898, max=449.898, sum=899.796 (2)", - "tab": "General information", - "score": 449.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": 
"Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672, - "details": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.344 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.3226076184361694 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=372.122, mean=372.122, max=372.122, sum=744.244 (2)", - "tab": "General information", - "score": 372.12218649517683 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "description": "min=0.567, mean=0.567, max=0.567, sum=1.134 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.519 (2)", - "tab": "Efficiency", - "score": 0.7594411802642486 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.55, mean=0.55, max=0.55, sum=1.099 (2)", - "tab": "Efficiency", - "score": 0.5495186367778914 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.102, mean=1.102, max=1.102, sum=2.205 (2)", - "tab": "Efficiency", - "score": 1.1024409701957851 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.875 (2)", - "tab": "Efficiency", - "score": 0.43751365219066346 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1330.647, mean=1330.647, max=1330.647, sum=2661.294 (2)", - "tab": "General 
information", - "score": 1330.6470588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=823.277, mean=823.277, max=823.277, sum=1646.553 (2)", - "tab": "General information", - "score": 823.2765957446809 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1915.007, mean=1915.007, max=1915.007, sum=3830.014 (2)", - "tab": "General information", - "score": 1915.0071707953064 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=650.078, mean=650.078, max=650.078, sum=1300.157 (2)", - "tab": "General information", - "score": 650.0784313725491 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.782 (2)", - "tab": "Efficiency", - "score": 0.3909334921836853 - }, - "Us Foreign Policy - # eval": { 
- "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=479.81, mean=479.81, max=479.81, sum=959.62 (2)", - "tab": "General information", - "score": 479.81 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.546, - "details": { - "description": "min=0.546, mean=0.546, max=0.546, sum=1.092 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.945 (2)", - "tab": "Efficiency", - "score": 0.47229841351509094 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=681.079, mean=681.079, max=681.079, sum=1362.158 (2)", - "tab": "General information", - "score": 681.078947368421 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=1.1 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Efficiency", - "score": 0.4758677792549133 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=674.44, mean=674.44, max=674.44, sum=1348.88 (2)", - "tab": "General information", - "score": 674.44 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.592, - "details": { - "description": "min=0.592, mean=0.592, max=0.592, sum=1.185 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.38589143843021034 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.374, mean=487.374, max=487.374, sum=974.747 (2)", - "tab": "General information", - "score": 487.3735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.413, - "details": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.826 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.961 (2)", - "tab": "Efficiency", - "score": 0.4802838366082374 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=333.153, mean=333.153, max=333.153, sum=666.306 (2)", - "tab": "General information", - "score": 333.1531914893617 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.979 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.737 (2)", - "tab": "Efficiency", - "score": 0.36833986249463313 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=497.779, mean=497.779, max=497.779, sum=995.559 (2)", - "tab": "General information", - "score": 497.7793103448276 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307, - "details": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.995 (2)", - "tab": "Efficiency", - "score": 0.49746112028757733 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=609.156, mean=609.156, max=609.156, sum=1218.312 (2)", - "tab": "General information", - "score": 609.1560846560847 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.381, - "details": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.762 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.4436971952044775 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=691.81, mean=691.81, max=691.81, sum=1383.619 (2)", - "tab": "General information", - "score": 691.8095238095239 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.705, - "details": { - "description": "min=0.705, mean=0.705, max=0.705, sum=1.409 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.873 (2)", - "tab": "Efficiency", - "score": 0.43674747020967547 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.42318584883741556 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=1.133 (2)", - "tab": "Efficiency", - "score": 0.5666733002662658 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.697, mean=1.697, max=1.697, 
sum=3.394 (2)", - "tab": "Efficiency", - "score": 1.6971724553541703 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.532, mean=0.532, max=0.532, sum=1.065 (2)", - "tab": "Efficiency", - "score": 0.5323956747247716 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.735 (2)", - "tab": "Efficiency", - "score": 0.36752033727774347 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.756 (2)", - "tab": "Efficiency", - "score": 0.3781696270673703 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.803 (2)", - "tab": "Efficiency", - "score": 0.4017471119209572 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.3603636326910067 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.858 (2)", - "tab": "Efficiency", - "score": 0.4290682780032126 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.42302281703424016 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.535, mean=0.535, max=0.535, sum=1.069 (2)", - "tab": "Efficiency", - "score": 0.534513204186051 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.332, mean=1.332, max=1.332, sum=2.665 (2)", - "tab": "Efficiency", - "score": 1.33243932910994 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.866, mean=0.866, max=0.866, sum=1.733 (2)", - "tab": "Efficiency", - "score": 0.8663106930406788 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=596.894, mean=596.894, max=596.894, sum=1193.787 (2)", - "tab": "General information", - "score": 596.8935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=568.665, mean=568.665, max=568.665, sum=1137.33 (2)", - "tab": "General information", - "score": 568.6650246305419 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - 
"description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.57, mean=988.57, max=988.57, sum=1977.14 (2)", - "tab": "General information", - "score": 988.57 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3159.636, mean=3159.636, max=3159.636, sum=6319.273 (2)", - "tab": "General information", - "score": 3159.6363636363635 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=436.657, mean=436.657, max=436.657, sum=873.313 (2)", - "tab": "General information", - "score": 436.65656565656565 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=527.927, mean=527.927, max=527.927, sum=1055.855 (2)", - "tab": "General information", - "score": 527.9274611398964 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=445.662, mean=445.662, max=445.662, sum=891.323 (2)", - "tab": "General information", - "score": 445.66153846153844 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=579.181, mean=579.181, max=579.181, sum=1158.363 (2)", - "tab": "General information", - "score": 579.1814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=449.492, mean=449.492, max=449.492, sum=898.983 (2)", - "tab": "General information", - "score": 449.49159663865544 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=621.788, mean=621.788, max=621.788, sum=1243.576 (2)", - "tab": "General information", - "score": 621.7880794701987 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=585.919, mean=585.919, max=585.919, sum=1171.839 (2)", - "tab": "General information", - "score": 585.9192660550459 - }, - "High School Psychology - # output tokens": { - "description": "min=1, 
mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=908.208, mean=908.208, max=908.208, sum=1816.417 (2)", - "tab": "General information", - "score": 908.2083333333334 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2535.324, mean=2535.324, max=2535.324, sum=5070.647 (2)", - "tab": "General information", - "score": 2535.323529411765 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1638.219, mean=1638.219, max=1638.219, sum=3276.439 (2)", - "tab": "General information", - "score": 1638.2194092827003 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618, - "details": { - "description": "min=0.618, mean=0.618, max=0.618, sum=1.237 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.473, mean=0.473, max=0.473, sum=0.947 (2)", - "tab": "Efficiency", - "score": 0.47327254385157014 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.737 (2)", - 
"tab": "Efficiency", - "score": 0.3683396113737849 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=361.26, mean=361.26, max=361.26, sum=722.52 (2)", - "tab": "General information", - "score": 361.26008968609864 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=403.382, mean=403.382, max=403.382, sum=806.763 (2)", - "tab": "General information", - "score": 403.381679389313 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.504 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.488, mean=0.488, max=0.488, sum=0.975 (2)", - "tab": "Efficiency", - "score": 0.48763008551164105 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=729.463, mean=729.463, max=729.463, sum=1458.926 (2)", - "tab": "General information", - "score": 729.4628099173553 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687, - "details": { - "description": "min=0.687, mean=0.687, max=0.687, sum=1.374 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.361, mean=0.361, max=0.361, sum=0.722 (2)", - "tab": "Efficiency", - "score": 0.3607579462367333 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=502.755, mean=502.755, max=502.755, sum=1005.509 (2)", - "tab": "General information", - "score": 502.7546012269939 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.286, - "details": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.571 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.955 (2)", - "tab": "Efficiency", - "score": 0.4776035504681723 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=730.402, mean=730.402, max=730.402, sum=1460.804 (2)", - "tab": "General information", - "score": 730.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.738, - "details": { - "description": "min=0.738, mean=0.738, max=0.738, sum=1.476 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.34303417715054113 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.777, mean=315.777, max=315.777, sum=631.553 (2)", - "tab": "General information", - "score": 315.77669902912623 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.573 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.749 (2)", - "tab": "Efficiency", - "score": 0.37440858845017916 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=472.628, mean=472.628, max=472.628, sum=945.256 (2)", - "tab": "General information", - "score": 472.62820512820514 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.3651238298416138 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=408.14, mean=408.14, max=408.14, sum=816.28 (2)", - "tab": "General information", - "score": 408.14 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.748, mean=0.748, max=0.748, sum=1.497 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.34193715342768916 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=345.913, mean=345.913, max=345.913, sum=691.826 (2)", - "tab": "General information", - "score": 345.9131545338442 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407, - "details": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.813 
(2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.4205500893510146 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.949 (2)", - "tab": "Efficiency", - "score": 0.4744861464260677 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=542.506, mean=542.506, max=542.506, sum=1085.012 (2)", - "tab": "General information", - "score": 542.5057803468208 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=756.479, mean=756.479, max=756.479, sum=1512.959 (2)", - "tab": "General information", - "score": 756.4793296089385 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627, - "details": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.255 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.906 (2)", - "tab": "Efficiency", - "score": 0.4530853640799429 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=695.922, mean=695.922, max=695.922, sum=1391.843 (2)", - "tab": "General information", - "score": 695.9215686274509 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.654, - "details": { - "description": "min=0.654, mean=0.654, max=0.654, sum=1.309 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.889 (2)", - "tab": "Efficiency", - "score": 0.44473813345402846 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=619.185, mean=619.185, max=619.185, sum=1238.37 (2)", - "tab": "General information", - "score": 619.1851851851852 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.774 (2)", - "tab": "Efficiency", - "score": 0.38679331866177646 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=474.827, mean=474.827, max=474.827, sum=949.655 (2)", - "tab": "General information", - "score": 474.8272727272727 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - 
"evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.608, - "details": { - "description": "min=0.608, mean=0.608, max=0.608, sum=1.216 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.771, mean=0.771, max=0.771, sum=1.542 (2)", - "tab": "Efficiency", - "score": 0.7707553902450873 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1377.531, mean=1377.531, max=1377.531, sum=2755.061 (2)", - "tab": "General information", - "score": 1377.530612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761, - "details": { - "description": "min=0.761, mean=0.761, max=0.761, sum=1.522 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.77 (2)", - "tab": "Efficiency", - "score": 0.38491436853930727 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=508.478, mean=508.478, max=508.478, sum=1016.955 (2)", - "tab": "General information", - "score": 508.4776119402985 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.476, - "details": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.3499309801193605 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=405.108, mean=405.108, max=405.108, sum=810.217 (2)", - "tab": "General information", - "score": 405.10843373493975 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.675 (2)", - "tab": "Efficiency", - "score": 0.33768263197781745 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=304.474, mean=304.474, max=304.474, sum=608.947 (2)", - "tab": "General information", - "score": 304.4736842105263 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on 
average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json b/data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json deleted file mode 100644 index aa6a9caa2..000000000 --- a/data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-2-70b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 70B", - "id": "meta/llama-2-70b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.695, - "details": { - "description": "min=0.31, mean=0.695, max=0.933, sum=79.283 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.314, mean=0.466, max=0.981, sum=53.164 (114)", - "tab": "Efficiency", - "score": 0.46634649940337786 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=304.474, mean=706.682, max=3159.636, sum=80561.749 (114)", - "tab": "General information", - "score": 706.6820126388612 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - 
"high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31, - "details": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.62 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Efficiency", - "score": 0.3601346731185913 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.65, mean=397.65, max=397.65, sum=795.3 (2)", - "tab": "General 
information", - "score": 397.65 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.607, - "details": { - "description": "min=0.607, mean=0.607, max=0.607, sum=1.215 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.866 (2)", - "tab": "Efficiency", - "score": 0.4331345310917607 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=409.133, mean=409.133, max=409.133, sum=818.267 (2)", - "tab": "General information", - "score": 409.1333333333333 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363, - "details": { - "description": "min=0.363, mean=0.363, max=0.363, sum=0.725 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.757 (2)", - "tab": "Efficiency", - "score": 0.3786743521690369 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.773 (2)", - "tab": "Efficiency", - "score": 0.38658806019359165 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.888 (2)", - "tab": "Efficiency", - "score": 0.44394851446151734 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Efficiency", - "score": 0.7099040699005127 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.939 (2)", - "tab": "Efficiency", - "score": 0.4695483673514658 - }, - "College Physics - Observed inference time (s)": { - 
"description": "min=0.389, mean=0.389, max=0.389, sum=0.778 (2)", - "tab": "Efficiency", - "score": 0.3889027389825559 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=622.43, mean=622.43, max=622.43, sum=1244.86 (2)", - "tab": "General information", - "score": 622.43 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=553.632, mean=553.632, max=553.632, sum=1107.264 (2)", - "tab": "General information", - "score": 553.6319444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=901.14, mean=901.14, max=901.14, sum=1802.28 (2)", - "tab": "General information", - "score": 901.14 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=646.96, mean=646.96, max=646.96, sum=1293.92 (2)", - "tab": "General information", - "score": 646.96 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=608.671, mean=608.671, max=608.671, sum=1217.341 (2)", - "tab": "General information", - "score": 608.6705202312139 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=551.873, mean=551.873, max=551.873, sum=1103.745 (2)", - "tab": "General information", - "score": 551.8725490196078 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.743 (2)", - "tab": "Efficiency", - "score": 0.3714062762260437 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=428.17, mean=428.17, max=428.17, sum=856.34 (2)", - "tab": "General information", - "score": 428.17 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43, - "details": { - "description": "min=0.43, mean=0.43, max=0.43, 
sum=0.86 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.783 (2)", - "tab": "Efficiency", - "score": 0.3916624889039157 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=684.675, mean=684.675, max=684.675, sum=1369.351 (2)", - "tab": "General information", - "score": 684.6754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47, - "details": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.747 (2)", - "tab": "Efficiency", - "score": 0.3736806106567383 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=484.54, mean=484.54, max=484.54, sum=969.08 (2)", - "tab": "General information", - "score": 484.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.387 (2)", - "tab": "Efficiency", - "score": 0.6937185768727903 - }, - 
"Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=449.898, mean=449.898, max=449.898, sum=899.796 (2)", - "tab": "General information", - "score": 449.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.628 (2)", - "tab": "Efficiency", - "score": 0.3140420009085603 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=372.122, mean=372.122, max=372.122, sum=744.244 (2)", - "tab": "General information", - "score": 372.12218649517683 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.515, mean=0.515, max=0.515, sum=1.029 (2)", - "tab": "Efficiency", - "score": 0.5146331287482205 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.774 (2)", - "tab": "Efficiency", - "score": 0.3871775383644916 - }, - 
"Professional Law - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.395 (2)", - "tab": "Efficiency", - "score": 0.6972876995452224 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", - "tab": "Efficiency", - "score": 0.39348618851767647 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1330.647, mean=1330.647, max=1330.647, sum=2661.294 (2)", - "tab": "General information", - "score": 1330.6470588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=823.277, mean=823.277, max=823.277, sum=1646.553 (2)", - "tab": "General information", - "score": 823.2765957446809 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1915.007, mean=1915.007, max=1915.007, sum=3830.014 (2)", - "tab": "General information", - "score": 1915.0071707953064 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=650.078, mean=650.078, max=650.078, sum=1300.157 (2)", - "tab": "General information", - "score": 650.0784313725491 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.696 (2)", - "tab": "Efficiency", - "score": 0.3482255029678345 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=479.81, mean=479.81, max=479.81, sum=959.62 (2)", - "tab": "General information", - "score": 479.81 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.658 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.912 (2)", - "tab": "Efficiency", - "score": 0.45624671798003347 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=681.079, mean=681.079, max=681.079, sum=1362.158 (2)", - "tab": "General information", - "score": 681.078947368421 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": 
"Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.298 (2)", - "tab": "Efficiency", - "score": 0.6490170955657959 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=674.44, mean=674.44, max=674.44, sum=1348.88 (2)", - "tab": "General information", - "score": 674.44 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.434 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.788 (2)", - "tab": "Efficiency", - "score": 0.394086869257801 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.374, mean=487.374, max=487.374, sum=974.747 (2)", - "tab": "General information", - "score": 487.3735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.668, - "details": { - "description": "min=0.668, mean=0.668, max=0.668, sum=1.336 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.038 (2)", - "tab": "Efficiency", - "score": 0.5188552247717025 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=333.153, mean=333.153, max=333.153, sum=666.306 (2)", - "tab": "General information", - "score": 333.1531914893617 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.634, mean=0.634, max=0.634, sum=1.269 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.414785334159588 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=497.779, mean=497.779, max=497.779, sum=995.559 (2)", - "tab": "General information", - "score": 497.7793103448276 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421, - "details": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.841 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.814 (2)", - "tab": "Efficiency", - "score": 0.4069670924433955 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=609.156, mean=609.156, max=609.156, sum=1218.312 (2)", - "tab": "General information", - "score": 609.1560846560847 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.468, - "details": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.937 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.41500668109409394 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=691.81, mean=691.81, max=691.81, sum=1383.619 (2)", - "tab": "General information", - "score": 691.8095238095239 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.764 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.759 (2)", - "tab": "Efficiency", - "score": 0.3793416823110273 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Efficiency", - "score": 0.4020436197666112 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.124 (2)", - "tab": "Efficiency", - "score": 0.5618092942237854 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.981, mean=0.981, max=0.981, sum=1.962 (2)", - "tab": "Efficiency", - "score": 0.9809041355595444 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.41476938218781445 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.775 (2)", - "tab": "Efficiency", - "score": 0.3875881736142648 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Efficiency", - "score": 0.3797990028674786 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.817 (2)", - "tab": "Efficiency", - "score": 0.40841888145164207 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.882 (2)", - "tab": "Efficiency", - "score": 0.4407546289828645 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.851 (2)", - "tab": "Efficiency", - "score": 0.42553993724039846 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.469, mean=0.469, max=0.469, sum=0.939 (2)", - "tab": "Efficiency", - "score": 0.46939194880494284 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.499, mean=0.499, max=0.499, sum=0.998 (2)", - "tab": "Efficiency", - "score": 0.4990172529662097 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.693 (2)", - "tab": "Efficiency", - "score": 0.8465246745184356 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.558, mean=0.558, max=0.558, sum=1.117 (2)", - "tab": "Efficiency", - "score": 0.5583362217190899 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=596.894, mean=596.894, max=596.894, sum=1193.787 (2)", - "tab": "General information", - "score": 596.8935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=568.665, mean=568.665, max=568.665, sum=1137.33 (2)", - "tab": "General information", - "score": 568.6650246305419 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.57, mean=988.57, max=988.57, sum=1977.14 (2)", - "tab": "General information", - "score": 988.57 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3159.636, mean=3159.636, max=3159.636, sum=6319.273 (2)", - "tab": "General information", - "score": 3159.6363636363635 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=436.657, mean=436.657, max=436.657, sum=873.313 (2)", - "tab": "General information", - "score": 436.65656565656565 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=527.927, mean=527.927, max=527.927, sum=1055.855 (2)", - "tab": "General information", - "score": 527.9274611398964 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=445.662, mean=445.662, max=445.662, sum=891.323 (2)", - "tab": "General information", - "score": 445.66153846153844 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=579.181, mean=579.181, max=579.181, sum=1158.363 (2)", - "tab": "General information", - "score": 579.1814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=449.492, mean=449.492, max=449.492, sum=898.983 (2)", - "tab": "General information", - "score": 449.49159663865544 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=621.788, mean=621.788, max=621.788, sum=1243.576 (2)", - "tab": "General information", - "score": 621.7880794701987 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=585.919, mean=585.919, max=585.919, sum=1171.839 (2)", - "tab": "General information", - "score": 585.9192660550459 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=908.208, mean=908.208, max=908.208, sum=1816.417 (2)", - "tab": "General information", - "score": 908.2083333333334 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2535.324, mean=2535.324, max=2535.324, sum=5070.647 (2)", - "tab": "General information", - "score": 2535.323529411765 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1638.219, mean=1638.219, max=1638.219, sum=3276.439 (2)", - "tab": "General information", - "score": 1638.2194092827003 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.973 (2)", - "tab": "Efficiency", - "score": 0.4866963897585334 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.781 (2)", - "tab": "Efficiency", - "score": 0.3902700020156744 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=361.26, mean=361.26, max=361.26, sum=722.52 (2)", - "tab": "General information", - "score": 361.26008968609864 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=403.382, mean=403.382, max=403.382, sum=806.763 (2)", - "tab": "General information", - "score": 403.381679389313 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.538, mean=0.538, max=0.538, 
sum=1.076 (2)", - "tab": "Efficiency", - "score": 0.5381311483619627 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=729.463, mean=729.463, max=729.463, sum=1458.926 (2)", - "tab": "General information", - "score": 729.4628099173553 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.583 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.903 (2)", - "tab": "Efficiency", - "score": 0.4513764015736024 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=502.755, mean=502.755, max=502.755, sum=1005.509 (2)", - "tab": "General information", - "score": 502.7546012269939 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.491, - "details": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.982 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.447, mean=0.447, max=0.447, sum=0.895 (2)", - "tab": "Efficiency", - "score": 0.4473994416849954 - }, - "Machine Learning - # eval": { 
- "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=730.402, mean=730.402, max=730.402, sum=1460.804 (2)", - "tab": "General information", - "score": 730.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.689 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.965 (2)", - "tab": "Efficiency", - "score": 0.482250699719179 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.777, mean=315.777, max=315.777, sum=631.553 (2)", - "tab": "General information", - "score": 315.77669902912623 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.529, mean=0.529, max=0.529, sum=1.059 (2)", - "tab": "Efficiency", - "score": 0.5294328500062991 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=472.628, mean=472.628, max=472.628, sum=945.256 (2)", - "tab": "General information", - "score": 472.62820512820514 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.44 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.852 (2)", - "tab": "Efficiency", - "score": 0.42598395347595214 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=408.14, mean=408.14, max=408.14, sum=816.28 (2)", - "tab": "General information", - "score": 408.14 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.714 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Efficiency", - "score": 0.43395179502504233 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # 
prompt tokens": { - "description": "min=345.913, mean=345.913, max=345.913, sum=691.826 (2)", - "tab": "General information", - "score": 345.9131545338442 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45, - "details": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.901 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.404, mean=0.404, max=0.404, sum=0.809 (2)", - "tab": "Efficiency", - "score": 0.4043546129513338 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.514, mean=0.514, max=0.514, sum=1.028 (2)", - "tab": "Efficiency", - "score": 0.5137747306397508 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=542.506, mean=542.506, max=542.506, sum=1085.012 (2)", - "tab": "General information", - "score": 542.5057803468208 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=756.479, mean=756.479, max=756.479, sum=1512.959 (2)", - "tab": "General information", - "score": 756.4793296089385 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.758, - "details": { - "description": "min=0.758, mean=0.758, max=0.758, sum=1.516 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.447, mean=0.447, max=0.447, sum=0.895 (2)", - "tab": "Efficiency", - "score": 0.44729572885176716 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=695.922, mean=695.922, max=695.922, sum=1391.843 (2)", - "tab": "General information", - "score": 695.9215686274509 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.945 (2)", - "tab": "Efficiency", - "score": 0.4722691575686137 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=619.185, mean=619.185, max=619.185, sum=1238.37 (2)", - "tab": "General information", - "score": 619.1851851851852 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, 
max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.34489609761671586 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=474.827, mean=474.827, max=474.827, sum=949.655 (2)", - "tab": "General information", - "score": 474.8272727272727 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.968 (2)", - "tab": "Efficiency", - "score": 0.48404579649166185 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1377.531, mean=1377.531, max=1377.531, sum=2755.061 (2)", - "tab": "General information", - "score": 1377.530612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.769 (2)", - "tab": "Efficiency", - "score": 0.38445919781775023 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, 
max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=508.478, mean=508.478, max=508.478, sum=1016.955 (2)", - "tab": "General information", - "score": 508.4776119402985 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.928 (2)", - "tab": "Efficiency", - "score": 0.464106645928808 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=405.108, mean=405.108, max=405.108, sum=810.217 (2)", - "tab": "General information", - "score": 405.10843373493975 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.708 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.831 (2)", - "tab": "Efficiency", - "score": 0.41569664603785467 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=304.474, mean=304.474, max=304.474, sum=608.947 (2)", - "tab": "General information", - "score": 304.4736842105263 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json b/data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json deleted file mode 100644 index 0649e7329..000000000 --- a/data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-2-7b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 7B", - "id": "meta/llama-2-7b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.458, - "details": { - "description": "min=0.196, mean=0.458, max=0.713, sum=52.224 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.28, mean=0.374, max=0.947, sum=42.6 (114)", - "tab": "Efficiency", - "score": 0.37368440752207543 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=304.474, mean=706.682, max=3159.636, sum=80561.749 (114)", - "tab": "General information", - "score": 706.6820126388612 - }, - "MMLU All Subjects - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.664 (2)", - "tab": "Efficiency", - "score": 0.3319991087913513 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.65, mean=397.65, max=397.65, sum=795.3 (2)", - "tab": "General information", - "score": 397.65 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.904 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.3097020767353199 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=409.133, mean=409.133, max=409.133, sum=818.267 (2)", - "tab": "General information", - "score": 409.1333333333333 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.196, - "details": { - "description": "min=0.196, mean=0.196, max=0.196, sum=0.392 (2)", - "tab": "Accuracy", - "College Chemistry - 
Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.35009843587875367 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.656 (2)", - "tab": "Efficiency", - "score": 0.3278946164581511 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.763 (2)", - "tab": "Efficiency", - "score": 0.38129755973815915 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.682 (2)", - "tab": "Efficiency", - "score": 0.3409119129180908 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.662 (2)", - "tab": "Efficiency", - "score": 0.3307889693045203 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Efficiency", - "score": 0.3398791224348779 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=622.43, mean=622.43, max=622.43, sum=1244.86 (2)", - "tab": "General information", - "score": 622.43 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=553.632, mean=553.632, max=553.632, sum=1107.264 (2)", - "tab": "General information", - "score": 553.6319444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=901.14, mean=901.14, max=901.14, sum=1802.28 (2)", - "tab": "General information", - "score": 901.14 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, 
mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=646.96, mean=646.96, max=646.96, sum=1293.92 (2)", - "tab": "General information", - "score": 646.96 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=608.671, mean=608.671, max=608.671, sum=1217.341 (2)", - "tab": "General information", - "score": 608.6705202312139 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=551.873, mean=551.873, max=551.873, sum=1103.745 (2)", - "tab": "General information", - "score": 551.8725490196078 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.18 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.659 (2)", - "tab": "Efficiency", - "score": 0.3293105459213257 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=428.17, mean=428.17, max=428.17, sum=856.34 (2)", - 
"tab": "General information", - "score": 428.17 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316, - "details": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.632 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.749 (2)", - "tab": "Efficiency", - "score": 0.3746668204926608 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=684.675, mean=684.675, max=684.675, sum=1369.351 (2)", - "tab": "General information", - "score": 684.6754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.659 (2)", - "tab": "Efficiency", - "score": 0.32934638738632205 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=484.54, mean=484.54, max=484.54, sum=969.08 (2)", - "tab": "General information", - "score": 484.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.037 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.2942208139984696 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=449.898, mean=449.898, max=449.898, sum=899.796 (2)", - "tab": "General information", - "score": 449.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.592, - "details": { - "description": "min=0.592, mean=0.592, max=0.592, sum=1.183 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": "Efficiency", - "score": 0.2999055814896366 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=372.122, mean=372.122, max=372.122, sum=744.244 (2)", - "tab": "General information", - "score": 372.12218649517683 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional 
Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459, - "details": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.918 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.926 (2)", - "tab": "Efficiency", - "score": 0.463154871674145 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.377, mean=0.377, max=0.377, sum=0.755 (2)", - "tab": "Efficiency", - "score": 0.37741253392916196 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.618, mean=0.618, max=0.618, sum=1.235 (2)", - "tab": "Efficiency", - "score": 0.6177054020385543 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.879 (2)", - "tab": "Efficiency", - "score": 0.4397414544828577 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1330.647, mean=1330.647, max=1330.647, sum=2661.294 (2)", - "tab": "General information", - "score": 1330.6470588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=823.277, mean=823.277, max=823.277, sum=1646.553 (2)", - "tab": "General information", - "score": 823.2765957446809 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1915.007, mean=1915.007, max=1915.007, sum=3830.014 (2)", - "tab": "General information", - "score": 1915.0071707953064 - }, - "Professional Law - # output tokens": { - "description": "min=1, 
mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=650.078, mean=650.078, max=650.078, sum=1300.157 (2)", - "tab": "General information", - "score": 650.0784313725491 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.28 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.3431359338760376 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=479.81, mean=479.81, max=479.81, sum=959.62 (2)", - "tab": "General information", - "score": 479.81 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408, - "details": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.34498921193574605 - }, - "Astronomy - # eval": { - 
"description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=681.079, mean=681.079, max=681.079, sum=1362.158 (2)", - "tab": "General information", - "score": 681.078947368421 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.96 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.668 (2)", - "tab": "Efficiency", - "score": 0.3342457461357117 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=674.44, mean=674.44, max=674.44, sum=1348.88 (2)", - "tab": "General information", - "score": 674.44 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.453, - "details": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.906 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.3225168426081819 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, 
sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.374, mean=487.374, max=487.374, sum=974.747 (2)", - "tab": "General information", - "score": 487.3735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.434, - "details": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.646 (2)", - "tab": "Efficiency", - "score": 0.32303770450835534 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=333.153, mean=333.153, max=333.153, sum=666.306 (2)", - "tab": "General information", - "score": 333.1531914893617 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407, - "details": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.814 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.649 (2)", - "tab": "Efficiency", - "score": 0.32454562516048036 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, 
- "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=497.779, mean=497.779, max=497.779, sum=995.559 (2)", - "tab": "General information", - "score": 497.7793103448276 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.254, - "details": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.508 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.669 (2)", - "tab": "Efficiency", - "score": 0.33426338718051 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=609.156, mean=609.156, max=609.156, sum=1218.312 (2)", - "tab": "General information", - "score": 609.1560846560847 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.766 (2)", - "tab": "Efficiency", - "score": 0.3832281846848745 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=691.81, mean=691.81, max=691.81, sum=1383.619 (2)", - "tab": "General information", - "score": 691.8095238095239 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - "description": "min=0.662, mean=0.662, max=0.662, sum=1.325 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.653 (2)", - "tab": "Efficiency", - "score": 0.32630388890543294 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.611 (2)", - "tab": "Efficiency", - "score": 0.30552317473688734 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.812 (2)", - "tab": "Efficiency", - "score": 0.4060112690925598 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.947, mean=0.947, max=0.947, sum=1.894 (2)", - "tab": "Efficiency", - "score": 0.9469690496271307 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.32730214523546625 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.3369472236830954 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.662 (2)", - "tab": "Efficiency", - "score": 0.3308515047415709 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.3355037459620723 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.918 (2)", - "tab": "Efficiency", - "score": 0.45884753475670054 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.3355141222871692 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.778 (2)", - "tab": "Efficiency", - "score": 0.3889624678760494 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.786 (2)", - "tab": "Efficiency", - "score": 0.39307444846188583 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.778, mean=0.778, 
max=0.778, sum=1.556 (2)", - "tab": "Efficiency", - "score": 0.7781471855500165 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.542, mean=0.542, max=0.542, sum=1.085 (2)", - "tab": "Efficiency", - "score": 0.5424087500270409 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=596.894, mean=596.894, max=596.894, sum=1193.787 (2)", - "tab": "General information", - "score": 596.8935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=568.665, mean=568.665, max=568.665, sum=1137.33 (2)", - "tab": "General information", - "score": 568.6650246305419 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.57, mean=988.57, max=988.57, sum=1977.14 (2)", - "tab": "General information", - "score": 988.57 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3159.636, mean=3159.636, max=3159.636, sum=6319.273 (2)", - "tab": "General information", - "score": 3159.6363636363635 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - 
"tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=436.657, mean=436.657, max=436.657, sum=873.313 (2)", - "tab": "General information", - "score": 436.65656565656565 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=527.927, mean=527.927, max=527.927, sum=1055.855 (2)", - "tab": "General information", - "score": 527.9274611398964 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=445.662, mean=445.662, max=445.662, sum=891.323 (2)", - "tab": "General information", - "score": 445.66153846153844 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=579.181, mean=579.181, max=579.181, sum=1158.363 (2)", - "tab": "General information", - "score": 579.1814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=449.492, mean=449.492, max=449.492, sum=898.983 (2)", - "tab": "General information", - "score": 449.49159663865544 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=621.788, mean=621.788, max=621.788, sum=1243.576 (2)", - "tab": "General information", - "score": 621.7880794701987 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=585.919, mean=585.919, max=585.919, sum=1171.839 (2)", - "tab": "General information", - "score": 585.9192660550459 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=908.208, mean=908.208, max=908.208, sum=1816.417 (2)", - "tab": "General information", - "score": 908.2083333333334 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2535.324, mean=2535.324, max=2535.324, sum=5070.647 (2)", - "tab": "General information", - "score": 2535.323529411765 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # 
eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1638.219, mean=1638.219, max=1638.219, sum=3276.439 (2)", - "tab": "General information", - "score": 1638.2194092827003 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.557, - "details": { - "description": "min=0.557, mean=0.557, max=0.557, sum=1.115 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.56 (2)", - "tab": "Efficiency", - "score": 0.28007102974861725 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.3354811176998925 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=361.26, mean=361.26, max=361.26, sum=722.52 (2)", - "tab": "General information", - "score": 361.26008968609864 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=403.382, mean=403.382, max=403.382, sum=806.763 (2)", - "tab": "General information", - "score": 403.381679389313 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.628, - "details": { - "description": "min=0.628, mean=0.628, max=0.628, sum=1.256 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.3510365151176768 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=729.463, mean=729.463, max=729.463, sum=1458.926 (2)", - "tab": "General information", - "score": 729.4628099173553 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.466, - "details": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.933 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.3273066304212699 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=502.755, mean=502.755, max=502.755, sum=1005.509 (2)", - "tab": "General information", - "score": 502.7546012269939 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402, - "details": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.732 (2)", - "tab": "Efficiency", - "score": 0.36619071449552265 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=730.402, mean=730.402, max=730.402, sum=1460.804 (2)", - "tab": "General information", - "score": 730.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.563, - "details": { - "description": "min=0.563, mean=0.563, max=0.563, sum=1.126 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.669 (2)", - "tab": "Efficiency", - "score": 0.33452116632924495 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.777, mean=315.777, max=315.777, sum=631.553 (2)", - "tab": "General information", - "score": 315.77669902912623 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.697, - "details": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.393 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.662 (2)", - "tab": "Efficiency", - "score": 0.3312412653213892 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=472.628, mean=472.628, max=472.628, sum=945.256 (2)", - "tab": "General information", - "score": 472.62820512820514 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.3395656991004944 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=408.14, mean=408.14, max=408.14, sum=816.28 (2)", - "tab": "General information", - "score": 408.14 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.632, - "details": { - "description": "min=0.632, mean=0.632, max=0.632, sum=1.264 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.706 (2)", - "tab": "Efficiency", - "score": 0.3531375576862126 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=345.913, mean=345.913, max=345.913, sum=691.826 (2)", - "tab": "General information", - "score": 345.9131545338442 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.238, - "details": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.476 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.653 (2)", - "tab": "Efficiency", - "score": 0.3263767213490657 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.738 (2)", - "tab": "Efficiency", - "score": 0.3688804725028949 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=542.506, mean=542.506, max=542.506, sum=1085.012 (2)", - "tab": "General information", - "score": 542.5057803468208 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=756.479, mean=756.479, 
max=756.479, sum=1512.959 (2)", - "tab": "General information", - "score": 756.4793296089385 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497, - "details": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.993 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.34185195904152066 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=695.922, mean=695.922, max=695.922, sum=1391.843 (2)", - "tab": "General information", - "score": 695.9215686274509 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503, - "details": { - "description": "min=0.503, mean=0.503, max=0.503, sum=1.006 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.33259875023806534 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=619.185, mean=619.185, max=619.185, sum=1238.37 (2)", - "tab": "General information", - "score": 619.1851851851852 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - 
} - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.018 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3326493003151634 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=474.827, mean=474.827, max=474.827, sum=949.655 (2)", - "tab": "General information", - "score": 474.8272727272727 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.433, - "details": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.865 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.955 (2)", - "tab": "Efficiency", - "score": 0.4774373015578912 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1377.531, mean=1377.531, max=1377.531, sum=2755.061 (2)", - "tab": "General information", - "score": 1377.530612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.617, - "details": { - "description": "min=0.617, mean=0.617, max=0.617, sum=1.234 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.623 (2)", - "tab": "Efficiency", - "score": 0.31150120170555307 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=508.478, mean=508.478, max=508.478, sum=1016.955 (2)", - "tab": "General information", - "score": 508.4776119402985 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392, - "details": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.783 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.32997589513479947 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=405.108, mean=405.108, max=405.108, sum=810.217 (2)", - "tab": "General information", - "score": 405.10843373493975 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.427 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": "Efficiency", - "score": 0.2998225702876933 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=304.474, mean=304.474, max=304.474, sum=608.947 (2)", - "tab": "General information", - "score": 304.4736842105263 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.681, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json b/data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json deleted file mode 100644 index 4f09a5ee3..000000000 --- a/data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3-70b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3 70B", - "id": "meta/llama-3-70b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.43, mean=0.793, 
max=0.979, sum=90.444 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.333, mean=0.462, max=1.184, sum=52.708 (114)", - "tab": "Efficiency", - "score": 0.46235507518987096 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=267.52, mean=607.619, max=2790.885, sum=69268.61 (114)", - "tab": "General information", - "score": 607.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - 
"mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43, - "details": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.86 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.774 (2)", - "tab": "Efficiency", - "score": 0.3868687057495117 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=366.43, mean=366.43, max=366.43, sum=732.86 (2)", - "tab": "General information", - "score": 366.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.57 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.782 (2)", - "tab": "Efficiency", - "score": 0.39101445586593064 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.874, mean=346.874, max=346.874, sum=693.748 (2)", - "tab": "General 
information", - "score": 346.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529, - "details": { - "description": "min=0.529, mean=0.529, max=0.529, sum=1.059 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.864 (2)", - "tab": "Efficiency", - "score": 0.4319474816322327 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.788 (2)", - "tab": "Efficiency", - "score": 0.39422312213314903 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.959 (2)", - "tab": "Efficiency", - "score": 0.4797321176528931 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.806 (2)", - "tab": "Efficiency", - "score": 0.4030305552482605 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.849 (2)", - "tab": "Efficiency", - "score": 0.4245531242017801 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Efficiency", - "score": 0.41995686643263874 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=542.28, mean=542.28, max=542.28, sum=1084.56 (2)", - "tab": "General information", - "score": 542.28 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=466.875, mean=466.875, max=466.875, sum=933.75 (2)", - "tab": "General information", - "score": 466.875 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": 
{ - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=821.29, mean=821.29, max=821.29, sum=1642.58 (2)", - "tab": "General information", - "score": 821.29 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=587.51, mean=587.51, max=587.51, sum=1175.02 (2)", - "tab": "General information", - "score": 587.51 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=495.705, mean=495.705, max=495.705, sum=991.41 (2)", - "tab": "General information", - "score": 495.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=496.569, mean=496.569, max=496.569, sum=993.137 (2)", - "tab": "General information", - "score": 496.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.783 (2)", - "tab": "Efficiency", - "score": 0.3916677093505859 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=371.51, mean=371.51, max=371.51, sum=743.02 (2)", - "tab": "General information", - "score": 371.51 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.386 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.4078888934955262 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=607.421, mean=607.421, max=607.421, sum=1214.842 (2)", - "tab": "General information", - "score": 607.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0.49, 
mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.77 (2)", - "tab": "Efficiency", - "score": 0.3847800350189209 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=392.71, mean=392.71, max=392.71, sum=785.42 (2)", - "tab": "General information", - "score": 392.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.736 (2)", - "tab": "Efficiency", - "score": 0.36775174847355596 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.63, mean=387.63, max=387.63, sum=775.259 (2)", - "tab": "General information", - "score": 387.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.713 (2)", - "tab": "Efficiency", - "score": 
0.35669880894602685 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.084, mean=322.084, max=322.084, sum=644.167 (2)", - "tab": "General information", - "score": 322.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.523, mean=0.523, max=0.523, sum=1.046 (2)", - "tab": "Efficiency", - "score": 0.5229001255596385 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.4082087980094531 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.738, mean=0.738, max=0.738, sum=1.477 (2)", - "tab": "Efficiency", - "score": 0.7383932933658167 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.752 (2)", - "tab": "Efficiency", - "score": 0.3758435642797183 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1087.489, mean=1087.489, max=1087.489, sum=2174.978 (2)", - "tab": "General information", - "score": 1087.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - 
"description": "min=651.585, mean=651.585, max=651.585, sum=1303.17 (2)", - "tab": "General information", - "score": 651.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1630.601, mean=1630.601, max=1630.601, sum=3261.202 (2)", - "tab": "General information", - "score": 1630.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=568.098, mean=568.098, max=568.098, sum=1136.196 (2)", - "tab": "General information", - "score": 568.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.805 (2)", - "tab": "Efficiency", - "score": 0.4027411961555481 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=415.79, mean=415.79, max=415.79, sum=831.58 (2)", - "tab": "General information", - "score": 415.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.921, - "details": { - "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.814 (2)", - "tab": "Efficiency", - "score": 0.4070533733618887 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=572.684, mean=572.684, max=572.684, sum=1145.368 (2)", - "tab": "General information", - "score": 572.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.786 (2)", - "tab": "Efficiency", - "score": 0.3931219887733459 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)", - "tab": "General information", - "score": 562.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - 
{ - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.831 (2)", - "tab": "Efficiency", - "score": 0.41558496907072245 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=390.928, mean=390.928, max=390.928, sum=781.857 (2)", - "tab": "General information", - "score": 390.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=1.677 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.034 (2)", - "tab": "Efficiency", - "score": 0.5170877294337496 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=297.834, mean=297.834, max=297.834, sum=595.668 (2)", - "tab": "General information", - "score": 297.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.766, - "details": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.531 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.796 (2)", - "tab": "Efficiency", - "score": 0.39815263419315733 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=428.607, mean=428.607, max=428.607, sum=857.214 (2)", - "tab": "General information", - "score": 428.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.632, - "details": { - "description": "min=0.632, mean=0.632, max=0.632, sum=1.265 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.957 (2)", - "tab": "Efficiency", - "score": 0.47845223719480806 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=524.854, mean=524.854, max=524.854, sum=1049.709 (2)", - "tab": "General information", - "score": 524.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.651, - "details": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.302 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.4359313628030202 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=594.778, mean=594.778, max=594.778, sum=1189.556 (2)", - "tab": "General information", - "score": 594.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.512, mean=0.512, max=0.512, sum=1.023 (2)", - "tab": "Efficiency", - "score": 0.5115567738010037 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.089 (2)", - "tab": "Efficiency", - "score": 0.5445456727972171 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.501, mean=0.501, max=0.501, sum=1.002 (2)", - "tab": "Efficiency", - "score": 0.5008813333511353 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.184, mean=1.184, max=1.184, sum=2.367 (2)", - "tab": "Efficiency", - "score": 1.1835060582016455 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.744 (2)", - "tab": "Efficiency", - "score": 0.3721387037123092 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=1.134 (2)", - "tab": "Efficiency", - "score": 0.5668655022438326 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.582, mean=0.582, max=0.582, sum=1.164 (2)", - "tab": "Efficiency", - "score": 0.5819246842310979 - }, - "High School 
Mathematics - Observed inference time (s)": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.821 (2)", - "tab": "Efficiency", - "score": 0.410357196242721 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.759 (2)", - "tab": "Efficiency", - "score": 0.3792707469283032 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.786 (2)", - "tab": "Efficiency", - "score": 0.39323860288455786 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.789 (2)", - "tab": "Efficiency", - "score": 0.3946729870017515 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.516, mean=0.516, max=0.516, sum=1.032 (2)", - "tab": "Efficiency", - "score": 0.5162484921790935 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.956, mean=0.956, max=0.956, sum=1.911 (2)", - "tab": "Efficiency", - "score": 0.9556132928997862 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.583, mean=0.583, max=0.583, sum=1.165 (2)", - "tab": "Efficiency", - "score": 0.5826822735589264 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.671, mean=506.671, max=506.671, sum=1013.342 (2)", - "tab": "General information", - "score": 506.6709677419355 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=489.704, mean=489.704, max=489.704, sum=979.409 (2)", - "tab": "General information", - "score": 489.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)", - "tab": "General information", - "score": 860.78 - }, - "High School Computer Science - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2790.885, mean=2790.885, max=2790.885, sum=5581.77 (2)", - "tab": "General information", - "score": 2790.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.035, mean=365.035, max=365.035, sum=730.071 (2)", - "tab": "General information", - "score": 365.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=458.824, mean=458.824, max=458.824, sum=917.648 (2)", - "tab": "General information", - "score": 458.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=363.908, mean=363.908, max=363.908, sum=727.815 (2)", - "tab": "General information", - "score": 363.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { 
- "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=525.356, mean=525.356, max=525.356, sum=1050.711 (2)", - "tab": "General information", - "score": 525.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=392.013, mean=392.013, max=392.013, sum=784.025 (2)", - "tab": "General information", - "score": 392.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.457, mean=553.457, max=553.457, sum=1106.914 (2)", - "tab": "General information", - "score": 553.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.242, mean=488.242, max=488.242, sum=976.484 (2)", - "tab": "General information", - "score": 488.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=788.639, mean=788.639, max=788.639, sum=1577.278 (2)", - "tab": "General 
information", - "score": 788.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)", - "tab": "General information", - "score": 2210.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1421.173, mean=1421.173, max=1421.173, sum=2842.346 (2)", - "tab": "General information", - "score": 1421.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.756 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.346398046733018 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.3509944832051983 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=312.888, mean=312.888, max=312.888, sum=625.776 (2)", - "tab": "General information", - "score": 312.88789237668163 - }, - "Human Aging - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.168, mean=334.168, max=334.168, sum=668.336 (2)", - "tab": "General information", - "score": 334.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Efficiency", - "score": 0.39698751701796353 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=632.818, mean=632.818, max=632.818, sum=1265.636 (2)", - "tab": "General information", - "score": 632.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.74 (2)", - "tab": "Efficiency", - "score": 0.36976343722431204 - }, 
- "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.564, mean=442.564, max=442.564, sum=885.129 (2)", - "tab": "General information", - "score": 442.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714, - "details": { - "description": "min=0.714, mean=0.714, max=0.714, sum=1.429 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.401 (2)", - "tab": "Efficiency", - "score": 0.7002999080078942 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)", - "tab": "General information", - "score": 661.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.823 (2)", - "tab": "Efficiency", - "score": 0.41139175822433915 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, 
- "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.786, mean=276.786, max=276.786, sum=553.573 (2)", - "tab": "General information", - "score": 276.7864077669903 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Efficiency", - "score": 0.35977526811453014 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.218, mean=397.218, max=397.218, sum=794.436 (2)", - "tab": "General information", - "score": 397.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.796 (2)", - "tab": "Efficiency", - "score": 0.398222451210022 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=333.99, mean=333.99, max=333.99, sum=667.98 (2)", - "tab": "General information", - "score": 333.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.834 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.512, mean=0.512, max=0.512, sum=1.023 (2)", - "tab": "Efficiency", - "score": 0.5115468505089615 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=292.911, mean=292.911, max=292.911, sum=585.821 (2)", - "tab": "General information", - "score": 292.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598, - "details": { - "description": "min=0.598, mean=0.598, max=0.598, sum=1.196 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.792 (2)", - "tab": "Efficiency", - "score": 0.3959053982199961 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.462, mean=0.462, max=0.462, sum=0.924 (2)", - "tab": "Efficiency", - "score": 0.46180219543712764 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.113, mean=469.113, max=469.113, sum=938.225 (2)", - "tab": "General information", - "score": 469.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)", - "tab": "General information", - "score": 649.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.752 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.793 (2)", - "tab": "Efficiency", - "score": 0.3964238252515107 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.814, mean=579.814, max=579.814, sum=1159.627 (2)", - "tab": "General information", - "score": 579.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference 
time (s)": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.017 (2)", - "tab": "Efficiency", - "score": 0.50853196338371 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=507.528, mean=507.528, max=507.528, sum=1015.056 (2)", - "tab": "General information", - "score": 507.52777777777777 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.455 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Efficiency", - "score": 0.4018417878584428 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=398.318, mean=398.318, max=398.318, sum=796.636 (2)", - "tab": "General information", - "score": 398.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.665 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.653, mean=0.653, max=0.653, sum=1.306 (2)", - "tab": "Efficiency", - "score": 0.652998145745725 - }, - "Security Studies - # eval": 
{ - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)", - "tab": "General information", - "score": 1157.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.861 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.3602804935986723 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=438.517, mean=438.517, max=438.517, sum=877.035 (2)", - "tab": "General information", - "score": 438.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.181 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.743 (2)", - "tab": "Efficiency", - "score": 0.3714186226028994 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.018, mean=336.018, max=336.018, sum=672.036 (2)", - "tab": "General information", - "score": 336.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.906, - "details": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.813 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3325699170430501 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=267.52, mean=267.52, max=267.52, sum=535.041 (2)", - "tab": "General information", - "score": 267.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json b/data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json deleted file mode 100644 index 83f907e80..000000000 --- a/data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3-8b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": 
"documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3 8B", - "id": "meta/llama-3-8b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.668, - "details": { - "description": "min=0.33, mean=0.668, max=0.885, sum=76.111 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.288, mean=0.35, max=0.586, sum=39.916 (114)", - "tab": "Efficiency", - "score": 0.350140152719457 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=267.52, mean=607.619, max=2790.885, sum=69268.61 (114)", - "tab": "General information", - "score": 607.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - 
"mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.30905162572860717 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=366.43, mean=366.43, max=366.43, sum=732.86 (2)", - "tab": "General information", - "score": 366.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - 
"description": "min=0.696, mean=0.696, max=0.696, sum=1.393 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.577 (2)", - "tab": "Efficiency", - "score": 0.28846773041619195 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.874, mean=346.874, max=346.874, sum=693.748 (2)", - "tab": "General information", - "score": 346.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451, - "details": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.902 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.646 (2)", - "tab": "Efficiency", - "score": 0.3228257203102112 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.34339087539248997 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.733 (2)", - "tab": "Efficiency", - "score": 0.3662724041938782 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.64 (2)", - "tab": "Efficiency", - "score": 0.320071747303009 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.657 (2)", - "tab": "Efficiency", - "score": 0.32854826739757736 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.2994629471909766 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=542.28, mean=542.28, max=542.28, sum=1084.56 (2)", - "tab": "General information", - "score": 542.28 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 
1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=466.875, mean=466.875, max=466.875, sum=933.75 (2)", - "tab": "General information", - "score": 466.875 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=821.29, mean=821.29, max=821.29, sum=1642.58 (2)", - "tab": "General information", - "score": 821.29 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=587.51, mean=587.51, max=587.51, sum=1175.02 (2)", - "tab": "General information", - "score": 587.51 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=495.705, mean=495.705, max=495.705, sum=991.41 (2)", - "tab": "General information", - "score": 495.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=496.569, mean=496.569, 
max=496.569, sum=993.137 (2)", - "tab": "General information", - "score": 496.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Efficiency", - "score": 0.3068851590156555 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=371.51, mean=371.51, max=371.51, sum=743.02 (2)", - "tab": "General information", - "score": 371.51 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518, - "details": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.035 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.689 (2)", - "tab": "Efficiency", - "score": 0.3442605817527102 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=607.421, mean=607.421, max=607.421, sum=1214.842 (2)", - "tab": "General information", - "score": 607.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34, - "details": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.3109010863304138 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=392.71, mean=392.71, max=392.71, sum=785.42 (2)", - "tab": "General information", - "score": 392.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.481 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.32258448998133343 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.63, mean=387.63, max=387.63, sum=775.259 (2)", - "tab": "General information", - "score": 387.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", 
- "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.486 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.617 (2)", - "tab": "Efficiency", - "score": 0.3085632078900598 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.084, mean=322.084, max=322.084, sum=644.167 (2)", - "tab": "General information", - "score": 322.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.422 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.682 (2)", - "tab": "Efficiency", - "score": 0.34079881275401397 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.4504219800867933 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.857 (2)", - "tab": "Efficiency", - "score": 0.4285039446344587 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.752 (2)", - "tab": "Efficiency", - "score": 0.3759713149538227 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1087.489, 
mean=1087.489, max=1087.489, sum=2174.978 (2)", - "tab": "General information", - "score": 1087.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=651.585, mean=651.585, max=651.585, sum=1303.17 (2)", - "tab": "General information", - "score": 651.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1630.601, mean=1630.601, max=1630.601, sum=3261.202 (2)", - "tab": "General information", - "score": 1630.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=568.098, mean=568.098, max=568.098, sum=1136.196 (2)", - "tab": "General information", - "score": 568.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.599 (2)", - "tab": "Efficiency", - 
"score": 0.29950841665267947 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=415.79, mean=415.79, max=415.79, sum=831.58 (2)", - "tab": "General information", - "score": 415.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.421 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.077 (2)", - "tab": "Efficiency", - "score": 0.5385584250876778 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=572.684, mean=572.684, max=572.684, sum=1145.368 (2)", - "tab": "General information", - "score": 572.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.623 (2)", - "tab": "Efficiency", - "score": 0.311549117565155 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { 
- "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)", - "tab": "General information", - "score": 562.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.609 (2)", - "tab": "Efficiency", - "score": 0.3043576915309114 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=390.928, mean=390.928, max=390.928, sum=781.857 (2)", - "tab": "General information", - "score": 390.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.557, - "details": { - "description": "min=0.557, mean=0.557, max=0.557, sum=1.115 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.631 (2)", - "tab": "Efficiency", - "score": 0.31532351615581106 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=297.834, mean=297.834, max=297.834, sum=595.668 (2)", - "tab": "General information", - "score": 297.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669, - "details": { - "description": "min=0.669, mean=0.669, max=0.669, sum=1.338 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.635 (2)", - "tab": "Efficiency", - "score": 0.31737767910135206 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=428.607, mean=428.607, max=428.607, sum=857.214 (2)", - "tab": "General information", - "score": 428.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426, - "details": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.852 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.3080339734516447 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics 
- truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=524.854, mean=524.854, max=524.854, sum=1049.709 (2)", - "tab": "General information", - "score": 524.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.468, - "details": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.937 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.33724411328633624 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=594.778, mean=594.778, max=594.778, sum=1189.556 (2)", - "tab": "General information", - "score": 594.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823, - "details": { - "description": "min=0.823, mean=0.823, max=0.823, sum=1.646 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.3359520781424738 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.3092998248602956 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.649 (2)", - "tab": "Efficiency", - "score": 0.324708514213562 - }, - "High School European History - Observed 
inference time (s)": { - "description": "min=0.544, mean=0.544, max=0.544, sum=1.087 (2)", - "tab": "Efficiency", - "score": 0.5437044996203798 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.609 (2)", - "tab": "Efficiency", - "score": 0.30433518236333673 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Efficiency", - "score": 0.3192491321366068 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.31492268366691395 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.652 (2)", - "tab": "Efficiency", - "score": 0.3262451118893094 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.3451059505719097 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.541, mean=0.541, max=0.541, sum=1.082 (2)", - "tab": "Efficiency", - "score": 0.5410290490712552 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.757 (2)", - "tab": "Efficiency", - "score": 0.3786245923523509 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.986 (2)", - "tab": "Efficiency", - "score": 0.4927717314826118 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.481, mean=0.481, max=0.481, sum=0.962 (2)", - "tab": "Efficiency", - "score": 0.48103941655626486 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.516, mean=0.516, max=0.516, sum=1.032 (2)", - "tab": "Efficiency", - "score": 0.5161508246313168 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.671, mean=506.671, max=506.671, sum=1013.342 (2)", - "tab": "General information", - "score": 506.6709677419355 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=489.704, mean=489.704, max=489.704, sum=979.409 (2)", - "tab": "General information", - "score": 489.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)", - "tab": "General information", - "score": 860.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2790.885, mean=2790.885, max=2790.885, sum=5581.77 (2)", - "tab": "General information", - "score": 2790.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.035, mean=365.035, max=365.035, sum=730.071 (2)", - "tab": "General information", - "score": 365.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=458.824, mean=458.824, max=458.824, sum=917.648 (2)", - "tab": "General information", - "score": 458.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=363.908, mean=363.908, max=363.908, sum=727.815 (2)", - "tab": "General information", - "score": 363.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=525.356, mean=525.356, max=525.356, sum=1050.711 (2)", - "tab": "General information", - "score": 525.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=392.013, mean=392.013, max=392.013, sum=784.025 (2)", - "tab": "General information", - "score": 392.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.457, mean=553.457, max=553.457, sum=1106.914 (2)", - "tab": "General information", - "score": 553.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.242, mean=488.242, max=488.242, sum=976.484 (2)", - "tab": "General information", - "score": 488.2422018348624 - }, - 
"High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=788.639, mean=788.639, max=788.639, sum=1577.278 (2)", - "tab": "General information", - "score": 788.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)", - "tab": "General information", - "score": 2210.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1421.173, mean=1421.173, max=1421.173, sum=2842.346 (2)", - "tab": "General information", - "score": 1421.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.748, mean=0.748, max=0.748, sum=1.496 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.30269593080597607 - }, - "Human Sexuality - Observed inference time (s)": 
{ - "description": "min=0.325, mean=0.325, max=0.325, sum=0.651 (2)", - "tab": "Efficiency", - "score": 0.32543583862654124 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=312.888, mean=312.888, max=312.888, sum=625.776 (2)", - "tab": "General information", - "score": 312.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.168, mean=334.168, max=334.168, sum=668.336 (2)", - "tab": "General information", - "score": 334.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.586, mean=0.586, max=0.586, sum=1.172 (2)", - "tab": "Efficiency", - "score": 0.5860170076701267 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=632.818, mean=632.818, max=632.818, sum=1265.636 (2)", - "tab": "General information", - "score": 632.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - 
"evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.625 (2)", - "tab": "Efficiency", - "score": 0.31263120335303934 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.564, mean=442.564, max=442.564, sum=885.129 (2)", - "tab": "General information", - "score": 442.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.545, - "details": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.089 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.30891925522259306 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)", - "tab": "General information", - "score": 661.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.874, - "details": { - "description": "min=0.874, mean=0.874, max=0.874, sum=1.748 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.29801390703442027 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.786, mean=276.786, max=276.786, sum=553.573 (2)", - "tab": "General information", - "score": 276.7864077669903 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.769 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.29727030717409575 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.218, mean=397.218, max=397.218, sum=794.436 (2)", - "tab": "General information", - "score": 397.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.3011839747428894 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=333.99, mean=333.99, max=333.99, sum=667.98 (2)", - "tab": "General information", - "score": 333.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=1.663 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.703 (2)", - "tab": "Efficiency", - "score": 0.3515638007971519 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=292.911, mean=292.911, max=292.911, sum=585.821 (2)", - "tab": "General information", - "score": 292.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416, - "details": { - "description": "min=0.416, mean=0.416, max=0.416, 
sum=0.831 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.585 (2)", - "tab": "Efficiency", - "score": 0.2926361808887107 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.3287937753027378 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.113, mean=469.113, max=469.113, sum=938.225 (2)", - "tab": "General information", - "score": 469.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)", - "tab": "General information", - "score": 649.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761, - "details": { - "description": "min=0.761, mean=0.761, max=0.761, sum=1.523 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.3226836241927801 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.814, mean=579.814, max=579.814, sum=1159.627 (2)", - "tab": "General information", - "score": 579.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - 
}, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.738, - "details": { - "description": "min=0.738, mean=0.738, max=0.738, sum=1.475 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.2970340943630831 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=507.528, mean=507.528, max=507.528, sum=1015.056 (2)", - "tab": "General information", - "score": 507.52777777777777 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.473 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.649 (2)", - "tab": "Efficiency", - "score": 0.3247281486337835 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=398.318, mean=398.318, max=398.318, sum=796.636 (2)", - "tab": "General information", - "score": 398.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - 
{ - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.771, - "details": { - "description": "min=0.771, mean=0.771, max=0.771, sum=1.543 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.35109225779163594 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)", - "tab": "General information", - "score": 1157.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.866, - "details": { - "description": "min=0.866, mean=0.866, max=0.866, sum=1.731 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.31481776545889933 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=438.517, mean=438.517, max=438.517, sum=877.035 (2)", - "tab": "General information", - "score": 438.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.59 (2)", - "tab": "Efficiency", - "score": 0.2951422269085804 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.018, mean=336.018, max=336.018, sum=672.036 (2)", - "tab": "General information", - "score": 336.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.819, - "details": { - "description": "min=0.819, mean=0.819, max=0.819, sum=1.637 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.631 (2)", - "tab": "Efficiency", - "score": 0.3152559863196479 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=267.52, mean=267.52, max=267.52, sum=535.041 (2)", - "tab": "General information", - "score": 267.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms 
on average (over columns).",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.733,
-        "details": {
-          "tab": "Efficiency"
-        }
-      },
-      "generation_config": {
-        "additional_details": {}
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json b/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json
deleted file mode 100644
index c4ce37e9d..000000000
--- a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3.1-405b-instruct-turbo/1770835937.459157",
-  "retrieved_timestamp": "1770835937.459157",
-  "source_metadata": {
-    "source_name": "helm_mmlu",
-    "source_type": "documentation",
-    "source_organization_name": "crfm",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Llama 3.1 Instruct Turbo 405B",
-    "id": "meta/llama-3.1-405b-instruct-turbo",
-    "developer": "meta",
-    "inference_platform": "unknown"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "MMLU All Subjects",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on MMLU All Subjects",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.845,
-        "details": {
-          "description": "min=0.572, mean=0.845, max=0.984, sum=96.366 (114)",
-          "tab": "Accuracy",
-          "MMLU All Subjects - Observed inference time (s)": {
-            "description": "min=0.403, mean=0.685, max=1.366, sum=78.119 (114)",
-            "tab": "Efficiency",
-            "score": 0.6852569796494135
-          },
-          "MMLU All Subjects - # eval": {
-            "description": "min=100, mean=246.351, max=1534, sum=28084 (114)",
-            "tab": "General information",
-            "score": 246.35087719298247
-          },
-          "MMLU All Subjects - # train": {
-            "description": "min=5, mean=5, max=5, sum=570 (114)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "MMLU All Subjects - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (114)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "MMLU All Subjects - # prompt tokens": {
-            "description": "min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)",
-            "tab": "General information",
-            "score": 614.6193817308517
-          },
-          "MMLU All Subjects - # output tokens": {
-            "description": "min=1, mean=1, max=1, sum=114 (114)",
-            "tab": "General information",
-            "score": 1.0
-          }
-        }
-      },
-      "generation_config": {
-        "additional_details": {
-          "subject": [
-            "abstract_algebra",
-            "anatomy",
-            "astronomy",
-            "business_ethics",
-            "clinical_knowledge",
-            "college_biology",
-            "college_chemistry",
-            "college_computer_science",
-            "college_mathematics",
-            "college_medicine",
-            "college_physics",
-            "computer_security",
-            "conceptual_physics",
-            "econometrics",
-            "electrical_engineering",
-            "elementary_mathematics",
-            "formal_logic",
-            "global_facts",
-            "high_school_biology",
-            "high_school_chemistry",
-            "high_school_computer_science",
-            "high_school_european_history",
-            "high_school_geography",
-            "high_school_government_and_politics",
-            "high_school_macroeconomics",
-            "high_school_mathematics",
-            "high_school_microeconomics",
"high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.928 (2)", - "tab": "Efficiency", - "score": 0.4640246653556824 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt 
tokens": { - "description": "min=373.43, mean=373.43, max=373.43, sum=746.86 (2)", - "tab": "General information", - "score": 373.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.806 (2)", - "tab": "Efficiency", - "score": 0.4029027055810999 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.874, mean=353.874, max=353.874, sum=707.748 (2)", - "tab": "General information", - "score": 353.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.119 (2)", - "tab": "Efficiency", - "score": 0.5597123241424561 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.959 (2)", - "tab": "Efficiency", - "score": 0.4795056896077262 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.132 (2)", - "tab": "Efficiency", - "score": 0.5661771416664123 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.541, mean=0.541, max=0.541, sum=1.082 (2)", - "tab": "Efficiency", - "score": 0.5411620163917541 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.635, mean=0.635, max=0.635, sum=1.271 (2)", - "tab": "Efficiency", 
- "score": 0.6352733904226667 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.495, mean=0.495, max=0.495, sum=0.991 (2)", - "tab": "Efficiency", - "score": 0.4953400083616668 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)", - "tab": "General information", - "score": 549.28 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.875, mean=473.875, max=473.875, sum=947.75 (2)", - "tab": "General information", - "score": 473.875 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)", - "tab": "General information", - "score": 828.29 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)", - "tab": "General information", - "score": 594.51 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)", - "tab": "General information", - "score": 502.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)", - "tab": "General information", - "score": 503.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.003 (2)", - "tab": "Efficiency", - "score": 0.5016749453544617 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.51, mean=378.51, max=378.51, sum=757.02 (2)", - "tab": "General information", - "score": 378.51 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=1.491 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.598, mean=0.598, max=0.598, sum=1.195 (2)", - "tab": "Efficiency", - "score": 0.5976439986312598 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)", - "tab": "General information", - "score": 614.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.941 (2)", - "tab": "Efficiency", - "score": 0.4706212830543518 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.917, mean=0.917, max=0.917, 
sum=1.835 (2)", - "tab": "Efficiency", - "score": 0.9174331603226838 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.63, mean=394.63, max=394.63, sum=789.259 (2)", - "tab": "General information", - "score": 394.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.756 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.753, mean=0.753, max=0.753, sum=1.506 (2)", - "tab": "Efficiency", - "score": 0.7531090411342608 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.095, mean=1.095, max=1.095, sum=2.191 (2)", - "tab": "Efficiency", - "score": 1.0953595541855867 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.739, mean=0.739, max=0.739, 
sum=1.478 (2)", - "tab": "Efficiency", - "score": 0.7390724031637746 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.053, mean=1.053, max=1.053, sum=2.107 (2)", - "tab": "Efficiency", - "score": 1.0534205999337087 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.158 (2)", - "tab": "Efficiency", - "score": 0.5791019481771132 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)", - "tab": "General information", - "score": 1094.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)", - "tab": "General information", - "score": 658.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)", - "tab": "General information", - "score": 1637.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)", - "tab": "General information", - "score": 575.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Efficiency", - "score": 0.5199404859542847 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.921, - "details": { - "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.933 (2)", - "tab": "Efficiency", - "score": 0.46656754769777 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)", - "tab": "General information", - "score": 579.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.862 (2)", - "tab": "Efficiency", - "score": 0.4309411120414734 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.879, - "details": { - "description": "min=0.879, mean=0.879, max=0.879, sum=1.758 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.533, mean=0.533, max=0.533, sum=1.067 (2)", - "tab": "Efficiency", - "score": 0.5334792272099909 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.928, mean=397.928, max=397.928, sum=795.857 (2)", - "tab": "General information", - "score": 397.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.753 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.508, mean=0.508, max=0.508, sum=1.016 (2)", - "tab": "Efficiency", - "score": 0.5081663547678197 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.834, mean=304.834, max=304.834, sum=609.668 (2)", - "tab": "General information", - "score": 304.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.821, - "details": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.641 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.959 (2)", - "tab": "Efficiency", - "score": 0.47960921155995334 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=435.607, mean=435.607, max=435.607, sum=871.214 (2)", - "tab": "General information", - "score": 435.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.656 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.559, mean=0.559, max=0.559, sum=1.117 (2)", - "tab": "Efficiency", - "score": 0.5586125358702645 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)", - "tab": "General information", - "score": 531.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.698, mean=0.698, max=0.698, sum=1.397 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.505, mean=0.505, max=0.505, sum=1.011 (2)", - "tab": "Efficiency", - "score": 0.5053695440292358 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)", - "tab": "General information", - "score": 601.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.974 (2)", - "tab": "Efficiency", - "score": 0.48715837847801946 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.914 (2)", - "tab": "Efficiency", - "score": 0.45692210949113216 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.334 (2)", - "tab": "Efficiency", - "score": 0.6668596768379211 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.089, mean=1.089, max=1.089, sum=2.178 (2)", - "tab": "Efficiency", - "score": 1.0890785202835545 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.983 (2)", - "tab": "Efficiency", - "score": 0.49135766848169193 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.874 (2)", - "tab": "Efficiency", - "score": 0.4368582340102122 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.612, mean=0.612, max=0.612, sum=1.224 (2)", - "tab": "Efficiency", - "score": 0.6121874619752933 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.929, mean=0.929, max=0.929, sum=1.858 (2)", - "tab": "Efficiency", - "score": 0.9291445193467317 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.737, mean=0.737, max=0.737, sum=1.475 (2)", - "tab": "Efficiency", - "score": 0.7372911036515436 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.591, mean=0.591, max=0.591, sum=1.181 (2)", - "tab": "Efficiency", - "score": 0.5905803986732533 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.767 (2)", - "tab": "Efficiency", - "score": 0.8837221084384743 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.634, mean=0.634, max=0.634, sum=1.268 (2)", - "tab": "Efficiency", - "score": 0.6339434705398701 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.987 (2)", - "tab": "Efficiency", - "score": 0.9934839302418279 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.012, mean=1.012, max=1.012, sum=2.024 (2)", - "tab": "Efficiency", - "score": 1.0120529253271562 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # 
prompt tokens": { - "description": "min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)", - "tab": "General information", - "score": 513.6709677419354 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.704, mean=496.704, max=496.704, sum=993.409 (2)", - "tab": "General information", - "score": 496.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)", - "tab": "General information", - "score": 2797.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.035, mean=372.035, max=372.035, sum=744.071 (2)", - "tab": "General information", - "score": 372.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.908, mean=370.908, max=370.908, sum=741.815 (2)", - "tab": "General information", - "score": 370.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)", - "tab": "General information", - "score": 532.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.013, mean=399.013, max=399.013, sum=798.025 (2)", - "tab": "General information", - "score": 399.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)", - "tab": "General information", - "score": 560.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.242, mean=495.242, max=495.242, sum=990.484 (2)", - "tab": "General information", - "score": 495.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)", - "tab": "General information", - "score": 795.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)", - "tab": "General information", - "score": 1428.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.71 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.602, mean=0.602, max=0.602, sum=1.204 (2)", - "tab": "Efficiency", - "score": 0.6018790418257093 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.765, mean=0.765, max=0.765, sum=1.531 (2)", - "tab": "Efficiency", - "score": 0.7653163061797164 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.888, mean=319.888, max=319.888, sum=639.776 (2)", - "tab": "General information", - "score": 319.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.168, mean=341.168, max=341.168, sum=682.336 (2)", - "tab": "General information", - "score": 341.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.901 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.789, mean=0.789, max=0.789, 
sum=1.579 (2)", - "tab": "Efficiency", - "score": 0.7894663180201507 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)", - "tab": "General information", - "score": 639.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.847 (2)", - "tab": "Efficiency", - "score": 0.9234680895425059 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.564, mean=449.564, max=449.564, sum=899.129 (2)", - "tab": "General information", - "score": 449.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795, - "details": { - "description": "min=0.795, mean=0.795, max=0.795, sum=1.589 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.077, mean=1.077, max=1.077, sum=2.154 (2)", - "tab": "Efficiency", - "score": 1.0769924351147242 - }, - "Machine Learning - # eval": { - 
"description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.505, mean=0.505, max=0.505, sum=1.009 (2)", - "tab": "Efficiency", - "score": 0.5047070956924586 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.786, mean=283.786, max=283.786, sum=567.573 (2)", - "tab": "General information", - "score": 283.7864077669903 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=1.923 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.617, mean=0.617, max=0.617, sum=1.234 (2)", - "tab": "Efficiency", - "score": 0.6168569010547084 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.725, mean=0.725, max=0.725, sum=1.45 (2)", - "tab": "Efficiency", - "score": 0.7251019191741943 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.99, mean=340.99, max=340.99, sum=681.98 (2)", - "tab": "General information", - "score": 340.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.939, - "details": { - "description": "min=0.939, mean=0.939, max=0.939, sum=1.877 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.844, mean=0.844, max=0.844, sum=1.689 (2)", - "tab": "Efficiency", - "score": 0.8444620089208181 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # 
prompt tokens": { - "description": "min=299.911, mean=299.911, max=299.911, sum=599.821 (2)", - "tab": "General information", - "score": 299.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.752 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=1.366, mean=1.366, max=1.366, sum=2.732 (2)", - "tab": "Efficiency", - "score": 1.3659538754148979 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.749, mean=0.749, max=0.749, sum=1.498 (2)", - "tab": "Efficiency", - "score": 0.7492334496375569 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.113, mean=476.113, max=476.113, sum=952.225 (2)", - "tab": "General information", - "score": 476.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.856 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=1.217, mean=1.217, max=1.217, sum=2.433 (2)", - "tab": "Efficiency", - "score": 1.2165828491348067 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)", - "tab": "General information", - "score": 586.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.929, - "details": { - "description": "min=0.929, mean=0.929, max=0.929, sum=1.858 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.591, mean=0.591, max=0.591, sum=1.182 (2)", - "tab": "Efficiency", - "score": 0.5911465375511734 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)", - "tab": "General information", - "score": 514.5277777777778 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.818, - "details": { - "description": "min=0.818, mean=0.818, max=0.818, sum=1.636 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=1.129, mean=1.129, 
max=1.129, sum=2.258 (2)", - "tab": "Efficiency", - "score": 1.12924514467066 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.714 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.734, mean=0.734, max=0.734, sum=1.468 (2)", - "tab": "Efficiency", - "score": 0.7342344303520358 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.881 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.583, mean=0.583, max=0.583, sum=1.166 (2)", - "tab": "Efficiency", - "score": 0.5830918010787585 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, 
max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.517, mean=445.517, max=445.517, sum=891.035 (2)", - "tab": "General information", - "score": 445.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - "details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.967 (2)", - "tab": "Efficiency", - "score": 0.4834072029734232 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.018, mean=343.018, max=343.018, sum=686.036 (2)", - "tab": "General information", - "score": 343.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.906, - "details": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.813 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.967 (2)", - "tab": "Efficiency", - "score": 0.48364103328414826 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { 
- "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=274.52, mean=274.52, max=274.52, sum=549.041 (2)", - "tab": "General information", - "score": 274.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json b/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json deleted file mode 100644 index 0e4b849f9..000000000 --- a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-70b-instruct-turbo/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 70B", - "id": "meta/llama-3.1-70b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "details": { - "description": "min=0.404, mean=0.801, max=0.984, sum=91.318 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=2.517, mean=5.993, max=45.251, sum=683.146 (114)", - "tab": "Efficiency", - "score": 5.992510112833335 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)", - "tab": "General 
information", - "score": 614.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=1.1 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=45.251, mean=45.251, max=45.251, sum=90.501 (2)", - "tab": "Efficiency", - "score": 45.250502264499666 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.43, mean=373.43, max=373.43, sum=746.86 (2)", - "tab": "General information", - "score": 373.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=36.973, mean=36.973, max=36.973, sum=73.946 (2)", - "tab": "Efficiency", - "score": 36.97310272499367 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.874, mean=353.874, max=353.874, sum=707.748 (2)", - "tab": "General information", - "score": 353.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - 
"description": "min=0.559, mean=0.559, max=0.559, sum=1.118 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=4.774, mean=4.774, max=4.774, sum=9.548 (2)", - "tab": "Efficiency", - "score": 4.774094069004059 - }, - "College Biology - Observed inference time (s)": { - "description": "min=4.993, mean=4.993, max=4.993, sum=9.986 (2)", - "tab": "Efficiency", - "score": 4.992929225166638 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=4.499, mean=4.499, max=4.499, sum=8.999 (2)", - "tab": "Efficiency", - "score": 4.499426193237305 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=4.479, mean=4.479, max=4.479, sum=8.957 (2)", - "tab": "Efficiency", - "score": 4.478512156009674 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=3.886, mean=3.886, max=3.886, sum=7.773 (2)", - "tab": "Efficiency", - "score": 3.886489330688653 - }, - "College Physics - Observed inference time (s)": { - "description": "min=3.274, mean=3.274, max=3.274, sum=6.548 (2)", - "tab": "Efficiency", - "score": 3.2739863746306477 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)", - "tab": "General information", - "score": 549.28 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.875, mean=473.875, max=473.875, sum=947.75 (2)", - "tab": "General information", - "score": 473.875 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)", - "tab": "General information", - "score": 828.29 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General 
information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)", - "tab": "General information", - "score": 594.51 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)", - "tab": "General information", - "score": 502.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)", - "tab": "General information", - "score": 503.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=2.976, mean=2.976, max=2.976, sum=5.951 (2)", - "tab": "Efficiency", - "score": 2.9756615567207336 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # 
prompt tokens": { - "description": "min=378.51, mean=378.51, max=378.51, sum=757.02 (2)", - "tab": "General information", - "score": 378.51 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.351 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=4.295, mean=4.295, max=4.295, sum=8.59 (2)", - "tab": "Efficiency", - "score": 4.29522921327959 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)", - "tab": "General information", - "score": 614.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "details": { - "description": "min=0.61, mean=0.61, max=0.61, sum=1.22 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=3.637, mean=3.637, max=3.637, sum=7.275 (2)", - "tab": "Efficiency", - "score": 3.637417833805084 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=3.163, mean=3.163, max=3.163, sum=6.326 (2)", - "tab": "Efficiency", - "score": 3.1630651178183378 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.63, mean=394.63, max=394.63, sum=789.259 (2)", - "tab": "General information", - "score": 394.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.666 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=3.264, mean=3.264, max=3.264, sum=6.527 (2)", - "tab": "Efficiency", - "score": 3.2637280957875143 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.846, - "details": { - "description": "min=0.846, mean=0.846, max=0.846, sum=1.693 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=3.871, mean=3.871, max=3.871, sum=7.742 (2)", - "tab": "Efficiency", - "score": 3.8712061214096405 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=2.943, mean=2.943, max=2.943, sum=5.886 (2)", - "tab": "Efficiency", - "score": 2.9428400173254894 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=3.318, mean=3.318, max=3.318, sum=6.637 (2)", - "tab": "Efficiency", - "score": 3.318323635681978 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=3.102, mean=3.102, max=3.102, sum=6.203 (2)", - "tab": "Efficiency", - "score": 3.1015563872125416 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)", - "tab": "General information", - "score": 1094.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)", - "tab": "General information", - "score": 658.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)", - "tab": "General information", - "score": 
1637.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)", - "tab": "General information", - "score": 575.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=2.836, mean=2.836, max=2.836, sum=5.672 (2)", - "tab": "Efficiency", - "score": 2.835986142158508 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.908, mean=0.908, max=0.908, sum=1.816 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=33.307, mean=33.307, max=33.307, 
sum=66.613 (2)", - "tab": "Efficiency", - "score": 33.3065683904447 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)", - "tab": "General information", - "score": 579.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.44 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=34.272, mean=34.272, max=34.272, sum=68.544 (2)", - "tab": "Efficiency", - "score": 34.27190991640091 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=6.181, mean=6.181, max=6.181, sum=12.362 (2)", - "tab": "Efficiency", - "score": 6.18122723057585 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General 
information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.928, mean=397.928, max=397.928, sum=795.857 (2)", - "tab": "General information", - "score": 397.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.668 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=3.413, mean=3.413, max=3.413, sum=6.825 (2)", - "tab": "Efficiency", - "score": 3.412742525465945 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.834, mean=304.834, max=304.834, sum=609.668 (2)", - "tab": "General information", - "score": 304.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.49 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=4.146, mean=4.146, max=4.146, sum=8.292 (2)", - "tab": "Efficiency", - "score": 4.1461473415637835 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=435.607, mean=435.607, max=435.607, sum=871.214 (2)", - "tab": "General information", - "score": 435.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.701, - "details": { - "description": "min=0.701, mean=0.701, max=0.701, sum=1.402 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=4.13, mean=4.13, max=4.13, sum=8.261 (2)", - "tab": "Efficiency", - "score": 4.1303687221789485 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)", - "tab": "General information", - "score": 531.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.349 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=3.65, mean=3.65, max=3.65, sum=7.301 (2)", - "tab": "Efficiency", - "score": 3.6502806383465964 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)", - "tab": "General information", - "score": 601.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.937, - "details": { - "description": "min=0.937, mean=0.937, max=0.937, sum=1.873 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=4.179, mean=4.179, max=4.179, sum=8.357 (2)", - "tab": "Efficiency", - "score": 4.178504861554792 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=3.78, mean=3.78, max=3.78, sum=7.56 (2)", - "tab": "Efficiency", - "score": 3.779934604766921 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=4.276, mean=4.276, max=4.276, sum=8.553 (2)", - "tab": "Efficiency", - "score": 4.276434569358826 - }, - "High School European History - Observed inference time (s)": { - "description": "min=4.728, mean=4.728, max=4.728, sum=9.457 (2)", - "tab": "Efficiency", - "score": 4.7283261154637195 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=3.994, mean=3.994, max=3.994, sum=7.987 (2)", - "tab": "Efficiency", - "score": 3.993738304484974 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=4.056, mean=4.056, max=4.056, sum=8.111 (2)", - "tab": "Efficiency", - "score": 4.055596974229566 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=4.06, mean=4.06, max=4.06, sum=8.12 (2)", - "tab": "Efficiency", - "score": 4.059808598420559 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=4.211, mean=4.211, max=4.211, sum=8.422 (2)", - "tab": "Efficiency", - "score": 4.210984716592011 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=3.869, mean=3.869, max=3.869, sum=7.738 (2)", - "tab": "Efficiency", - "score": 3.8690204860783424 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=3.802, mean=3.802, max=3.802, sum=7.604 (2)", - "tab": "Efficiency", - "score": 3.801914532453019 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=3.897, mean=3.897, max=3.897, sum=7.793 (2)", - "tab": "Efficiency", - "score": 3.8966542169588423 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=3.5, mean=3.5, max=3.5, sum=6.999 (2)", - "tab": "Efficiency", - "score": 3.499593519502216 - }, - "High School 
US History - Observed inference time (s)": { - "description": "min=3.948, mean=3.948, max=3.948, sum=7.897 (2)", - "tab": "Efficiency", - "score": 3.948316371908375 - }, - "High School World History - Observed inference time (s)": { - "description": "min=3.316, mean=3.316, max=3.316, sum=6.632 (2)", - "tab": "Efficiency", - "score": 3.3161907819755974 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)", - "tab": "General information", - "score": 513.6709677419354 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.704, mean=496.704, max=496.704, sum=993.409 (2)", - "tab": "General information", - "score": 496.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)", - "tab": "General information", - "score": 2797.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School 
Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.035, mean=372.035, max=372.035, sum=744.071 (2)", - "tab": "General information", - "score": 372.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.908, mean=370.908, max=370.908, sum=741.815 (2)", - "tab": "General information", - "score": 370.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)", - "tab": "General information", - "score": 532.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.013, mean=399.013, max=399.013, sum=798.025 (2)", - "tab": "General information", - "score": 399.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)", - "tab": "General information", - "score": 560.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.242, mean=495.242, max=495.242, sum=990.484 (2)", - "tab": "General information", - "score": 495.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)", - "tab": "General information", - "score": 795.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)", - "tab": "General information", - "score": 1428.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.71 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=3.222, mean=3.222, max=3.222, sum=6.444 (2)", - "tab": "Efficiency", - "score": 3.2222468500180095 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=3.132, mean=3.132, max=3.132, sum=6.264 (2)", - "tab": "Efficiency", - "score": 3.1318228208381713 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.888, mean=319.888, max=319.888, sum=639.776 (2)", - "tab": "General information", - "score": 319.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.168, mean=341.168, max=341.168, sum=682.336 (2)", - "tab": "General information", - "score": 341.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - 
"subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.851 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=3.686, mean=3.686, max=3.686, sum=7.372 (2)", - "tab": "Efficiency", - "score": 3.68597848750343 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)", - "tab": "General information", - "score": 639.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.681 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=2.835, mean=2.835, max=2.835, sum=5.67 (2)", - "tab": "Efficiency", - "score": 2.834790670067255 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.564, mean=449.564, max=449.564, sum=899.129 (2)", - "tab": "General information", - "score": 449.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.393 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=2.82, mean=2.82, max=2.82, sum=5.639 (2)", - "tab": "Efficiency", - "score": 2.81969299699579 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=2.909, mean=2.909, max=2.909, sum=5.818 (2)", - "tab": "Efficiency", - "score": 2.9087865861874183 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.786, mean=283.786, max=283.786, sum=567.573 (2)", - "tab": "General information", - "score": 283.7864077669903 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.936, - "details": { - "description": "min=0.936, mean=0.936, max=0.936, sum=1.872 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=2.727, mean=2.727, max=2.727, sum=5.455 (2)", - "tab": "Efficiency", - "score": 2.7273036078510122 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=2.657, mean=2.657, max=2.657, sum=5.314 (2)", - "tab": "Efficiency", - "score": 2.656917359828949 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.99, mean=340.99, max=340.99, sum=681.98 (2)", - "tab": "General information", - "score": 340.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=3.308, mean=3.308, max=3.308, sum=6.616 (2)", - "tab": "Efficiency", - "score": 3.3082146720715713 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.911, mean=299.911, max=299.911, sum=599.821 (2)", - "tab": "General information", - "score": 299.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.667 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=2.926, mean=2.926, max=2.926, sum=5.852 (2)", - "tab": "Efficiency", - "score": 2.9259741898906024 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=3.608, mean=3.608, max=3.608, sum=7.216 (2)", - "tab": "Efficiency", - "score": 3.608134973248956 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.113, mean=476.113, max=476.113, sum=952.225 (2)", - "tab": "General information", - "score": 476.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, 
max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=3.56, mean=3.56, max=3.56, sum=7.12 (2)", - "tab": "Efficiency", - "score": 3.56020544089523 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)", - "tab": "General information", - "score": 586.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=3.546, mean=3.546, max=3.546, sum=7.091 (2)", - "tab": "Efficiency", - "score": 3.54565680247766 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)", - "tab": "General information", - "score": 514.5277777777778 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=3.03, mean=3.03, max=3.03, sum=6.06 (2)", - "tab": "Efficiency", - "score": 3.0301454305648803 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=2.949, mean=2.949, max=2.949, sum=5.898 (2)", - "tab": "Efficiency", - "score": 2.948831728526524 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.841 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=2.843, mean=2.843, max=2.843, sum=5.686 (2)", - "tab": "Efficiency", - "score": 2.842961254404552 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.517, mean=445.517, max=445.517, sum=891.035 (2)", - "tab": "General information", - "score": 445.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=3.05, mean=3.05, max=3.05, sum=6.101 (2)", - "tab": "Efficiency", - "score": 3.050425999135856 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.018, mean=343.018, max=343.018, sum=686.036 (2)", - "tab": "General information", - "score": 343.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.895, - "details": { - "description": "min=0.895, mean=0.895, max=0.895, sum=1.789 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=2.517, mean=2.517, max=2.517, sum=5.033 (2)", - "tab": "Efficiency", - "score": 2.5166666828400905 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=274.52, mean=274.52, max=274.52, sum=549.041 (2)", - "tab": "General information", - "score": 274.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.021, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json b/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json deleted file mode 100644 index 6c1d661d4..000000000 --- a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 8B", - "id": "meta/llama-3.1-8b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.26, mean=0.561, max=0.865, sum=63.912 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.202, mean=0.56, max=1.485, sum=63.854 (114)", - "tab": "Efficiency", - "score": 0.5601251981506405 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)", - "tab": "General information", - "score": 614.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - 
"mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.568 (2)", - "tab": "Efficiency", - "score": 0.28381933450698854 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.43, mean=373.43, max=373.43, sum=746.86 (2)", - "tab": "General information", - "score": 373.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459, - "details": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.919 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.646 (2)", - "tab": "Efficiency", - "score": 0.3231998196354619 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - 
"description": "min=353.874, mean=353.874, max=353.874, sum=707.748 (2)", - "tab": "General information", - "score": 353.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363, - "details": { - "description": "min=0.363, mean=0.363, max=0.363, sum=0.725 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.862 (2)", - "tab": "Efficiency", - "score": 0.43078258752822873 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.853 (2)", - "tab": "Efficiency", - "score": 0.42637243535783553 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.125 (2)", - "tab": "Efficiency", - "score": 0.5623248195648194 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.742 (2)", - "tab": "Efficiency", - "score": 0.3709776735305786 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.79 (2)", - "tab": "Efficiency", - "score": 0.3948341918129452 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.789 (2)", - "tab": "Efficiency", - "score": 0.39474552051693784 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)", - "tab": "General information", - "score": 549.28 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.875, mean=473.875, max=473.875, sum=947.75 (2)", - "tab": "General information", - "score": 473.875 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)", - "tab": "General information", - "score": 828.29 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)", - "tab": "General information", - "score": 594.51 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)", - "tab": "General information", - "score": 502.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)", - "tab": "General information", - "score": 503.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.867 (2)", - "tab": "Efficiency", - "score": 0.43369229555130007 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.51, mean=378.51, max=378.51, sum=757.02 (2)", - "tab": "General information", - "score": 378.51 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.351, - "details": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.742 (2)", - "tab": "Efficiency", - "score": 0.3707838414008157 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)", - "tab": "General information", - "score": 614.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.202, mean=0.202, max=0.202, sum=0.403 (2)", - "tab": "Efficiency", - "score": 0.2015515398979187 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.463 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=1.035, mean=1.035, max=1.035, sum=2.07 (2)", - "tab": "Efficiency", - "score": 1.0347525963076838 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.63, mean=394.63, max=394.63, sum=789.259 (2)", - "tab": "General information", - "score": 394.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, 
sum=1.28 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.681, mean=0.681, max=0.681, sum=1.363 (2)", - "tab": "Efficiency", - "score": 0.6814629341628391 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.297 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.546, mean=0.546, max=0.546, sum=1.091 (2)", - "tab": "Efficiency", - "score": 0.5456299475010704 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.538, mean=0.538, max=0.538, sum=1.077 (2)", - "tab": "Efficiency", - "score": 0.5383730044601657 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.881, mean=0.881, max=0.881, sum=1.762 (2)", - "tab": "Efficiency", - "score": 0.8808572895368355 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.388 (2)", - "tab": "Efficiency", - "score": 0.6941978611977272 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)", - "tab": "General information", - "score": 1094.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)", - "tab": "General information", - "score": 658.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)", - "tab": "General information", - "score": 1637.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)", - "tab": "General information", - "score": 575.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=1.135 (2)", - "tab": "Efficiency", - "score": 0.5673955392837524 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, 
sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645, - "details": { - "description": "min=0.645, mean=0.645, max=0.645, sum=1.289 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.634 (2)", - "tab": "Efficiency", - "score": 0.3168644199245854 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)", - "tab": "General information", - "score": 579.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.888 (2)", - "tab": "Efficiency", - "score": 0.44396358251571655 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - 
} - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615, - "details": { - "description": "min=0.615, mean=0.615, max=0.615, sum=1.23 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.738 (2)", - "tab": "Efficiency", - "score": 0.3692442273193935 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.928, mean=397.928, max=397.928, sum=795.857 (2)", - "tab": "General information", - "score": 397.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.528, - "details": { - "description": "min=0.528, mean=0.528, max=0.528, sum=1.055 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.701 (2)", - "tab": "Efficiency", - "score": 0.35051030605397326 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.834, mean=304.834, max=304.834, sum=609.668 (2)", - "tab": "General information", - "score": 304.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.441, - "details": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.883 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.34982287637118636 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=435.607, mean=435.607, max=435.607, sum=871.214 (2)", - "tab": "General information", - "score": 435.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429, - "details": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.857 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.801 (2)", - "tab": "Efficiency", - "score": 0.4003569991500289 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)", - "tab": "General information", - "score": 531.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.714 (2)", - "tab": "Efficiency", - "score": 0.35707327108534553 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)", - "tab": "General information", - "score": 601.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.515, - "details": { - "description": "min=0.515, mean=0.515, max=0.515, sum=1.03 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.211, mean=0.211, max=0.211, sum=0.423 (2)", - "tab": "Efficiency", - "score": 0.21137587870320967 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.211, mean=0.211, max=0.211, sum=0.423 (2)", - "tab": "Efficiency", - "score": 0.2113605567387172 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.214, mean=0.214, max=0.214, sum=0.428 (2)", - "tab": "Efficiency", - "score": 0.2138903546333313 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.664 (2)", - "tab": "Efficiency", - "score": 0.33188523668231384 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.218, mean=0.218, max=0.218, sum=0.435 (2)", - "tab": "Efficiency", - "score": 0.21753037818754561 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.558, mean=0.558, max=0.558, sum=1.117 (2)", - "tab": "Efficiency", - "score": 0.558492410985917 - }, - 
"High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.703, mean=0.703, max=0.703, sum=1.407 (2)", - "tab": "Efficiency", - "score": 0.7033225890917656 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.299 (2)", - "tab": "Efficiency", - "score": 0.6494572189119127 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.612, mean=0.612, max=0.612, sum=1.223 (2)", - "tab": "Efficiency", - "score": 0.6115654797113242 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.564, mean=0.564, max=0.564, sum=1.127 (2)", - "tab": "Efficiency", - "score": 0.5636763351642533 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.681, mean=0.681, max=0.681, sum=1.363 (2)", - "tab": "Efficiency", - "score": 0.6813242522948378 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.606, mean=0.606, max=0.606, sum=1.212 (2)", - "tab": "Efficiency", - "score": 0.6060926814874014 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.122, mean=1.122, max=1.122, sum=2.244 (2)", - "tab": "Efficiency", - "score": 1.1218917334780973 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.538, mean=0.538, max=0.538, sum=1.076 (2)", - "tab": "Efficiency", - "score": 0.5378943324592043 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)", - "tab": "General information", - "score": 513.6709677419354 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.704, mean=496.704, max=496.704, sum=993.409 (2)", - "tab": "General information", - "score": 496.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt 
tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)", - "tab": "General information", - "score": 2797.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.035, mean=372.035, max=372.035, sum=744.071 (2)", - "tab": "General information", - "score": 372.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.908, mean=370.908, max=370.908, sum=741.815 (2)", - "tab": "General information", - "score": 370.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, 
- "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)", - "tab": "General information", - "score": 532.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.013, mean=399.013, max=399.013, sum=798.025 (2)", - "tab": "General information", - "score": 399.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)", - "tab": "General information", - "score": 560.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.242, mean=495.242, max=495.242, sum=990.484 (2)", - "tab": "General information", - "score": 495.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)", - "tab": "General information", - "score": 795.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)", - "tab": "General information", - "score": 1428.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.685, mean=0.685, max=0.685, sum=1.369 (2)", - "tab": "Efficiency", - "score": 0.6845707412257858 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=1.227, mean=1.227, max=1.227, sum=2.455 (2)", - "tab": "Efficiency", - "score": 1.2273387745136524 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"Human Aging - # prompt tokens": { - "description": "min=319.888, mean=319.888, max=319.888, sum=639.776 (2)", - "tab": "General information", - "score": 319.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.168, mean=341.168, max=341.168, sum=682.336 (2)", - "tab": "General information", - "score": 341.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.388 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.369 (2)", - "tab": "Efficiency", - "score": 0.6842782950598346 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)", - "tab": "General information", - "score": 639.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=1.485 (2)", - 
"tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=1.35, mean=1.35, max=1.35, sum=2.7 (2)", - "tab": "Efficiency", - "score": 1.3501118970063566 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.564, mean=449.564, max=449.564, sum=899.129 (2)", - "tab": "General information", - "score": 449.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.384, - "details": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.919 (2)", - "tab": "Efficiency", - "score": 0.45964209735393524 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.417 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.481, mean=0.481, max=0.481, sum=0.963 (2)", - 
"tab": "Efficiency", - "score": 0.48132226536574874 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.786, mean=283.786, max=283.786, sum=567.573 (2)", - "tab": "General information", - "score": 283.7864077669903 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.529, mean=0.529, max=0.529, sum=1.059 (2)", - "tab": "Efficiency", - "score": 0.5294545297948723 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.521, mean=0.521, max=0.521, sum=1.041 (2)", - "tab": "Efficiency", - "score": 0.520596706867218 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.99, mean=340.99, max=340.99, sum=681.98 (2)", - "tab": "General information", - "score": 340.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653, - "details": { - "description": "min=0.653, mean=0.653, max=0.653, sum=1.305 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.606 (2)", - "tab": "Efficiency", - "score": 0.8030396217282857 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.911, mean=299.911, max=299.911, sum=599.821 (2)", - "tab": "General information", - "score": 299.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.368, - "details": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.735 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.657, mean=0.657, max=0.657, sum=1.314 (2)", - "tab": "Efficiency", - "score": 0.6570079657383737 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.299 (2)", - "tab": "Efficiency", - "score": 0.649639103266114 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.113, mean=476.113, max=476.113, sum=952.225 (2)", - "tab": "General information", - "score": 476.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.712, - "details": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.425 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=1.485, mean=1.485, max=1.485, sum=2.971 (2)", - "tab": "Efficiency", - "score": 1.4853957338270798 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)", - "tab": "General information", - "score": 586.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.728, mean=0.728, max=0.728, sum=1.457 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.792, mean=0.792, max=0.792, sum=1.584 (2)", - "tab": "Efficiency", - "score": 0.7917959955003526 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)", - "tab": "General information", - "score": 514.5277777777778 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664, - "details": { - "description": "min=0.664, mean=0.664, max=0.664, sum=1.327 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.986 (2)", - "tab": "Efficiency", - "score": 0.49318039634011007 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.576, - "details": { - "description": "min=0.576, mean=0.576, max=0.576, sum=1.151 (2)", - "tab": 
"Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.312 (2)", - "tab": "Efficiency", - "score": 0.6561975401275012 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.701, - "details": { - "description": "min=0.701, mean=0.701, max=0.701, sum=1.403 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.034 (2)", - "tab": "Efficiency", - "score": 0.5170851643405744 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.517, mean=445.517, max=445.517, sum=891.035 (2)", - "tab": "General information", - "score": 445.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446, - "details": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.892 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.813 (2)", - "tab": "Efficiency", - "score": 0.40646702553852493 - }, - "Virology - # 
eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.018, mean=343.018, max=343.018, sum=686.036 (2)", - "tab": "General information", - "score": 343.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.587, mean=0.587, max=0.587, sum=1.173 (2)", - "tab": "Efficiency", - "score": 0.5866640882882458 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=274.52, mean=274.52, max=274.52, sum=549.041 (2)", - "tab": "General information", - "score": 274.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json b/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json deleted file mode 100644 index 599cd6855..000000000 --- 
a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.2-11b-vision-instruct-turbo/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo 11B", - "id": "meta/llama-3.2-11b-vision-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "details": { - "description": "min=0.25, mean=0.565, max=0.865, sum=64.419 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.204, mean=0.255, max=0.726, sum=29.095 (114)", - "tab": "Efficiency", - "score": 0.2552187424358169 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)", - "tab": "General information", - "score": 614.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - 
"security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.28, - "details": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.56 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.454 (2)", - "tab": "Efficiency", - "score": 0.2272411847114563 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.43, mean=373.43, max=373.43, sum=746.86 (2)", - "tab": "General information", - "score": 373.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.533, - "details": { - "description": "min=0.533, mean=0.533, max=0.533, sum=1.067 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.222, mean=0.222, max=0.222, sum=0.443 (2)", - "tab": "Efficiency", - "score": 0.22151856069211606 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.874, mean=353.874, max=353.874, sum=707.748 (2)", - "tab": "General information", - "score": 353.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333, - "details": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.667 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.453 (2)", - "tab": "Efficiency", - "score": 0.7264108276367187 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.244, mean=0.244, max=0.244, sum=0.488 (2)", - "tab": "Efficiency", - "score": 0.24387328988975948 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.216, mean=0.216, max=0.216, sum=0.433 (2)", - "tab": "Efficiency", - "score": 0.21631600618362426 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.219, mean=0.219, max=0.219, sum=0.437 (2)", - "tab": "Efficiency", - "score": 0.21859397411346435 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.22, mean=0.22, max=0.22, sum=0.439 (2)", - "tab": "Efficiency", - "score": 0.21971637665191826 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.226, mean=0.226, max=0.226, sum=0.452 (2)", - "tab": "Efficiency", - "score": 0.22610483683791816 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)", - "tab": "General information", - "score": 549.28 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.875, mean=473.875, max=473.875, sum=947.75 (2)", - "tab": "General information", - "score": 473.875 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)", - "tab": "General information", - "score": 828.29 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)", - "tab": "General information", - "score": 594.51 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)", - "tab": "General information", - "score": 502.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, 
sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)", - "tab": "General information", - "score": 503.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.469, mean=0.469, max=0.469, sum=0.938 (2)", - "tab": "Efficiency", - "score": 0.4692394161224365 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.51, mean=378.51, max=378.51, sum=757.02 (2)", - "tab": "General information", - "score": 378.51 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395, - "details": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.789 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.226, mean=0.226, max=0.226, sum=0.451 (2)", - "tab": "Efficiency", - "score": 0.22570312023162842 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)", - "tab": "General information", - "score": 614.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25, - "details": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.497 (2)", - "tab": "Efficiency", - "score": 0.24868298768997193 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "description": "min=0.722, mean=0.722, max=0.722, sum=1.444 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.409 (2)", - "tab": "Efficiency", - "score": 0.20448691756637008 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt 
tokens": { - "description": "min=394.63, mean=394.63, max=394.63, sum=789.259 (2)", - "tab": "General information", - "score": 394.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.646, - "details": { - "description": "min=0.646, mean=0.646, max=0.646, sum=1.293 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.216, mean=0.216, max=0.216, sum=0.433 (2)", - "tab": "Efficiency", - "score": 0.21639636628497452 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.297 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.613 (2)", - "tab": "Efficiency", - "score": 0.30631748893681693 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.236, mean=0.236, max=0.236, sum=0.472 (2)", - "tab": "Efficiency", - "score": 0.23619349882112328 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.581 (2)", - "tab": "Efficiency", - "score": 0.2907135481940099 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.465 (2)", - "tab": "Efficiency", - "score": 0.23272827988356545 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, 
max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)", - "tab": "General information", - "score": 1094.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)", - "tab": "General information", - "score": 658.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)", - "tab": "General information", - "score": 1637.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)", - "tab": "General information", - "score": 575.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { 
- "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.765 (2)", - "tab": "Efficiency", - "score": 0.3825261640548706 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671, - "details": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.342 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.497 (2)", - "tab": "Efficiency", - "score": 0.24860012060717532 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)", - "tab": "General information", - "score": 579.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": 
{ - "description": "min=0.64, mean=0.64, max=0.64, sum=1.28 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.462 (2)", - "tab": "Efficiency", - "score": 0.23080476760864257 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638, - "details": { - "description": "min=0.638, mean=0.638, max=0.638, sum=1.275 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.23, mean=0.23, max=0.23, sum=0.46 (2)", - "tab": "Efficiency", - "score": 0.22993840721418274 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.928, mean=397.928, max=397.928, sum=795.857 (2)", - "tab": "General information", - "score": 397.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536, - "details": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed 
inference time (s)": { - "description": "min=0.221, mean=0.221, max=0.221, sum=0.441 (2)", - "tab": "Efficiency", - "score": 0.2206148127292065 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.834, mean=304.834, max=304.834, sum=609.668 (2)", - "tab": "General information", - "score": 304.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.021 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.461 (2)", - "tab": "Efficiency", - "score": 0.23056076312887258 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=435.607, mean=435.607, max=435.607, sum=871.214 (2)", - "tab": "General information", - "score": 435.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.458, - "details": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.915 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - 
"description": "min=0.224, mean=0.224, max=0.224, sum=0.447 (2)", - "tab": "Efficiency", - "score": 0.22350322569488848 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)", - "tab": "General information", - "score": 531.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.921 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.458 (2)", - "tab": "Efficiency", - "score": 0.22878488661750915 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)", - "tab": "General information", - "score": 601.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.004 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.225, mean=0.225, max=0.225, sum=0.449 (2)", - "tab": "Efficiency", - "score": 
0.22474505209153697 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.212, mean=0.212, max=0.212, sum=0.424 (2)", - "tab": "Efficiency", - "score": 0.21204462192328694 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.23, mean=0.23, max=0.23, sum=0.461 (2)", - "tab": "Efficiency", - "score": 0.2303963828086853 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.574 (2)", - "tab": "Efficiency", - "score": 0.28706942760583126 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.458 (2)", - "tab": "Efficiency", - "score": 0.22903898388448388 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.469 (2)", - "tab": "Efficiency", - "score": 0.23445281092984688 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.459 (2)", - "tab": "Efficiency", - "score": 0.22930157551398644 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.23, mean=0.23, max=0.23, sum=0.46 (2)", - "tab": "Efficiency", - "score": 0.23021557595994738 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.471 (2)", - "tab": "Efficiency", - "score": 0.2354360087579038 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.458 (2)", - "tab": "Efficiency", - "score": 0.22899133953827105 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.454 (2)", - "tab": "Efficiency", - "score": 0.22700285386601718 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.24, mean=0.24, max=0.24, sum=0.48 (2)", - "tab": "Efficiency", - "score": 0.2400491248678278 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.506 (2)", - "tab": "Efficiency", - "score": 0.2529456720632665 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.499 (2)", - "tab": "Efficiency", - "score": 0.249685173799217 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)", - "tab": "General information", - "score": 513.6709677419354 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School 
Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.704, mean=496.704, max=496.704, sum=993.409 (2)", - "tab": "General information", - "score": 496.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)", - "tab": "General information", - "score": 2797.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.035, mean=372.035, max=372.035, sum=744.071 (2)", - "tab": "General information", - "score": 372.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 
- }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.908, mean=370.908, max=370.908, sum=741.815 (2)", - "tab": "General information", - "score": 370.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)", - "tab": "General information", - "score": 532.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.013, mean=399.013, max=399.013, sum=798.025 (2)", - "tab": "General information", - "score": 399.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)", - "tab": "General information", - "score": 560.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.242, mean=495.242, max=495.242, sum=990.484 (2)", - "tab": "General information", - "score": 495.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)", - "tab": "General information", - "score": 795.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)", - "tab": "General information", - "score": 1428.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763, - "details": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.527 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.32235514315789054 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.245, mean=0.245, max=0.245, sum=0.49 (2)", - "tab": "Efficiency", - "score": 0.24487258095777673 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.888, mean=319.888, max=319.888, sum=639.776 (2)", - "tab": "General information", - "score": 319.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.168, mean=341.168, max=341.168, sum=682.336 (2)", - "tab": "General information", - "score": 341.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.421 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.462 (2)", - "tab": "Efficiency", - "score": 0.23109814943360887 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.818, mean=639.818, max=639.818, 
sum=1279.636 (2)", - "tab": "General information", - "score": 639.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=1.485 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.22, mean=0.22, max=0.22, sum=0.44 (2)", - "tab": "Efficiency", - "score": 0.21997687714231526 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.564, mean=449.564, max=449.564, sum=899.129 (2)", - "tab": "General information", - "score": 449.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375, - "details": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.75 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.467 (2)", - "tab": "Efficiency", - "score": 0.2336032326732363 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output 
tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.728, mean=0.728, max=0.728, sum=1.456 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.246, mean=0.246, max=0.246, sum=0.491 (2)", - "tab": "Efficiency", - "score": 0.24564221067335998 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.786, mean=283.786, max=283.786, sum=567.573 (2)", - "tab": "General information", - "score": 283.7864077669903 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=1.675 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.537 (2)", - "tab": "Efficiency", - "score": 0.26863190149649596 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.273, mean=0.273, max=0.273, sum=0.546 (2)", - "tab": "Efficiency", - "score": 0.2728374266624451 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.99, mean=340.99, max=340.99, sum=681.98 (2)", - "tab": "General information", - "score": 340.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { - "description": "min=0.644, mean=0.644, max=0.644, sum=1.287 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.673 (2)", - "tab": "Efficiency", - "score": 0.33641790095264734 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.911, mean=299.911, max=299.911, sum=599.821 (2)", - "tab": "General information", - "score": 299.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328, - "details": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.657 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.243, mean=0.243, max=0.243, sum=0.486 (2)", - "tab": "Efficiency", - "score": 0.24306911126726624 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.458 (2)", - "tab": "Efficiency", - "score": 0.2289134478435836 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.113, mean=476.113, max=476.113, sum=952.225 (2)", - "tab": "General information", - "score": 476.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.503 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.24, mean=0.24, max=0.24, sum=0.48 (2)", - "tab": "Efficiency", - "score": 0.2399757040871514 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)", - "tab": "General information", - "score": 586.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.744, - "details": { - "description": "min=0.744, mean=0.744, max=0.744, sum=1.488 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.457 (2)", - "tab": "Efficiency", - "score": 0.2287170680952661 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)", - "tab": "General information", - "score": 514.5277777777778 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645, - "details": { - "description": "min=0.645, mean=0.645, max=0.645, sum=1.291 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.246, mean=0.246, max=0.246, sum=0.491 (2)", - "tab": "Efficiency", - "score": 0.24565653367476029 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, 
sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "description": "min=0.567, mean=0.567, max=0.567, sum=1.135 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.506 (2)", - "tab": "Efficiency", - "score": 0.25285910878862655 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627, - "details": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.254 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.468 (2)", - "tab": "Efficiency", - "score": 0.23380224503094876 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.517, mean=445.517, max=445.517, sum=891.035 (2)", - "tab": "General information", - "score": 445.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446, - "details": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.892 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.223, mean=0.223, max=0.223, sum=0.447 (2)", - "tab": "Efficiency", - "score": 0.22334270161318492 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.018, mean=343.018, max=343.018, sum=686.036 (2)", - "tab": "General information", - "score": 343.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.239, mean=0.239, max=0.239, sum=0.478 (2)", - "tab": "Efficiency", - "score": 0.23875254357767384 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=274.52, mean=274.52, max=274.52, sum=549.041 (2)", - "tab": "General information", - "score": 274.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.897, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json b/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json deleted file mode 100644 index f14700c78..000000000 --- a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo 90B", - "id": "meta/llama-3.2-90b-vision-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.803, - "details": { - "description": "min=0.407, mean=0.803, max=0.979, sum=91.503 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.256, mean=0.374, max=2.612, sum=42.58 (114)", - "tab": "Efficiency", - "score": 0.37350966276831277 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)", - "tab": "General information", - "score": 614.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - 
"econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=2.612, mean=2.612, max=2.612, sum=5.224 (2)", - "tab": "Efficiency", - "score": 2.611864836215973 - }, - "Abstract Algebra - # eval": { - "description": "min=100, 
mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.43, mean=373.43, max=373.43, sum=746.86 (2)", - "tab": "General information", - "score": 373.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.3359027315069128 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.874, mean=353.874, max=353.874, sum=707.748 (2)", - "tab": "General information", - "score": 353.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539, - "details": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.078 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.3104448890686035 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.544 (2)", - "tab": "Efficiency", - "score": 0.2720499005582597 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 
(2)", - "tab": "Efficiency", - "score": 0.32119542360305786 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.31477957487106323 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.283, mean=0.283, max=0.283, sum=0.566 (2)", - "tab": "Efficiency", - "score": 0.28313319255850905 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.634 (2)", - "tab": "Efficiency", - "score": 0.31692570097306194 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)", - "tab": "General information", - "score": 549.28 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.875, mean=473.875, max=473.875, sum=947.75 (2)", - "tab": "General information", - "score": 473.875 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)", - "tab": "General information", - "score": 828.29 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)", - "tab": "General information", - "score": 594.51 - }, - "College Mathematics - # output tokens": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)", - "tab": "General information", - "score": 502.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)", - "tab": "General information", - "score": 503.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.532 (2)", - "tab": "Efficiency", - "score": 0.26576273441314696 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.51, mean=378.51, max=378.51, sum=757.02 (2)", - "tab": "General information", - "score": 378.51 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.2972530210227297 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)", - "tab": "General information", - "score": 614.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.533 (2)", - "tab": "Efficiency", - "score": 0.2666162133216858 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { 
- "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.558 (2)", - "tab": "Efficiency", - "score": 0.278864703796528 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.63, mean=394.63, max=394.63, sum=789.259 (2)", - "tab": "General information", - "score": 394.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.839, - "details": { - "description": "min=0.839, mean=0.839, max=0.839, sum=1.678 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.29689135582117404 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - 
"details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.553, mean=0.553, max=0.553, sum=1.106 (2)", - "tab": "Efficiency", - "score": 0.5529017465956071 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.32346555189038 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.743 (2)", - "tab": "Efficiency", - "score": 0.3715069820859131 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.3151663907992294 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)", - "tab": "General information", - "score": 1094.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)", - "tab": "General information", - "score": 658.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)", - "tab": "General information", - "score": 1637.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)", - "tab": "General information", - "score": 575.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.507, mean=0.507, max=0.507, sum=1.014 (2)", - "tab": "Efficiency", - "score": 0.5069083476066589 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.921, - "details": { - "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3323579352152975 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.684, 
mean=579.684, max=579.684, sum=1159.368 (2)", - "tab": "General information", - "score": 579.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.581 (2)", - "tab": "Efficiency", - "score": 0.29072295665740966 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.579 (2)", - "tab": "Efficiency", - "score": 0.2897273891376999 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.928, mean=397.928, max=397.928, sum=795.857 (2)", - "tab": "General information", - "score": 397.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.651 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.559 (2)", - "tab": "Efficiency", - "score": 0.2794749209221373 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.834, mean=304.834, max=304.834, sum=609.668 (2)", - "tab": "General information", - "score": 304.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.517 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.512 (2)", - "tab": "Efficiency", - "score": 0.2558267790695717 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=435.607, mean=435.607, max=435.607, sum=871.214 (2)", - "tab": "General information", - "score": 435.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688, - "details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.376 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.617 (2)", - "tab": "Efficiency", - "score": 0.30840403945357714 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)", - "tab": "General information", - "score": 531.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.683, mean=0.683, max=0.683, sum=1.365 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.609 (2)", - "tab": "Efficiency", - "score": 0.30448357074979754 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)", - "tab": "General information", - "score": 601.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { 
- "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.3094667688492806 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.29394797386207017 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.30106969356536867 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.96 (2)", - "tab": "Efficiency", - "score": 0.4799844944115841 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.29747620014229204 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.583 (2)", - "tab": "Efficiency", - "score": 0.2914604300662026 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.557 (2)", - "tab": "Efficiency", - "score": 0.27857950650728663 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.625 (2)", - "tab": "Efficiency", - "score": 0.3123831342767786 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.603 (2)", - "tab": "Efficiency", - "score": 0.30159517997453195 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.32152655108874995 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.581 (2)", - "tab": "Efficiency", - "score": 0.2903494253071076 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.667 (2)", - "tab": "Efficiency", - "score": 0.33328031720938506 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.788 (2)", - "tab": "Efficiency", - "score": 0.39396579826579375 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.679, mean=0.679, max=0.679, sum=1.359 (2)", - "tab": "Efficiency", - "score": 0.6793377369265013 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School 
Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)", - "tab": "General information", - "score": 513.6709677419354 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.704, mean=496.704, max=496.704, sum=993.409 (2)", - "tab": "General information", - "score": 496.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)", - "tab": "General information", - "score": 2797.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.035, mean=372.035, 
max=372.035, sum=744.071 (2)", - "tab": "General information", - "score": 372.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.908, mean=370.908, max=370.908, sum=741.815 (2)", - "tab": "General information", - "score": 370.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)", - "tab": "General information", - "score": 532.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.013, mean=399.013, max=399.013, sum=798.025 (2)", - "tab": "General information", - "score": 399.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - 
"description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)", - "tab": "General information", - "score": 560.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.242, mean=495.242, max=495.242, sum=990.484 (2)", - "tab": "General information", - "score": 495.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)", - "tab": "General information", - "score": 795.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"High School World History - # prompt tokens": { - "description": "min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)", - "tab": "General information", - "score": 1428.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.776 (2)", - "tab": "Efficiency", - "score": 0.38789880863754206 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.586 (2)", - "tab": "Efficiency", - "score": 0.2929920222013051 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.888, mean=319.888, max=319.888, sum=639.776 (2)", - "tab": "General information", - "score": 319.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.168, mean=341.168, max=341.168, sum=682.336 (2)", - "tab": "General information", - "score": 341.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.685 (2)", - "tab": "Efficiency", - "score": 0.34241620962284813 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)", - "tab": "General information", - "score": 639.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.669 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.565 (2)", - "tab": "Efficiency", - "score": 0.28232605325663745 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.564, mean=449.564, max=449.564, sum=899.129 (2)", - "tab": "General information", - "score": 449.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688, - 
"details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.375 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.676 (2)", - "tab": "Efficiency", - "score": 0.33782388057027546 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.571 (2)", - "tab": "Efficiency", - "score": 0.2853238027072647 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.786, mean=283.786, max=283.786, sum=567.573 (2)", - "tab": "General information", - "score": 283.7864077669903 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.944, - "details": { - "description": "min=0.944, mean=0.944, max=0.944, sum=1.889 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.28, mean=0.28, max=0.28, 
sum=0.561 (2)", - "tab": "Efficiency", - "score": 0.28032574796269083 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.592 (2)", - "tab": "Efficiency", - "score": 0.29611136198043825 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.99, mean=340.99, max=340.99, sum=681.98 (2)", - "tab": "General information", - "score": 340.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.3237126984967735 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - 
"score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.911, mean=299.911, max=299.911, sum=599.821 (2)", - "tab": "General information", - "score": 299.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - "details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.683 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Efficiency", - "score": 0.2901734975032035 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=1.012 (2)", - "tab": "Efficiency", - "score": 0.5058047955262595 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.113, mean=476.113, max=476.113, sum=952.225 (2)", - "tab": "General information", - "score": 476.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.641 (2)", - "tab": "Efficiency", - "score": 0.32064209264867444 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)", - "tab": "General information", - "score": 586.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.227 (2)", - "tab": "Efficiency", - "score": 0.6136744522754057 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)", - "tab": "General information", - "score": 514.5277777777778 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.436 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.29952496832067316 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.348436891789339 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": 
"min=0.92, mean=0.92, max=0.92, sum=1.841 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.29732529915387357 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.517, mean=445.517, max=445.517, sum=891.035 (2)", - "tab": "General information", - "score": 445.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 (2)", - "tab": "Efficiency", - "score": 0.32124968609177923 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.018, mean=343.018, max=343.018, sum=686.036 (2)", - "tab": "General information", - "score": 343.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.801 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.554 (2)", - "tab": "Efficiency", - "score": 0.27723441068191973 - }, - "World 
Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=274.52, mean=274.52, max=274.52, sum=549.041 (2)", - "tab": "General information", - "score": 274.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json b/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json deleted file mode 100644 index faf8ae128..000000000 --- a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.3-70b-instruct-turbo/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.3 Instruct Turbo 70B", - "id": "meta/llama-3.3-70b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.441, mean=0.791, max=0.984, sum=90.129 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.303, mean=0.345, max=0.559, sum=39.355 (114)", - "tab": "Efficiency", - "score": 0.34521783642237874 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General 
information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)", - "tab": "General information", - "score": 614.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - 
"mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.626 (2)", - "tab": "Efficiency", - "score": 0.3131356716156006 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.43, mean=373.43, max=373.43, sum=746.86 (2)", - "tab": "General information", - "score": 373.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.3432198400850649 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.874, mean=353.874, max=353.874, sum=707.748 (2)", - "tab": "General information", - "score": 353.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.039 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.717 (2)", - "tab": "Efficiency", - "score": 0.35871645450592043 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.732 (2)", - "tab": "Efficiency", - "score": 0.36611984339025283 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.701 (2)", - "tab": "Efficiency", - "score": 0.3503202319145203 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.675 (2)", - "tab": "Efficiency", - "score": 0.33748736619949343 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.3367649737121053 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.615 (2)", - "tab": "Efficiency", - "score": 0.30743202976152006 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)", - "tab": "General information", - "score": 549.28 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.875, mean=473.875, max=473.875, sum=947.75 (2)", - "tab": "General information", - "score": 473.875 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.29, 
mean=828.29, max=828.29, sum=1656.58 (2)", - "tab": "General information", - "score": 828.29 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)", - "tab": "General information", - "score": 594.51 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)", - "tab": "General information", - "score": 502.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)", - "tab": "General information", - "score": 503.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Efficiency", - "score": 0.33975651502609255 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 
(2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.51, mean=378.51, max=378.51, sum=757.02 (2)", - "tab": "General information", - "score": 378.51 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719, - "details": { - "description": "min=0.719, mean=0.719, max=0.719, sum=1.439 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.34139270113225567 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)", - "tab": "General information", - "score": 614.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.34327178478240966 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, 
- "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.659 (2)", - "tab": "Efficiency", - "score": 0.32968640327453613 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.63, mean=394.63, max=394.63, sum=789.259 (2)", - "tab": "General information", - "score": 394.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.659 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 (2)", - "tab": "Efficiency", - "score": 0.32124289515700755 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, 
sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.69 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.733 (2)", - "tab": "Efficiency", - "score": 0.36657266932375293 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Efficiency", - "score": 0.33986637440133605 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.3858062526237856 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.668 (2)", - "tab": "Efficiency", - "score": 0.33390796184539795 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)", - "tab": "General information", - "score": 1094.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)", - "tab": "General information", - "score": 658.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)", - "tab": "General information", - "score": 1637.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)", - "tab": "General information", - "score": 575.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.34171419143676757 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.888, - "details": { - "description": "min=0.888, mean=0.888, max=0.888, sum=1.776 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.657 (2)", - "tab": "Efficiency", - "score": 0.3287427550867984 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)", - "tab": "General information", - "score": 579.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)", - "tab": "Efficiency", - "score": 0.327047655582428 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, 
max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.3435286764828664 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.928, mean=397.928, max=397.928, sum=795.857 (2)", - "tab": "General information", - "score": 397.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.821, - "details": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.643 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.667 (2)", - "tab": "Efficiency", - "score": 0.33338003361478763 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.834, mean=304.834, max=304.834, sum=609.668 (2)", - "tab": "General information", - "score": 304.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.49 (2)", - "tab": "Accuracy", - "Electrical Engineering - 
Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.709 (2)", - "tab": "Efficiency", - "score": 0.35425889245395004 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=435.607, mean=435.607, max=435.607, sum=871.214 (2)", - "tab": "General information", - "score": 435.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672, - "details": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.344 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.669 (2)", - "tab": "Efficiency", - "score": 0.33447367299801456 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)", - "tab": "General information", - "score": 531.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.349 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { 
- "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.349764451148018 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)", - "tab": "General information", - "score": 601.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=1.814 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.34841231069257184 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.65 (2)", - "tab": "Efficiency", - "score": 0.3249026636771968 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.752 (2)", - "tab": "Efficiency", - "score": 0.3761155128479004 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.559, mean=0.559, max=0.559, sum=1.118 (2)", - "tab": "Efficiency", - "score": 0.558924115787853 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.606 (2)", - "tab": "Efficiency", - "score": 0.30311920907762313 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.639 (2)", - "tab": "Efficiency", - "score": 0.3192925144353679 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.3212899880531507 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.661 (2)", - "tab": "Efficiency", - "score": 0.3307388570573595 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.663 (2)", - "tab": "Efficiency", - "score": 0.3317271210566288 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Efficiency", - "score": 0.34023177229016033 - }, - "High School Psychology 
- Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.3273837903224 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.718 (2)", - "tab": "Efficiency", - "score": 0.359178250586545 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.443670579031402 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.764 (2)", - "tab": "Efficiency", - "score": 0.3818797411294929 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)", - "tab": "General information", - "score": 513.6709677419354 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.704, mean=496.704, max=496.704, sum=993.409 (2)", - "tab": "General information", - "score": 496.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School 
European History - # prompt tokens": { - "description": "min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)", - "tab": "General information", - "score": 2797.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.035, mean=372.035, max=372.035, sum=744.071 (2)", - "tab": "General information", - "score": 372.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.908, mean=370.908, max=370.908, sum=741.815 (2)", - "tab": "General information", - "score": 370.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)", - "tab": "General information", - "score": 532.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.013, mean=399.013, max=399.013, sum=798.025 (2)", - "tab": "General information", - "score": 399.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)", - "tab": "General information", - "score": 560.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.242, mean=495.242, max=495.242, sum=990.484 (2)", - "tab": "General information", - "score": 495.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)", - "tab": "General information", - "score": 795.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)", - "tab": "General information", - "score": 1428.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.71 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.691 (2)", - "tab": "Efficiency", - "score": 0.3452627787140987 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.692 (2)", - "tab": "Efficiency", - "score": 0.34599654183132955 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.888, mean=319.888, max=319.888, sum=639.776 (2)", - "tab": "General information", - "score": 319.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt 
tokens": { - "description": "min=341.168, mean=341.168, max=341.168, sum=682.336 (2)", - "tab": "General information", - "score": 341.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.769 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.741 (2)", - "tab": "Efficiency", - "score": 0.3704575231252623 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)", - "tab": "General information", - "score": 639.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.632 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.613 (2)", - "tab": "Efficiency", - "score": 0.30655721506458117 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.564, mean=449.564, max=449.564, sum=899.129 (2)", - "tab": "General 
information", - "score": 449.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714, - "details": { - "description": "min=0.714, mean=0.714, max=0.714, sum=1.429 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.75 (2)", - "tab": "Efficiency", - "score": 0.3751111796924046 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.3368335811837206 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.786, mean=283.786, max=283.786, sum=567.573 (2)", - "tab": "General information", - "score": 283.7864077669903 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.927, mean=0.927, max=0.927, sum=1.855 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.64 (2)", - "tab": "Efficiency", - "score": 0.320215484015962 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)", - "tab": "Efficiency", - "score": 0.3268785071372986 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.99, mean=340.99, max=340.99, sum=681.98 (2)", - "tab": "General information", - "score": 340.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - 
{ - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.829 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.641 (2)", - "tab": "Efficiency", - "score": 0.32054392161801704 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.911, mean=299.911, max=299.911, sum=599.821 (2)", - "tab": "General information", - "score": 299.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.698, mean=0.698, max=0.698, sum=1.397 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.644 (2)", - "tab": "Efficiency", - "score": 0.321929149544997 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.3511003518237748 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.113, mean=476.113, max=476.113, sum=952.225 (2)", - "tab": "General information", - "score": 476.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.765 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.711 (2)", - "tab": "Efficiency", - "score": 0.35563821730270884 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)", - "tab": "General information", - "score": 586.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.895, - "details": { - "description": "min=0.895, mean=0.895, max=0.895, sum=1.79 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.685 (2)", - "tab": "Efficiency", - "score": 0.34269326410175843 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 
0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)", - "tab": "General information", - "score": 514.5277777777778 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.455 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.34484653039412066 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.69 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.737 (2)", - "tab": "Efficiency", - "score": 0.3686914687253991 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General 
information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.841 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.3236708546159279 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.517, mean=445.517, max=445.517, sum=891.035 (2)", - "tab": "General information", - "score": 445.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.3235311522541276 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.018, mean=343.018, max=343.018, sum=686.036 (2)", - "tab": "General information", - "score": 343.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - 
"subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.766 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.606 (2)", - "tab": "Efficiency", - "score": 0.30298223132975616 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=274.52, mean=274.52, max=274.52, sum=549.041 (2)", - "tab": "General information", - "score": 274.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json b/data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json deleted file mode 100644 index 95bd9f1b8..000000000 --- a/data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/microsoft_phi-2/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-2", - "id": "microsoft/phi-2", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": 
{ - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.231, mean=0.584, max=0.833, sum=66.604 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.267, mean=0.309, max=0.409, sum=35.222 (114)", - "tab": "Efficiency", - "score": 0.3089648339000309 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=2.945, mean=4.946, max=5, sum=563.886 (114)", - "tab": "General information", - "score": 4.946365736553069 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=277.404, mean=600.9, max=1826.103, sum=68502.623 (114)", - "tab": "General information", - "score": 600.9002028338741 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - 
"mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31, - "details": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.62 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.585 (2)", - "tab": "Efficiency", - "score": 0.2925554180145264 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=371.38, mean=371.38, max=371.38, sum=742.76 (2)", - "tab": "General information", - "score": 371.38 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437, - "details": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.874 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.675 (2)", - "tab": "Efficiency", - "score": 0.3375302138151946 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", 
- "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=372.081, mean=372.081, max=372.081, sum=744.163 (2)", - "tab": "General information", - "score": 372.0814814814815 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382, - "details": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.765 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.539 (2)", - "tab": "Efficiency", - "score": 0.2696530842781067 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.604 (2)", - "tab": "Efficiency", - "score": 0.3021910654173957 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.34874132156372073 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Efficiency", - "score": 0.3188008284568787 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.30374339412402557 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.64 (2)", - "tab": "Efficiency", - "score": 0.31993647182688995 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=545.4, mean=545.4, max=545.4, sum=1090.8 (2)", - "tab": "General information", - "score": 545.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.278, 
mean=482.278, max=482.278, sum=964.556 (2)", - "tab": "General information", - "score": 482.27777777777777 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=852.03, mean=852.03, max=852.03, sum=1704.06 (2)", - "tab": "General information", - "score": 852.03 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=611.54, mean=611.54, max=611.54, sum=1223.08 (2)", - "tab": "General information", - "score": 611.54 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=530.301, mean=530.301, max=530.301, sum=1060.601 (2)", - "tab": "General information", - "score": 530.3005780346821 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=489.324, mean=489.324, max=489.324, sum=978.647 (2)", - "tab": "General information", - "score": 489.3235294117647 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.554 (2)", - "tab": "Efficiency", - "score": 0.2771985101699829 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=387.4, mean=387.4, max=387.4, sum=774.8 (2)", - "tab": "General information", - "score": 387.4 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342, - "details": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.589 (2)", - "tab": "Efficiency", - "score": 0.294714699711716 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=624.07, mean=624.07, max=624.07, sum=1248.14 (2)", - "tab": "General information", - "score": 624.0701754385965 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35, - "details": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.631 (2)", - "tab": "Efficiency", - "score": 0.3154014134407043 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=398.42, mean=398.42, max=398.42, sum=796.84 (2)", - "tab": "General information", - "score": 398.42 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.389 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.562 (2)", - "tab": "Efficiency", - "score": 0.28103237681918675 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=418.722, mean=418.722, max=418.722, sum=837.444 (2)", - "tab": "General information", - "score": 418.72222222222223 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598, - "details": { - "description": "min=0.598, mean=0.598, max=0.598, sum=1.196 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.597 (2)", - "tab": "Efficiency", - "score": 0.29847138410979146 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=353.711, mean=353.711, max=353.711, sum=707.421 (2)", - "tab": "General information", - "score": 353.7106109324759 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - "details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.144 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.3051472201066859 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.3096669819338102 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.363, mean=0.363, max=0.363, sum=0.727 (2)", - "tab": "Efficiency", - "score": 0.36331592731401224 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Efficiency", - "score": 0.30723563518399505 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1118.287, mean=1118.287, max=1118.287, sum=2236.574 (2)", - "tab": "General information", - "score": 1118.2867647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General 
information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=660.72, mean=660.72, max=660.72, sum=1321.44 (2)", - "tab": "General information", - "score": 660.7198581560284 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=4.997, mean=4.997, max=4.997, sum=9.995 (2)", - "tab": "General information", - "score": 4.9973924380704045 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1657.596, mean=1657.596, max=1657.596, sum=3315.192 (2)", - "tab": "General information", - "score": 1657.5958279009126 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=597.574, mean=597.574, max=597.574, sum=1195.147 (2)", - "tab": "General information", - "score": 597.5735294117648 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.584 (2)", - "tab": "Efficiency", - "score": 0.2921306538581848 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { 
- "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=433.12, mean=433.12, max=433.12, sum=866.24 (2)", - "tab": "General information", - "score": 433.12 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605, - "details": { - "description": "min=0.605, mean=0.605, max=0.605, sum=1.211 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.2971143110802299 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=600.112, mean=600.112, max=600.112, sum=1200.224 (2)", - "tab": "General information", - "score": 600.1118421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.18 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.666 (2)", - "tab": "Efficiency", - "score": 0.33283984184265136 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=589.43, mean=589.43, max=589.43, sum=1178.86 
(2)", - "tab": "General information", - "score": 589.43 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.619, - "details": { - "description": "min=0.619, mean=0.619, max=0.619, sum=1.238 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.3039509620306627 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=423.925, mean=423.925, max=423.925, sum=847.849 (2)", - "tab": "General information", - "score": 423.92452830188677 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.038 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.30905701251740153 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=313.723, mean=313.723, max=313.723, sum=627.447 (2)", - "tab": "General information", - "score": 313.72340425531917 - }, - "Conceptual Physics - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.545, - "details": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.09 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.639 (2)", - "tab": "Efficiency", - "score": 0.31939958375075767 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=430.345, mean=430.345, max=430.345, sum=860.69 (2)", - "tab": "General information", - "score": 430.3448275862069 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463, - "details": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.926 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.30370362284322266 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=506.09, mean=506.09, max=506.09, sum=1012.18 (2)", - "tab": "General information", - "score": 506.0899470899471 - }, - "Elementary Mathematics - # output tokens": { 
- "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.389, - "details": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.778 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 (2)", - "tab": "Efficiency", - "score": 0.3209871034773569 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=641, mean=641, max=641, sum=1282 (2)", - "tab": "General information", - "score": 641.0 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.557 (2)", - "tab": "Efficiency", - "score": 0.2785434192226779 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.3082333773814986 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)", - "tab": "Efficiency", - "score": 0.3267984962463379 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.819 (2)", - "tab": "Efficiency", - "score": 0.40945722406560725 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.30513872763123173 - }, - "High School Government And Politics - Observed inference 
time (s)": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.56 (2)", - "tab": "Efficiency", - "score": 0.2802187642902908 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.782 (2)", - "tab": "Efficiency", - "score": 0.3909576538281563 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.30405007821542246 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.274, mean=0.274, max=0.274, sum=0.548 (2)", - "tab": "Efficiency", - "score": 0.2737702652185905 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.30272982452089425 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.609 (2)", - "tab": "Efficiency", - "score": 0.30458581688207226 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.629 (2)", - "tab": "Efficiency", - "score": 0.3143394479045161 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.759 (2)", - "tab": "Efficiency", - "score": 0.37960049802181767 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.729 (2)", - "tab": "Efficiency", - "score": 0.36470460791125076 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=540.748, mean=540.748, max=540.748, sum=1081.497 (2)", - "tab": "General information", - "score": 540.7483870967742 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=495.645, mean=495.645, max=495.645, sum=991.291 (2)", - "tab": "General information", - "score": 495.6453201970443 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=894.78, mean=894.78, max=894.78, sum=1789.56 (2)", - "tab": "General information", - "score": 894.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=2.945, mean=2.945, max=2.945, sum=5.891 (2)", - "tab": "General information", - "score": 2.9454545454545453 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=1826.103, mean=1826.103, max=1826.103, sum=3652.206 (2)", - "tab": "General information", - "score": 1826.1030303030302 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=397.646, mean=397.646, max=397.646, sum=795.293 (2)", - "tab": "General information", - "score": 397.64646464646466 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=478.073, mean=478.073, max=478.073, sum=956.145 (2)", - "tab": "General information", - "score": 478.07253886010363 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=391.931, mean=391.931, max=391.931, sum=783.862 (2)", - "tab": "General information", - "score": 
391.9307692307692 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=526.352, mean=526.352, max=526.352, sum=1052.704 (2)", - "tab": "General information", - "score": 526.3518518518518 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=410.937, mean=410.937, max=410.937, sum=821.874 (2)", - "tab": "General information", - "score": 410.93697478991595 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.669, mean=553.669, max=553.669, sum=1107.338 (2)", - "tab": "General information", - "score": 553.6688741721854 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=516.842, mean=516.842, max=516.842, sum=1033.684 (2)", - "tab": "General information", - "score": 516.8422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=805, mean=805, max=805, sum=1610 (2)", - "tab": "General information", - "score": 805.0 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=4, mean=4, max=4, sum=8 (2)", - "tab": "General information", - "score": 4.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=1756.25, mean=1756.25, max=1756.25, sum=3512.5 (2)", - "tab": "General information", - "score": 1756.25 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1438.561, mean=1438.561, max=1438.561, sum=2877.122 (2)", - "tab": "General information", - "score": 1438.5611814345991 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.2979412987627791 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.30250649051811856 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, 
- "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=324.48, mean=324.48, max=324.48, sum=648.96 (2)", - "tab": "General information", - "score": 324.47982062780267 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=357.626, mean=357.626, max=357.626, sum=715.252 (2)", - "tab": "General information", - "score": 357.62595419847327 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.504 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Efficiency", - "score": 0.30694435647696505 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.843, mean=639.843, max=639.843, sum=1279.686 (2)", - "tab": "General information", - "score": 639.8429752066115 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.767, - "details": { - "description": "min=0.767, mean=0.767, max=0.767, sum=1.534 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.274, mean=0.274, max=0.274, sum=0.548 (2)", - "tab": "Efficiency", - "score": 0.273789843167264 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=454.233, mean=454.233, max=454.233, sum=908.466 (2)", - "tab": "General information", - "score": 454.23312883435585 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.627 (2)", - "tab": "Efficiency", - "score": 0.31332691439560484 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=671.598, mean=671.598, max=671.598, sum=1343.196 (2)", - "tab": "General information", - "score": 671.5982142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.748, mean=0.748, max=0.748, sum=1.495 
(2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.3051937992132983 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=292.34, mean=292.34, max=292.34, sum=584.68 (2)", - "tab": "General information", - "score": 292.3398058252427 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.552 (2)", - "tab": "Efficiency", - "score": 0.2761631949335082 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=437.667, mean=437.667, max=437.667, sum=875.333 (2)", - "tab": "General information", - "score": 437.6666666666667 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62, - "details": { - "description": "min=0.62, mean=0.62, max=0.62, sum=1.24 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.609 (2)", - "tab": "Efficiency", - "score": 0.3045226716995239 - }, - "Medical Genetics - # eval": { - 
"description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=352.71, mean=352.71, max=352.71, sum=705.42 (2)", - "tab": "General information", - "score": 352.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688, - "details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.377 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.668 (2)", - "tab": "Efficiency", - "score": 0.33387171049836645 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=314.847, mean=314.847, max=314.847, sum=629.693 (2)", - "tab": "General information", - "score": 314.84674329501917 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.231, - "details": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.463 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.3032567480395984 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.534 (2)", - "tab": "Efficiency", - "score": 0.26702385215119945 - }, - "Moral Disputes - # 
eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=497.514, mean=497.514, max=497.514, sum=995.029 (2)", - "tab": "General information", - "score": 497.514450867052 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=664.479, mean=664.479, max=664.479, sum=1328.959 (2)", - "tab": "General information", - "score": 664.4793296089385 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627, - "details": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.255 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.3112297058105469 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=584.69, mean=584.69, max=584.69, sum=1169.379 (2)", - "tab": "General information", - "score": 584.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605, - "details": { - "description": "min=0.605, mean=0.605, max=0.605, sum=1.21 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.583 (2)", - "tab": "Efficiency", - "score": 0.29145334090715574 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=524.454, mean=524.454, max=524.454, sum=1048.907 (2)", - "tab": "General information", - "score": 524.4537037037037 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.345 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.564 (2)", - "tab": "Efficiency", - "score": 0.28212652423165063 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=420.609, mean=420.609, max=420.609, sum=841.218 (2)", - "tab": "General information", - "score": 420.6090909090909 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM 
on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.3223595599738919 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1196.433, mean=1196.433, max=1196.433, sum=2392.865 (2)", - "tab": "General information", - "score": 1196.4326530612245 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.632 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.2876073993853669 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=446.512, mean=446.512, max=446.512, sum=893.025 (2)", - "tab": "General information", - "score": 446.5124378109453 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47, - "details": { - "description": "min=0.47, 
mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.559 (2)", - "tab": "Efficiency", - "score": 0.27966123316661423 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=352.759, mean=352.759, max=352.759, sum=705.518 (2)", - "tab": "General information", - "score": 352.7590361445783 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.569 (2)", - "tab": "Efficiency", - "score": 0.2843696499428554 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=277.404, mean=277.404, max=277.404, sum=554.807 (2)", - "tab": "General information", - "score": 277.4035087719298 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json b/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json deleted file mode 100644 index f1d62a268..000000000 --- a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/microsoft_phi-3-medium-4k-instruct/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3 14B", - "id": "microsoft/phi-3-medium-4k-instruct", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.5, mean=0.775, max=0.969, sum=88.295 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=2.025, mean=4.948, max=22.342, sum=564.095 (114)", - "tab": "Efficiency", - "score": 4.948199983258553 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=313.474, mean=714.893, max=3168.636, sum=81497.749 (114)", - "tab": "General information", - "score": 714.8925389546507 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - 
"moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=2.63, mean=2.63, max=2.63, sum=5.26 (2)", - "tab": "Efficiency", - "score": 2.63020414352417 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.65, mean=397.65, max=397.65, sum=795.3 (2)", - "tab": "General information", - "score": 397.65 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719, - "details": { - "description": "min=0.719, mean=0.719, max=0.719, sum=1.437 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=3.025, mean=3.025, max=3.025, sum=6.051 (2)", - "tab": "Efficiency", - "score": 3.0252625394750523 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=418.133, mean=418.133, max=418.133, sum=836.267 (2)", - "tab": "General information", - "score": 418.1333333333333 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529, - "details": { - "description": "min=0.529, mean=0.529, max=0.529, sum=1.059 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=3.886, mean=3.886, max=3.886, sum=7.772 (2)", - "tab": "Efficiency", - "score": 3.886199688911438 - }, - "College Biology - Observed inference time (s)": { - "description": "min=4.073, mean=4.073, max=4.073, sum=8.146 (2)", - "tab": "Efficiency", - "score": 4.072841899262534 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=6.237, mean=6.237, max=6.237, sum=12.473 (2)", - "tab": "Efficiency", - "score": 6.236730601787567 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=4.541, mean=4.541, max=4.541, sum=9.083 (2)", - "tab": "Efficiency", - "score": 4.541367738246918 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=4.259, mean=4.259, max=4.259, sum=8.518 (2)", - "tab": "Efficiency", - "score": 4.259122938089977 - }, - "College Physics - Observed inference time (s)": { - "description": "min=3.966, mean=3.966, max=3.966, sum=7.933 (2)", - "tab": "Efficiency", - "score": 3.966460019934411 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 
100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=622.43, mean=622.43, max=622.43, sum=1244.86 (2)", - "tab": "General information", - "score": 622.43 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=562.632, mean=562.632, max=562.632, sum=1125.264 (2)", - "tab": "General information", - "score": 562.6319444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=910.14, mean=910.14, max=910.14, sum=1820.28 (2)", - "tab": "General information", - "score": 910.14 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=655.96, mean=655.96, max=655.96, sum=1311.92 (2)", - "tab": "General information", - "score": 655.96 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=617.671, mean=617.671, max=617.671, sum=1235.341 (2)", - "tab": "General information", - "score": 617.6705202312139 - }, - "College Medicine - # output tokens": { 
- "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=560.873, mean=560.873, max=560.873, sum=1121.745 (2)", - "tab": "General information", - "score": 560.8725490196078 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=2.748, mean=2.748, max=2.748, sum=5.496 (2)", - "tab": "Efficiency", - "score": 2.7481748914718627 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=428.17, mean=428.17, max=428.17, sum=856.34 (2)", - "tab": "General information", - "score": 428.17 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.228 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=4.32, mean=4.32, max=4.32, sum=8.639 (2)", - "tab": "Efficiency", - "score": 4.319587314338015 - }, - "Econometrics - # eval": { - "description": 
"min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=684.675, mean=684.675, max=684.675, sum=1369.351 (2)", - "tab": "General information", - "score": 684.6754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=3.374, mean=3.374, max=3.374, sum=6.747 (2)", - "tab": "Efficiency", - "score": 3.373600058555603 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=493.54, mean=493.54, max=493.54, sum=987.08 (2)", - "tab": "General information", - "score": 493.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=3.225, mean=3.225, max=3.225, sum=6.45 (2)", - "tab": "Efficiency", - "score": 3.2251307015065795 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=458.898, mean=458.898, max=458.898, sum=917.796 (2)", - "tab": "General information", - "score": 458.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=2.591, mean=2.591, max=2.591, sum=5.182 (2)", - "tab": "Efficiency", - "score": 2.591215438781444 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=381.122, mean=381.122, max=381.122, sum=762.244 (2)", - "tab": "General information", - "score": 381.12218649517683 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=9.39, mean=9.39, max=9.39, sum=18.781 (2)", - "tab": "Efficiency", - "score": 9.390463957015205 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=5.784, mean=5.784, max=5.784, sum=11.567 (2)", - "tab": "Efficiency", - "score": 5.7837115450108305 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=13.198, mean=13.198, max=13.198, sum=26.396 (2)", - "tab": "Efficiency", - "score": 13.198108883849024 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=4.667, mean=4.667, 
max=4.667, sum=9.335 (2)", - "tab": "Efficiency", - "score": 4.667331269753524 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1339.647, mean=1339.647, max=1339.647, sum=2679.294 (2)", - "tab": "General information", - "score": 1339.6470588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=832.277, mean=832.277, max=832.277, sum=1664.553 (2)", - "tab": "General information", - "score": 832.2765957446809 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1924.007, mean=1924.007, max=1924.007, sum=3848.014 (2)", - "tab": "General information", - "score": 1924.0071707953064 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=659.078, mean=659.078, max=659.078, sum=1318.157 (2)", - "tab": "General information", - "score": 659.0784313725491 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=2.982, mean=2.982, max=2.982, sum=5.964 (2)", - "tab": "Efficiency", - "score": 2.98179637670517 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=479.81, mean=479.81, max=479.81, sum=959.62 (2)", - "tab": "General information", - "score": 479.81 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.697 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=4.875, mean=4.875, max=4.875, sum=9.749 (2)", - "tab": "Efficiency", - "score": 4.874531077711206 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=690.079, mean=690.079, max=690.079, sum=1380.158 (2)", - "tab": "General information", - "score": 690.078947368421 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business 
Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=4.78, mean=4.78, max=4.78, sum=9.559 (2)", - "tab": "Efficiency", - "score": 4.779508647918701 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=683.44, mean=683.44, max=683.44, sum=1366.88 (2)", - "tab": "General information", - "score": 683.44 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=3.474, mean=3.474, max=3.474, sum=6.948 (2)", - "tab": "Efficiency", - "score": 3.474059367629717 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=496.374, mean=496.374, max=496.374, sum=992.747 (2)", - "tab": "General information", - "score": 496.3735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.617 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=2.448, mean=2.448, max=2.448, sum=4.896 (2)", - "tab": "Efficiency", - "score": 2.448020648956299 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=342.153, mean=342.153, max=342.153, sum=684.306 (2)", - "tab": "General information", - "score": 342.1531914893617 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.683, mean=0.683, max=0.683, sum=1.366 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=3.495, mean=3.495, max=3.495, sum=6.99 (2)", - "tab": "Efficiency", - "score": 3.4950728284901587 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=506.779, mean=506.779, max=506.779, sum=1013.559 (2)", - "tab": "General information", - "score": 506.7793103448276 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - 
"description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=4.344, mean=4.344, max=4.344, sum=8.688 (2)", - "tab": "Efficiency", - "score": 4.344110502137078 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=618.156, mean=618.156, max=618.156, sum=1236.312 (2)", - "tab": "General information", - "score": 618.1560846560847 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.587, - "details": { - "description": "min=0.587, mean=0.587, max=0.587, sum=1.175 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=4.988, mean=4.988, max=4.988, sum=9.977 (2)", - "tab": "Efficiency", - "score": 4.988478910355341 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=700.81, mean=700.81, max=700.81, sum=1401.619 (2)", - "tab": "General information", - "score": 700.8095238095239 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "High School 
Biology - Observed inference time (s)": { - "description": "min=4.253, mean=4.253, max=4.253, sum=8.506 (2)", - "tab": "Efficiency", - "score": 4.253153976317375 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=4.116, mean=4.116, max=4.116, sum=8.232 (2)", - "tab": "Efficiency", - "score": 4.115784048446881 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=6.919, mean=6.919, max=6.919, sum=13.839 (2)", - "tab": "Efficiency", - "score": 6.919438579082489 - }, - "High School European History - Observed inference time (s)": { - "description": "min=22.342, mean=22.342, max=22.342, sum=44.684 (2)", - "tab": "Efficiency", - "score": 22.341962937152747 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=3.01, mean=3.01, max=3.01, sum=6.02 (2)", - "tab": "Efficiency", - "score": 3.010115607820376 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=3.784, mean=3.784, max=3.784, sum=7.567 (2)", - "tab": "Efficiency", - "score": 3.783631190117159 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=3.202, mean=3.202, max=3.202, sum=6.403 (2)", - "tab": "Efficiency", - "score": 3.2015056090477185 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=4.126, mean=4.126, max=4.126, sum=8.251 (2)", - "tab": "Efficiency", - "score": 4.125549591912163 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=3.125, mean=3.125, max=3.125, sum=6.249 (2)", - "tab": "Efficiency", - "score": 3.124516798668549 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=3.882, mean=3.882, max=3.882, sum=7.765 (2)", - "tab": "Efficiency", - "score": 3.88235890154807 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=4.036, mean=4.036, max=4.036, sum=8.072 (2)", - "tab": "Efficiency", - "score": 4.035925890108861 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=6.294, mean=6.294, max=6.294, sum=12.587 (2)", - "tab": "Efficiency", - "score": 6.293625408852542 - }, - "High School US History - Observed inference time (s)": { - "description": "min=17.94, mean=17.94, max=17.94, sum=35.88 (2)", - "tab": "Efficiency", - "score": 17.93984198219636 - }, - "High School World History - Observed inference time (s)": { - "description": "min=11.445, mean=11.445, max=11.445, sum=22.889 (2)", - "tab": "Efficiency", - "score": 11.444628432833193 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=605.894, mean=605.894, max=605.894, sum=1211.787 (2)", - "tab": "General information", - "score": 605.8935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School 
Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=577.665, mean=577.665, max=577.665, sum=1155.33 (2)", - "tab": "General information", - "score": 577.6650246305419 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=997.57, mean=997.57, max=997.57, sum=1995.14 (2)", - "tab": "General information", - "score": 997.57 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3168.636, mean=3168.636, max=3168.636, sum=6337.273 (2)", - "tab": "General information", - "score": 3168.6363636363635 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=445.657, mean=445.657, max=445.657, sum=891.313 (2)", - "tab": "General information", - "score": 445.65656565656565 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt 
tokens": { - "description": "min=536.927, mean=536.927, max=536.927, sum=1073.855 (2)", - "tab": "General information", - "score": 536.9274611398964 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=454.662, mean=454.662, max=454.662, sum=909.323 (2)", - "tab": "General information", - "score": 454.66153846153844 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=588.181, mean=588.181, max=588.181, sum=1176.363 (2)", - "tab": "General information", - "score": 588.1814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=458.492, mean=458.492, max=458.492, sum=916.983 (2)", - "tab": "General information", - "score": 458.49159663865544 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=630.788, mean=630.788, max=630.788, sum=1261.576 (2)", - "tab": "General information", - "score": 630.7880794701987 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": 
"min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=594.919, mean=594.919, max=594.919, sum=1189.839 (2)", - "tab": "General information", - "score": 594.9192660550459 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=917.208, mean=917.208, max=917.208, sum=1834.417 (2)", - "tab": "General information", - "score": 917.2083333333334 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2544.324, mean=2544.324, max=2544.324, sum=5088.647 (2)", - "tab": "General information", - "score": 2544.323529411765 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1647.219, mean=1647.219, max=1647.219, sum=3294.439 (2)", - "tab": "General information", - "score": 1647.2194092827003 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=1.725 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=2.578, mean=2.578, max=2.578, sum=5.157 (2)", - "tab": "Efficiency", - "score": 2.5783249647628033 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=2.963, mean=2.963, max=2.963, sum=5.925 (2)", - "tab": "Efficiency", - "score": 2.9625705234877024 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=370.26, mean=370.26, max=370.26, sum=740.52 (2)", - "tab": "General information", - "score": 370.26008968609864 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=412.382, mean=412.382, max=412.382, sum=824.763 (2)", - "tab": "General information", - "score": 412.381679389313 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=5.179, mean=5.179, max=5.179, sum=10.357 (2)", - "tab": "Efficiency", - "score": 5.1785316802253405 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=738.463, mean=738.463, max=738.463, sum=1476.926 (2)", - "tab": "General information", - "score": 738.4628099173553 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.656 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=3.522, mean=3.522, max=3.522, sum=7.045 (2)", - "tab": "Efficiency", - "score": 3.5224247461447686 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=511.755, mean=511.755, max=511.755, sum=1023.509 (2)", - "tab": "General information", - "score": 511.7546012269939 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.393 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=5.118, mean=5.118, max=5.118, sum=10.237 (2)", - "tab": "Efficiency", - "score": 5.118442311882973 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 
0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=739.402, mean=739.402, max=739.402, sum=1478.804 (2)", - "tab": "General information", - "score": 739.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=2.27, mean=2.27, max=2.27, sum=4.539 (2)", - "tab": "Efficiency", - "score": 2.2697336812621183 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=324.777, mean=324.777, max=324.777, sum=649.553 (2)", - "tab": "General information", - "score": 324.77669902912623 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.919, - "details": { - "description": "min=0.919, mean=0.919, max=0.919, sum=1.838 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=3.394, mean=3.394, max=3.394, sum=6.788 (2)", - "tab": "Efficiency", - "score": 3.3940892515019474 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=481.628, mean=481.628, max=481.628, sum=963.256 (2)", - "tab": "General information", - "score": 481.62820512820514 - }, - "Marketing - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=2.894, mean=2.894, max=2.894, sum=5.787 (2)", - "tab": "Efficiency", - "score": 2.893650698661804 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=417.14, mean=417.14, max=417.14, sum=834.28 (2)", - "tab": "General information", - "score": 417.14 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.788 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=2.025, mean=2.025, max=2.025, sum=4.05 (2)", - "tab": "Efficiency", - "score": 2.0249771478075633 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=354.913, mean=354.913, max=354.913, sum=709.826 (2)", - "tab": "General information", - "score": 354.9131545338442 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.639, - "details": { - "description": "min=0.639, mean=0.639, max=0.639, sum=1.278 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=3.877, mean=3.877, max=3.877, sum=7.754 (2)", - "tab": "Efficiency", - "score": 3.877226921175257 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=5.17, mean=5.17, max=5.17, sum=10.34 (2)", - "tab": "Efficiency", - "score": 5.170224364509796 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=551.506, mean=551.506, max=551.506, sum=1103.012 (2)", - "tab": "General information", - "score": 551.5057803468208 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=765.479, mean=765.479, max=765.479, sum=1530.959 (2)", - "tab": "General information", - "score": 765.4793296089385 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.837, - "details": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.673 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=4.962, mean=4.962, max=4.962, sum=9.923 (2)", - "tab": "Efficiency", - "score": 4.961673566718507 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, 
max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=704.922, mean=704.922, max=704.922, sum=1409.843 (2)", - "tab": "General information", - "score": 704.9215686274509 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.867, - "details": { - "description": "min=0.867, mean=0.867, max=0.867, sum=1.735 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=4.39, mean=4.39, max=4.39, sum=8.779 (2)", - "tab": "Efficiency", - "score": 4.389729757367829 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=628.185, mean=628.185, max=628.185, sum=1256.37 (2)", - "tab": "General information", - "score": 628.1851851851852 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=3.474, mean=3.474, max=3.474, sum=6.948 (2)", - "tab": "Efficiency", - "score": 3.4741735740141437 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public 
Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=483.827, mean=483.827, max=483.827, sum=967.655 (2)", - "tab": "General information", - "score": 483.8272727272727 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.657 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=9.808, mean=9.808, max=9.808, sum=19.616 (2)", - "tab": "Efficiency", - "score": 9.807938383063492 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1386.531, mean=1386.531, max=1386.531, sum=2773.061 (2)", - "tab": "General information", - "score": 1386.530612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.891, - "details": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=3.643, mean=3.643, max=3.643, sum=7.285 (2)", - "tab": "Efficiency", - "score": 3.642500052997722 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { 
- "description": "min=517.478, mean=517.478, max=517.478, sum=1034.955 (2)", - "tab": "General information", - "score": 517.4776119402985 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.108 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=2.911, mean=2.911, max=2.911, sum=5.822 (2)", - "tab": "Efficiency", - "score": 2.910837286926178 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=414.108, mean=414.108, max=414.108, sum=828.217 (2)", - "tab": "General information", - "score": 414.10843373493975 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=2.097, mean=2.097, max=2.097, sum=4.194 (2)", - "tab": "Efficiency", - "score": 2.0972191897052075 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=313.474, mean=313.474, max=313.474, sum=626.947 (2)", - "tab": "General information", - "score": 313.4736842105263 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 
(2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.015, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json b/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json deleted file mode 100644 index bbe3afca0..000000000 --- a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/microsoft_phi-3-small-8k-instruct/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3 7B", - "id": "microsoft/phi-3-small-8k-instruct", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757, - "details": { - "description": "min=0.44, mean=0.757, max=0.969, sum=86.273 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.226, mean=0.38, max=1.284, sum=43.298 (114)", - "tab": "Efficiency", - "score": 0.379805443442311 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=275.561, mean=614.852, max=2798.073, sum=70093.086 (114)", - "tab": "General information", - "score": 614.851634217556 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - 
"college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44, - "details": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": 
"min=0.505, mean=0.505, max=0.505, sum=1.009 (2)", - "tab": "Efficiency", - "score": 0.5047230005264283 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.44, mean=373.44, max=373.44, sum=746.88 (2)", - "tab": "General information", - "score": 373.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.452 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.825 (2)", - "tab": "Efficiency", - "score": 0.4122970881285491 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.978, mean=353.978, max=353.978, sum=707.956 (2)", - "tab": "General information", - "score": 353.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, mean=0.559, max=0.559, sum=1.118 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.3414782953262329 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": 
"Efficiency", - "score": 0.3002290378014247 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.447, mean=0.447, max=0.447, sum=0.894 (2)", - "tab": "Efficiency", - "score": 0.4468130707740784 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.703 (2)", - "tab": "Efficiency", - "score": 0.35149253606796266 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.646 (2)", - "tab": "Efficiency", - "score": 0.32299859399740405 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.644 (2)", - "tab": "Efficiency", - "score": 0.32188768246594596 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.4, mean=549.4, max=549.4, sum=1098.8 (2)", - "tab": "General information", - "score": 549.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.917, mean=473.917, max=473.917, sum=947.833 (2)", - "tab": "General information", - "score": 473.9166666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)", - "tab": "General information", - "score": 828.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": 
"min=594.52, mean=594.52, max=594.52, sum=1189.04 (2)", - "tab": "General information", - "score": 594.52 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.728, mean=502.728, max=502.728, sum=1005.457 (2)", - "tab": "General information", - "score": 502.728323699422 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.608, mean=503.608, max=503.608, sum=1007.216 (2)", - "tab": "General information", - "score": 503.6078431372549 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.55 (2)", - "tab": "Efficiency", - "score": 0.2747947096824646 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.54, mean=378.54, max=378.54, sum=757.08 (2)", - "tab": "General information", - "score": 378.54 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.362, mean=0.362, max=0.362, sum=0.724 (2)", - "tab": "Efficiency", - "score": 0.36201402178981845 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.43, mean=614.43, max=614.43, sum=1228.86 (2)", - "tab": "General information", - "score": 614.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.018 (2)", - "tab": "Efficiency", - "score": 0.5091006135940552 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.685 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.538 (2)", - "tab": "Efficiency", - "score": 0.2687692134468644 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.639, mean=394.639, max=394.639, sum=789.278 (2)", - "tab": "General information", - "score": 394.6388888888889 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.263, mean=0.263, max=0.263, sum=0.527 (2)", - "tab": "Efficiency", - "score": 0.26347158346145483 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": 
"EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.073 (2)", - "tab": "Efficiency", - "score": 0.5363782968591241 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.746 (2)", - "tab": "Efficiency", - "score": 0.37297873885919014 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.764, mean=0.764, max=0.764, sum=1.527 (2)", - "tab": "Efficiency", - "score": 0.7635687488620564 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.664 (2)", - "tab": "Efficiency", - "score": 0.3322232922697379 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.585, mean=1094.585, max=1094.585, sum=2189.169 (2)", - "tab": "General information", - "score": 1094.5845588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.592, mean=658.592, max=658.592, sum=1317.184 (2)", - "tab": "General information", - "score": 658.5921985815603 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.787, mean=1637.787, max=1637.787, sum=3275.574 (2)", - "tab": "General information", - "score": 1637.7868318122555 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": 
{ - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.114, mean=575.114, max=575.114, sum=1150.229 (2)", - "tab": "General information", - "score": 575.1143790849674 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.549, mean=0.549, max=0.549, sum=1.098 (2)", - "tab": "Efficiency", - "score": 0.5491553211212158 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.697 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.704 (2)", - "tab": "Efficiency", - "score": 0.35213252902030945 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { 
- "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.691, mean=579.691, max=579.691, sum=1159.382 (2)", - "tab": "General information", - "score": 579.6907894736842 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.34657839775085447 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.572 (2)", - "tab": "Efficiency", - "score": 0.2858500345697943 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": 
"min=397.947, mean=397.947, max=397.947, sum=795.894 (2)", - "tab": "General information", - "score": 397.94716981132075 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.557 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.507 (2)", - "tab": "Efficiency", - "score": 0.2537446346688778 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.838, mean=304.838, max=304.838, sum=609.677 (2)", - "tab": "General information", - "score": 304.83829787234043 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.379 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.3010375532610663 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=440.641, mean=440.641, max=440.641, sum=881.283 (2)", - 
"tab": "General information", - "score": 440.6413793103448 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.619, - "details": { - "description": "min=0.619, mean=0.619, max=0.619, sum=1.238 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.676 (2)", - "tab": "Efficiency", - "score": 0.3380681862906804 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.862, mean=531.862, max=531.862, sum=1063.725 (2)", - "tab": "General information", - "score": 531.8624338624338 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595, - "details": { - "description": "min=0.595, mean=0.595, max=0.595, sum=1.19 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.716 (2)", - "tab": "Efficiency", - "score": 0.35805845071399023 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=606.762, mean=606.762, max=606.762, sum=1213.524 (2)", - "tab": "General information", - "score": 606.7619047619048 - }, - "Formal 
Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.32748886615999284 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.31104220545350625 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.465, mean=0.465, max=0.465, sum=0.93 (2)", - "tab": "Efficiency", - "score": 0.4648329520225525 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.284, mean=1.284, max=1.284, sum=2.569 (2)", - "tab": "Efficiency", - "score": 1.2842581590016684 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.544 (2)", - "tab": "Efficiency", - "score": 0.27224273031408136 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.2989391489967781 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.273, mean=0.273, max=0.273, sum=0.546 (2)", - "tab": "Efficiency", - "score": 0.2728824230340811 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.33938890828026663 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.57 (2)", - "tab": "Efficiency", - "score": 0.28512202290927663 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.34992847537362815 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.633 (2)", - "tab": "Efficiency", - "score": 0.31643713986108063 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.43886349929703605 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.004, mean=1.004, max=1.004, sum=2.009 (2)", - "tab": "Efficiency", - "score": 1.0044469611317504 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.677, mean=0.677, max=0.677, sum=1.354 (2)", - "tab": "Efficiency", - "score": 0.6767715281072045 - }, 
- "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.677, mean=513.677, max=513.677, sum=1027.355 (2)", - "tab": "General information", - "score": 513.6774193548387 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.714, mean=496.714, max=496.714, sum=993.429 (2)", - "tab": "General information", - "score": 496.7142857142857 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2798.073, mean=2798.073, max=2798.073, sum=5596.145 (2)", - "tab": "General information", - "score": 2798.072727272727 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.045, mean=372.045, max=372.045, sum=744.091 (2)", - "tab": "General information", - "score": 372.04545454545456 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=371.562, mean=371.562, max=371.562, sum=743.123 (2)", - "tab": "General information", - "score": 371.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.374, mean=532.374, max=532.374, sum=1064.748 (2)", - "tab": "General information", - "score": 532.3740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.025, mean=399.025, max=399.025, sum=798.05 (2)", - "tab": "General information", - "score": 399.02521008403363 - }, - "High School Microeconomics - # output tokens": 
{ - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.464, mean=560.464, max=560.464, sum=1120.927 (2)", - "tab": "General information", - "score": 560.4635761589404 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.246, mean=495.246, max=495.246, sum=990.492 (2)", - "tab": "General information", - "score": 495.24587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.699, mean=795.699, max=795.699, sum=1591.398 (2)", - "tab": "General information", - "score": 795.699074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High 
School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.27, mean=1428.27, max=1428.27, sum=2856.54 (2)", - "tab": "General information", - "score": 1428.2700421940929 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "description": "min=0.817, mean=0.817, max=0.817, sum=1.634 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.258, mean=0.258, max=0.258, sum=0.515 (2)", - "tab": "Efficiency", - "score": 0.2577151257895568 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.264, mean=0.264, max=0.264, sum=0.529 (2)", - "tab": "Efficiency", - "score": 0.26447626470609475 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.906, mean=319.906, max=319.906, sum=639.812 (2)", - "tab": "General information", - "score": 319.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.183, mean=341.183, max=341.183, sum=682.366 (2)", - "tab": "General information", - "score": 341.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.743 (2)", - "tab": "Efficiency", - "score": 0.3714516399320492 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.851, mean=639.851, max=639.851, sum=1279.702 (2)", - "tab": "General information", - "score": 639.8512396694215 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.30408222543681324 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.595, mean=449.595, max=449.595, sum=899.19 (2)", - "tab": "General information", - "score": 449.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.652, mean=0.652, max=0.652, sum=1.304 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.765 (2)", - "tab": "Efficiency", - "score": 0.3823078232152121 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.255, mean=0.255, max=0.255, sum=0.511 (2)", - "tab": "Efficiency", - "score": 0.2552997649294659 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.796, mean=283.796, max=283.796, sum=567.592 (2)", - "tab": "General information", - "score": 283.79611650485435 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.897, - "details": { - "description": "min=0.897, mean=0.897, max=0.897, sum=1.795 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.582 (2)", - "tab": "Efficiency", - "score": 0.29102008974450266 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Efficiency", - "score": 0.27023372411727903 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=341, mean=341, max=341, sum=682 (2)", - "tab": "General information", - "score": 341.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 
(2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.518 (2)", - "tab": "Efficiency", - "score": 0.25915825382198565 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.925, mean=299.925, max=299.925, sum=599.849 (2)", - "tab": "General information", - "score": 299.92464878671774 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.421 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.617 (2)", - "tab": "Efficiency", - "score": 0.3084571650951584 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.766 (2)", - "tab": "Efficiency", - "score": 0.3827664223463176 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.145, mean=476.145, max=476.145, sum=952.289 (2)", - "tab": "General information", - "score": 476.1445086705202 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.699 (2)", - "tab": "Efficiency", - "score": 0.34937040011088055 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.817, mean=586.817, max=586.817, sum=1173.634 (2)", - "tab": "General information", - "score": 586.8169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.858, - "details": { - "description": "min=0.858, mean=0.858, max=0.858, sum=1.716 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.649 (2)", - "tab": "Efficiency", - "score": 0.32473731188126553 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.559, mean=514.559, max=514.559, sum=1029.117 (2)", - "tab": "General information", - "score": 514.5586419753087 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public 
Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.455 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.517 (2)", - "tab": "Efficiency", - "score": 0.2587012074210427 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.528, mean=0.528, max=0.528, sum=1.057 (2)", - "tab": "Efficiency", - "score": 0.5282714629659847 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.771 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.534 (2)", - "tab": "Efficiency", - "score": 0.2668588197053368 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.522, mean=445.522, max=445.522, sum=891.045 (2)", - "tab": "General information", - "score": 445.5223880597015 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.096 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.47 (2)", - "tab": "Efficiency", - "score": 0.235107473580234 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.09, mean=343.09, max=343.09, sum=686.181 (2)", - "tab": "General information", - "score": 343.0903614457831 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.649 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.226, mean=0.226, max=0.226, sum=0.453 (2)", - "tab": "Efficiency", - "score": 0.22640645016006558 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=275.561, mean=275.561, max=275.561, sum=551.123 (2)", - "tab": "General information", - "score": 275.56140350877195 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.708, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json b/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json deleted file mode 100644 index e788149e1..000000000 --- a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Instruct v0.3 7B", - "id": "mistralai/mistral-7b-instruct-v0.3", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599, - "details": { - "description": "min=0.258, mean=0.599, max=0.881, sum=68.3 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.212, mean=0.526, max=1.438, 
sum=59.959 (114)", - "tab": "Efficiency", - "score": 0.525951832745908 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=317.924, mean=705.273, max=3098.109, sum=80401.178 (114)", - "tab": "General information", - "score": 705.2734899593811 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - 
"mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 (2)", - "tab": "Efficiency", - "score": 0.32117165088653565 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=411.44, mean=411.44, max=411.44, sum=822.88 (2)", - "tab": "General information", - "score": 411.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.585, - "details": { - "description": "min=0.585, mean=0.585, max=0.585, sum=1.17 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.246, mean=0.246, max=0.246, sum=0.493 (2)", - "tab": "Efficiency", - "score": 0.24627229902479383 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=416.089, mean=416.089, max=416.089, sum=832.178 (2)", - "tab": "General information", - "score": 416.0888888888889 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 
- } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.343, - "details": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.221, mean=0.221, max=0.221, sum=0.442 (2)", - "tab": "Efficiency", - "score": 0.22099271774291993 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.399 (2)", - "tab": "Efficiency", - "score": 0.6997380173868604 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.932 (2)", - "tab": "Efficiency", - "score": 0.4661028146743774 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.212, mean=0.212, max=0.212, sum=0.424 (2)", - "tab": "Efficiency", - "score": 0.21210591793060302 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.774 (2)", - "tab": "Efficiency", - "score": 0.3871537646806309 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.91 (2)", - "tab": "Efficiency", - "score": 0.45503536392660704 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=636.71, mean=636.71, max=636.71, sum=1273.42 (2)", - "tab": "General information", - "score": 636.71 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=559.799, mean=559.799, max=559.799, sum=1119.597 (2)", - "tab": "General information", - "score": 559.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=911.17, mean=911.17, max=911.17, sum=1822.34 (2)", - "tab": "General information", - "score": 911.17 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=667.31, mean=667.31, max=667.31, sum=1334.62 (2)", - "tab": "General information", - "score": 667.31 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=601.41, mean=601.41, max=601.41, sum=1202.821 (2)", - "tab": "General information", - "score": 601.4104046242775 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=560.029, mean=560.029, max=560.029, sum=1120.059 (2)", - "tab": "General information", - "score": 560.0294117647059 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": 
"min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.853 (2)", - "tab": "Efficiency", - "score": 0.4263953256607056 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=433.94, mean=433.94, max=433.94, sum=867.88 (2)", - "tab": "General information", - "score": 433.94 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421, - "details": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.813 (2)", - "tab": "Efficiency", - "score": 0.406455958098696 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=696.175, mean=696.175, max=696.175, sum=1392.351 (2)", - "tab": "General information", - "score": 696.1754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - 
"tab": "Efficiency", - "score": 0.29881003856658933 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=492.47, mean=492.47, max=492.47, sum=984.94 (2)", - "tab": "General information", - "score": 492.47 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.426 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.232, mean=0.232, max=0.232, sum=0.465 (2)", - "tab": "Efficiency", - "score": 0.23237781833719323 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=460.093, mean=460.093, max=460.093, sum=920.185 (2)", - "tab": "General information", - "score": 460.0925925925926 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.318 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.798 (2)", - "tab": "Efficiency", - "score": 0.8987545852109167 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=382.82, mean=382.82, max=382.82, sum=765.64 (2)", - "tab": "General information", - "score": 382.81993569131834 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.641, - "details": { - "description": "min=0.641, mean=0.641, max=0.641, sum=1.281 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.615, mean=0.615, max=0.615, sum=1.23 (2)", - "tab": "Efficiency", - "score": 0.6148438769228318 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.651 (2)", - "tab": "Efficiency", - "score": 0.8254362666015084 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.364 (2)", - "tab": "Efficiency", - "score": 0.68212915414937 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=1.012 (2)", - "tab": "Efficiency", - "score": 0.505940170459498 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1288.143, mean=1288.143, max=1288.143, sum=2576.287 (2)", - "tab": "General information", - "score": 1288.1433823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=805.496, mean=805.496, max=805.496, sum=1610.993 (2)", - "tab": "General information", - "score": 805.4964539007092 - }, - "Professional Accounting - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1858.711, mean=1858.711, max=1858.711, sum=3717.421 (2)", - "tab": "General information", - "score": 1858.7105606258149 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=654.278, mean=654.278, max=654.278, sum=1308.556 (2)", - "tab": "General information", - "score": 654.2777777777778 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.973 (2)", - "tab": "Efficiency", - "score": 0.48650413513183594 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=482.19, mean=482.19, max=482.19, sum=964.38 (2)", - "tab": "General information", - "score": 482.19 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638, - "details": { - "description": "min=0.638, mean=0.638, max=0.638, sum=1.276 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.678, mean=0.678, max=0.678, sum=1.355 (2)", - "tab": "Efficiency", - "score": 0.6775346147386652 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=674.987, mean=674.987, max=674.987, sum=1349.974 (2)", - "tab": "General information", - "score": 674.9868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.645, mean=0.645, max=0.645, sum=1.289 (2)", - "tab": "Efficiency", - "score": 0.6446590375900269 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=653.6, mean=653.6, max=653.6, sum=1307.2 (2)", - "tab": "General information", - "score": 653.6 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687, - "details": { - "description": "min=0.687, mean=0.687, max=0.687, sum=1.374 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.844, mean=0.844, max=0.844, sum=1.687 (2)", - "tab": "Efficiency", - "score": 0.8436905698956184 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=496.174, mean=496.174, max=496.174, sum=992.347 (2)", - "tab": "General information", - "score": 496.1735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549, - "details": { - "description": "min=0.549, mean=0.549, max=0.549, sum=1.098 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.666 (2)", - "tab": "Efficiency", - "score": 0.33306963900302317 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=343.285, mean=343.285, max=343.285, sum=686.57 (2)", - "tab": "General information", - "score": 343.2851063829787 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - "details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.784 (2)", - "tab": "Efficiency", - "score": 0.3922290703345989 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=510.379, mean=510.379, max=510.379, sum=1020.759 (2)", - "tab": "General information", - "score": 510.37931034482756 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402, - "details": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.676, mean=0.676, max=0.676, sum=1.352 (2)", - "tab": "Efficiency", - "score": 0.6761655416438188 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=622.386, mean=622.386, max=622.386, sum=1244.772 (2)", - "tab": "General information", - "score": 622.3862433862433 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397, - "details": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.734, mean=0.734, max=0.734, sum=1.467 (2)", - "tab": "Efficiency", - "score": 0.7336057802987477 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=727.984, mean=727.984, max=727.984, sum=1455.968 (2)", - "tab": "General information", - "score": 727.984126984127 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.519 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.805, mean=0.805, max=0.805, sum=1.61 (2)", - "tab": "Efficiency", - "score": 0.8049156188964843 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.881 (2)", - "tab": "Efficiency", - "score": 0.44036899529067164 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.4347002100944519 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.891 (2)", - "tab": "Efficiency", - "score": 0.4453156341205944 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.661 (2)", - "tab": "Efficiency", - "score": 0.3305177327358361 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.089 (2)", - "tab": "Efficiency", - "score": 0.5445178654527417 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.061 (2)", - "tab": "Efficiency", - "score": 0.5302642871172 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.585, 
mean=0.585, max=0.585, sum=1.169 (2)", - "tab": "Efficiency", - "score": 0.5845282289716932 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.468 (2)", - "tab": "Efficiency", - "score": 0.23408917118521297 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.3838195042894376 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.274, mean=0.274, max=0.274, sum=0.547 (2)", - "tab": "Efficiency", - "score": 0.2735835779697523 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.654, mean=0.654, max=0.654, sum=1.308 (2)", - "tab": "Efficiency", - "score": 0.6539056665367551 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.883 (2)", - "tab": "Efficiency", - "score": 0.9417344308366963 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.727 (2)", - "tab": "Efficiency", - "score": 0.8635432951561006 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=609.561, mean=609.561, max=609.561, sum=1219.123 (2)", - "tab": "General information", - "score": 609.5612903225806 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=581.798, mean=581.798, max=581.798, sum=1163.596 (2)", - "tab": "General information", - "score": 581.7980295566502 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=997.24, mean=997.24, max=997.24, sum=1994.48 (2)", - "tab": "General information", - "score": 997.24 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", 
- "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3098.109, mean=3098.109, max=3098.109, sum=6196.218 (2)", - "tab": "General information", - "score": 3098.109090909091 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=438.207, mean=438.207, max=438.207, sum=876.414 (2)", - "tab": "General information", - "score": 438.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=523.808, mean=523.808, max=523.808, sum=1047.617 (2)", - "tab": "General information", - "score": 523.8082901554404 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=432.815, mean=432.815, max=432.815, sum=865.631 (2)", - "tab": "General information", - "score": 432.81538461538463 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=593.13, mean=593.13, max=593.13, sum=1186.259 (2)", - "tab": "General information", - "score": 593.1296296296297 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=452.345, mean=452.345, max=452.345, sum=904.689 (2)", - "tab": "General information", - "score": 452.34453781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=631.775, mean=631.775, max=631.775, sum=1263.55 (2)", - "tab": "General information", - "score": 631.774834437086 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=567.873, mean=567.873, max=567.873, sum=1135.747 (2)", - "tab": "General information", - "score": 567.8733944954129 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=922.644, mean=922.644, max=922.644, sum=1845.287 (2)", - "tab": "General information", - "score": 922.6435185185185 - }, - "High School Statistics - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2486.446, mean=2486.446, max=2486.446, sum=4972.892 (2)", - "tab": "General information", - "score": 2486.4460784313724 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1594.553, mean=1594.553, max=1594.553, sum=3189.105 (2)", - "tab": "General information", - "score": 1594.5527426160338 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.405 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Efficiency", - "score": 0.8091403518557014 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=1.438, mean=1.438, max=1.438, sum=2.875 (2)", - "tab": "Efficiency", - "score": 1.437711750278036 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=362.152, mean=362.152, max=362.152, sum=724.305 (2)", - "tab": "General information", - "score": 362.15246636771303 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=403.748, mean=403.748, max=403.748, sum=807.496 (2)", - "tab": "General information", - "score": 403.7480916030534 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.521 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", - "tab": "Efficiency", - "score": 0.3933255593638775 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=729.182, mean=729.182, max=729.182, sum=1458.364 (2)", - "tab": "General information", - "score": 729.1818181818181 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.712, - "details": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.423 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.695 (2)", - "tab": "Efficiency", - "score": 0.8476987660296855 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, 
sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=495.779, mean=495.779, max=495.779, sum=991.558 (2)", - "tab": "General information", - "score": 495.77914110429447 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455, - "details": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.911 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.557, mean=0.557, max=0.557, sum=1.113 (2)", - "tab": "Efficiency", - "score": 0.5566470899752208 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=743.83, mean=743.83, max=743.83, sum=1487.661 (2)", - "tab": "General information", - "score": 743.8303571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.767, - "details": { - "description": "min=0.767, mean=0.767, max=0.767, sum=1.534 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.36507687059420985 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=324.359, mean=324.359, max=324.359, sum=648.718 (2)", - "tab": "General information", - "score": 324.3592233009709 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=1.17 (2)", - "tab": "Efficiency", - "score": 0.58499161606161 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=472.423, mean=472.423, max=472.423, sum=944.846 (2)", - "tab": "General information", - "score": 472.4230769230769 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.268, mean=0.268, max=0.268, sum=0.535 (2)", - "tab": "Efficiency", - "score": 0.2675498366355896 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt 
tokens": { - "description": "min=414.71, mean=414.71, max=414.71, sum=829.42 (2)", - "tab": "General information", - "score": 414.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.571 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.504, mean=0.504, max=0.504, sum=1.008 (2)", - "tab": "Efficiency", - "score": 0.5038632959850599 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=357.519, mean=357.519, max=357.519, sum=715.037 (2)", - "tab": "General information", - "score": 357.51851851851853 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393, - "details": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.777, mean=0.777, max=0.777, sum=1.553 (2)", - "tab": "Efficiency", - "score": 0.7765735477381359 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.986 (2)", - "tab": "Efficiency", - "score": 0.4927780463042872 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt 
tokens": { - "description": "min=549.038, mean=549.038, max=549.038, sum=1098.075 (2)", - "tab": "General information", - "score": 549.0375722543353 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=754.516, mean=754.516, max=754.516, sum=1509.032 (2)", - "tab": "General information", - "score": 754.5162011173185 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.676, - "details": { - "description": "min=0.676, mean=0.676, max=0.676, sum=1.353 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.236, mean=0.236, max=0.236, sum=0.471 (2)", - "tab": "Efficiency", - "score": 0.23563866054310517 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=689.69, mean=689.69, max=689.69, sum=1379.379 (2)", - "tab": "General information", - "score": 689.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.346 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, 
max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.34476134880089465 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=611.145, mean=611.145, max=611.145, sum=1222.29 (2)", - "tab": "General information", - "score": 611.145061728395 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.636, - "details": { - "description": "min=0.636, mean=0.636, max=0.636, sum=1.273 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)", - "tab": "Efficiency", - "score": 0.3271717678416859 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=471.036, mean=471.036, max=471.036, sum=942.073 (2)", - "tab": "General information", - "score": 471.03636363636366 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.363 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.121 (2)", - "tab": "Efficiency", - "score": 0.5606838294437954 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, 
sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1324.865, mean=1324.865, max=1324.865, sum=2649.731 (2)", - "tab": "General information", - "score": 1324.865306122449 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.612 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.825 (2)", - "tab": "Efficiency", - "score": 0.41272182962787685 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=496.95, mean=496.95, max=496.95, sum=993.9 (2)", - "tab": "General information", - "score": 496.9502487562189 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47, - "details": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.644, mean=0.644, max=0.644, sum=1.288 (2)", - "tab": "Efficiency", - "score": 0.6437842285776713 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=404.349, mean=404.349, max=404.349, sum=808.699 (2)", - "tab": "General information", - "score": 404.34939759036143 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.649 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.532 (2)", - "tab": "Efficiency", - "score": 0.26615772330970094 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=317.924, mean=317.924, max=317.924, sum=635.848 (2)", - "tab": "General information", - "score": 317.92397660818716 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json b/data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json deleted file mode 100644 index 5ca508d3b..000000000 --- a/data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-7b-v0.1/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": 
"documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral v0.1 7B", - "id": "mistralai/mistral-7b-v0.1", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.25, mean=0.566, max=0.845, sum=64.496 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.665, mean=0.864, max=1.234, sum=98.504 (114)", - "tab": "Efficiency", - "score": 0.8640714937745795 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=308.924, mean=696.273, max=3089.109, sum=79375.178 (114)", - "tab": "General information", - "score": 696.2734899593811 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - 
"mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25, - "details": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.667 (2)", - "tab": "Efficiency", - "score": 0.8337139582633972 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=402.44, mean=402.44, max=402.44, sum=804.88 (2)", - "tab": "General information", - "score": 402.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.467, - "details": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.933 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.435 (2)", - "tab": "Efficiency", - "score": 0.7173902529257316 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=407.089, mean=407.089, max=407.089, sum=814.178 (2)", - "tab": "General information", - "score": 407.0888888888889 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314, - "details": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.627 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=1.018, mean=1.018, max=1.018, sum=2.036 (2)", - "tab": "Efficiency", - "score": 1.0181659984588622 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.699, mean=0.699, max=0.699, sum=1.398 (2)", - "tab": "Efficiency", - "score": 0.699198540714052 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.423 (2)", - "tab": "Efficiency", - "score": 0.7115359020233154 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.77 (2)", - "tab": "Efficiency", - "score": 0.8852152943611145 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.896, mean=0.896, max=0.896, sum=1.793 (2)", - "tab": "Efficiency", - "score": 0.8963309629804137 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.606 (2)", - "tab": "Efficiency", - "score": 0.8030702249676573 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=627.71, mean=627.71, max=627.71, sum=1255.42 (2)", - "tab": "General information", - "score": 627.71 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=550.799, mean=550.799, max=550.799, sum=1101.597 (2)", - "tab": "General information", - "score": 550.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=902.17, mean=902.17, max=902.17, sum=1804.34 (2)", - "tab": "General information", - "score": 902.17 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=658.31, mean=658.31, max=658.31, sum=1316.62 (2)", - "tab": "General information", - "score": 658.31 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=592.41, mean=592.41, max=592.41, sum=1184.821 (2)", - "tab": "General information", - "score": 592.4104046242775 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - 
"description": "min=551.029, mean=551.029, max=551.029, sum=1102.059 (2)", - "tab": "General information", - "score": 551.0294117647059 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.033, mean=1.033, max=1.033, sum=2.065 (2)", - "tab": "Efficiency", - "score": 1.032561357021332 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=424.94, mean=424.94, max=424.94, sum=849.88 (2)", - "tab": "General information", - "score": 424.94 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.351, - "details": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.783, mean=0.783, max=0.783, sum=1.566 (2)", - "tab": "Efficiency", - "score": 0.7832156043303641 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=687.175, mean=687.175, max=687.175, sum=1374.351 (2)", - "tab": "General information", - "score": 687.1754385964912 - }, - "Econometrics - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.378 (2)", - "tab": "Efficiency", - "score": 0.6891914677619934 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=483.47, mean=483.47, max=483.47, sum=966.94 (2)", - "tab": "General information", - "score": 483.47 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.787, mean=0.787, max=0.787, sum=1.574 (2)", - "tab": "Efficiency", - "score": 0.7868193630818967 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=451.093, mean=451.093, max=451.093, sum=902.185 (2)", - "tab": "General information", - "score": 451.0925925925926 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "details": { - "description": "min=0.63, mean=0.63, max=0.63, sum=1.26 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.487 (2)", - "tab": "Efficiency", - "score": 0.7434952857026716 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=373.82, mean=373.82, max=373.82, sum=747.64 (2)", - "tab": "General information", - "score": 373.81993569131834 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.817, mean=0.817, max=0.817, sum=1.633 (2)", - "tab": "Efficiency", - "score": 0.816552089417682 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.487 (2)", - "tab": "Efficiency", - "score": 0.7432903905286856 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Efficiency", - "score": 0.8197952300659836 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.936, mean=0.936, max=0.936, sum=1.873 (2)", - "tab": "Efficiency", - "score": 0.9364227648654015 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - 
"description": "min=1279.143, mean=1279.143, max=1279.143, sum=2558.287 (2)", - "tab": "General information", - "score": 1279.1433823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=796.496, mean=796.496, max=796.496, sum=1592.993 (2)", - "tab": "General information", - "score": 796.4964539007092 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1849.711, mean=1849.711, max=1849.711, sum=3699.421 (2)", - "tab": "General information", - "score": 1849.7105606258149 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=645.278, mean=645.278, max=645.278, sum=1290.556 (2)", - "tab": "General information", - "score": 645.2777777777778 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.863, mean=0.863, max=0.863, sum=1.727 
(2)", - "tab": "Efficiency", - "score": 0.8633295917510986 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=473.19, mean=473.19, max=473.19, sum=946.38 (2)", - "tab": "General information", - "score": 473.19 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599, - "details": { - "description": "min=0.599, mean=0.599, max=0.599, sum=1.197 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Efficiency", - "score": 0.8039205105681169 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=665.987, mean=665.987, max=665.987, sum=1331.974 (2)", - "tab": "General information", - "score": 665.9868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.014, mean=1.014, max=1.014, sum=2.028 (2)", - "tab": "Efficiency", - "score": 1.013892731666565 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - 
"Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=644.6, mean=644.6, max=644.6, sum=1289.2 (2)", - "tab": "General information", - "score": 644.6 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653, - "details": { - "description": "min=0.653, mean=0.653, max=0.653, sum=1.306 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.613 (2)", - "tab": "Efficiency", - "score": 0.8066773774488917 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.174, mean=487.174, max=487.174, sum=974.347 (2)", - "tab": "General information", - "score": 487.1735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451, - "details": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.902 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.666 (2)", - "tab": "Efficiency", - "score": 0.833152520402949 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=334.285, mean=334.285, max=334.285, sum=668.57 (2)", - "tab": "General information", - "score": 334.2851063829787 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538, - "details": { - "description": "min=0.538, mean=0.538, max=0.538, sum=1.076 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=1.234, mean=1.234, max=1.234, sum=2.468 (2)", - "tab": "Efficiency", - "score": 1.2342401932025777 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=501.379, mean=501.379, max=501.379, sum=1002.759 (2)", - "tab": "General information", - "score": 501.37931034482756 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.32, - "details": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.64 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.707 (2)", - "tab": "Efficiency", - "score": 0.8535163610700577 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=613.386, mean=613.386, max=613.386, sum=1226.772 (2)", - "tab": "General information", - "score": 613.3862433862433 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365, - "details": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.022, mean=1.022, max=1.022, sum=2.044 (2)", - "tab": "Efficiency", - "score": 1.0218302371009949 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=718.984, mean=718.984, max=718.984, sum=1437.968 (2)", - "tab": "General information", - "score": 718.984126984127 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.451 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.904, mean=0.904, max=0.904, sum=1.808 (2)", - "tab": "Efficiency", - "score": 0.9039220233117381 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.782 (2)", - "tab": "Efficiency", - "score": 0.8910855988563575 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.519 (2)", - "tab": "Efficiency", - "score": 0.7594162678718567 - }, - "High School European 
History - Observed inference time (s)": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Efficiency", - "score": 0.7088880394444321 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.818 (2)", - "tab": "Efficiency", - "score": 0.9091630006077314 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.665, mean=0.665, max=0.665, sum=1.329 (2)", - "tab": "Efficiency", - "score": 0.6645773976577996 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.682 (2)", - "tab": "Efficiency", - "score": 0.8412165372799605 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.767, mean=0.767, max=0.767, sum=1.534 (2)", - "tab": "Efficiency", - "score": 0.7671932847411544 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.995, mean=0.995, max=0.995, sum=1.99 (2)", - "tab": "Efficiency", - "score": 0.994775929370848 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.805 (2)", - "tab": "Efficiency", - "score": 0.9024771317740939 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.585 (2)", - "tab": "Efficiency", - "score": 0.7925117606416755 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.768 (2)", - "tab": "Efficiency", - "score": 0.8837873924661566 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.171, mean=1.171, max=1.171, sum=2.341 (2)", - "tab": "Efficiency", - "score": 1.170638754087336 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.801, mean=0.801, max=0.801, sum=1.603 (2)", - "tab": "Efficiency", - "score": 0.8013244822055479 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=600.561, mean=600.561, max=600.561, sum=1201.123 (2)", - "tab": "General information", - "score": 600.5612903225806 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=572.798, mean=572.798, max=572.798, sum=1145.596 (2)", - "tab": "General information", - "score": 572.7980295566502 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.24, mean=988.24, max=988.24, sum=1976.48 (2)", - "tab": "General information", - "score": 988.24 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3089.109, mean=3089.109, max=3089.109, sum=6178.218 (2)", - "tab": "General information", - "score": 3089.109090909091 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=429.207, mean=429.207, max=429.207, sum=858.414 (2)", - "tab": "General information", - "score": 429.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=514.808, mean=514.808, max=514.808, sum=1029.617 (2)", - "tab": "General information", - "score": 514.8082901554404 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=423.815, mean=423.815, max=423.815, sum=847.631 (2)", - "tab": "General information", - "score": 423.81538461538463 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=584.13, mean=584.13, max=584.13, sum=1168.259 (2)", - "tab": "General information", - "score": 584.1296296296297 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=443.345, mean=443.345, max=443.345, sum=886.689 (2)", - "tab": "General information", - "score": 443.34453781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=622.775, mean=622.775, max=622.775, sum=1245.55 (2)", - "tab": "General information", - "score": 622.774834437086 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=558.873, mean=558.873, max=558.873, sum=1117.747 (2)", - "tab": "General information", - "score": 
558.8733944954129 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=913.644, mean=913.644, max=913.644, sum=1827.287 (2)", - "tab": "General information", - "score": 913.6435185185185 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2477.446, mean=2477.446, max=2477.446, sum=4954.892 (2)", - "tab": "General information", - "score": 2477.4460784313724 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1585.553, mean=1585.553, max=1585.553, sum=3171.105 (2)", - "tab": "General information", - "score": 1585.5527426160338 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.405 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)", - "tab": "Efficiency", - "score": 0.778804096940387 - }, - "Human Sexuality - 
Observed inference time (s)": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.701 (2)", - "tab": "Efficiency", - "score": 0.8504140213245653 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=353.152, mean=353.152, max=353.152, sum=706.305 (2)", - "tab": "General information", - "score": 353.15246636771303 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=394.748, mean=394.748, max=394.748, sum=789.496 (2)", - "tab": "General information", - "score": 394.7480916030534 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.521 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Efficiency", - "score": 0.9102441850772574 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=720.182, mean=720.182, max=720.182, sum=1440.364 (2)", - "tab": "General information", - "score": 720.1818181818181 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, 
- { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.387 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.613 (2)", - "tab": "Efficiency", - "score": 0.8063952381625498 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=486.779, mean=486.779, max=486.779, sum=973.558 (2)", - "tab": "General information", - "score": 486.77914110429447 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438, - "details": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.875 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.503 (2)", - "tab": "Efficiency", - "score": 0.7514570632151195 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=734.83, mean=734.83, max=734.83, sum=1469.661 (2)", - "tab": "General information", - "score": 734.8303571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.417 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Efficiency", - "score": 0.9339890294862025 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.359, mean=315.359, max=315.359, sum=630.718 (2)", - "tab": "General information", - "score": 315.3592233009709 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=1.072, mean=1.072, max=1.072, sum=2.144 (2)", - "tab": "Efficiency", - "score": 1.0717963163669293 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=463.423, mean=463.423, max=463.423, sum=926.846 (2)", - "tab": "General information", - "score": 463.4230769230769 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "details": { - "description": "min=0.68, mean=0.68, max=0.68, sum=1.36 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.929, mean=0.929, max=0.929, sum=1.859 (2)", - "tab": "Efficiency", - "score": 0.9293915629386902 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=405.71, mean=405.71, max=405.71, sum=811.42 (2)", - "tab": "General information", - "score": 405.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.441 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.874, mean=0.874, max=0.874, sum=1.747 (2)", - "tab": "Efficiency", - "score": 0.8736470007500582 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=348.519, mean=348.519, max=348.519, sum=697.037 (2)", - "tab": "General information", - "score": 348.51851851851853 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, 
sum=0.659 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.545 (2)", - "tab": "Efficiency", - "score": 0.7723477258847627 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.887, mean=0.887, max=0.887, sum=1.774 (2)", - "tab": "Efficiency", - "score": 0.8867556284259818 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=540.038, mean=540.038, max=540.038, sum=1080.075 (2)", - "tab": "General information", - "score": 540.0375722543353 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=745.516, mean=745.516, max=745.516, sum=1491.032 (2)", - "tab": "General information", - "score": 745.5162011173185 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.657, - "details": { - "description": "min=0.657, mean=0.657, max=0.657, sum=1.314 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.982, mean=0.982, max=0.982, sum=1.964 (2)", - "tab": "Efficiency", - "score": 0.9817679053038554 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=680.69, mean=680.69, max=680.69, sum=1361.379 (2)", - "tab": "General information", - "score": 680.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - 
}, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642, - "details": { - "description": "min=0.642, mean=0.642, max=0.642, sum=1.284 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.505 (2)", - "tab": "Efficiency", - "score": 0.7522576863383069 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=602.145, mean=602.145, max=602.145, sum=1204.29 (2)", - "tab": "General information", - "score": 602.145061728395 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=1.121, mean=1.121, max=1.121, sum=2.241 (2)", - "tab": "Efficiency", - "score": 1.120634336905046 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=462.036, mean=462.036, max=462.036, sum=924.073 (2)", - "tab": "General information", - "score": 462.03636363636366 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - 
"evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.461 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.674 (2)", - "tab": "Efficiency", - "score": 0.8369822920585165 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1315.865, mean=1315.865, max=1315.865, sum=2631.731 (2)", - "tab": "General information", - "score": 1315.865306122449 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=1.662 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.819 (2)", - "tab": "Efficiency", - "score": 0.9092605125844775 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=487.95, mean=487.95, max=487.95, sum=975.9 (2)", - "tab": "General information", - "score": 487.9502487562189 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44, - "details": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.771 (2)", - "tab": "Efficiency", - "score": 0.8854893704494798 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=395.349, mean=395.349, max=395.349, sum=790.699 (2)", - "tab": "General information", - "score": 395.34939759036143 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.863, mean=0.863, max=0.863, sum=1.726 (2)", - "tab": "Efficiency", - "score": 0.8629393619403505 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=308.924, mean=308.924, max=308.924, sum=617.848 (2)", - "tab": "General information", - "score": 308.92397660818716 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms 
on average (over columns).",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.213,
-        "details": {
-          "tab": "Efficiency"
-        }
-      },
-      "generation_config": {
-        "additional_details": {}
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json b/data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json
deleted file mode 100644
index 6b7873124..000000000
--- a/data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/mistralai_mistral-large-2402/1770835937.459157",
-  "retrieved_timestamp": "1770835937.459157",
-  "source_metadata": {
-    "source_name": "helm_mmlu",
-    "source_type": "documentation",
-    "source_organization_name": "crfm",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Mistral Large 2402",
-    "id": "mistralai/mistral-large-2402",
-    "developer": "mistralai",
-    "inference_platform": "unknown"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "MMLU All Subjects",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on MMLU All Subjects",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.688,
-        "details": {
-          "description": "min=0.211, mean=0.688, max=0.964, sum=78.413 (114)",
-          "tab": "Accuracy",
-          "MMLU All Subjects - Observed inference time (s)": {
-            "description": "min=0.356, mean=0.546, max=1.633, sum=62.26 (114)",
-            "tab": "Efficiency",
-            "score": 0.5461372164599003
-          },
-          "MMLU All Subjects - # eval": {
-            "description": "min=100, mean=246.351, max=1534, sum=28084 (114)",
-            "tab": "General information",
-            "score": 246.35087719298247
-          },
-          "MMLU All Subjects - # train": {
-            "description": "min=5, mean=5, max=5, sum=570 (114)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "MMLU All Subjects - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (114)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "MMLU All Subjects - # prompt tokens": {
-            "description": "min=308.924, mean=696.273, max=3089.109, sum=79375.178 (114)",
-            "tab": "General information",
-            "score": 696.2734899593811
-          },
-          "MMLU All Subjects - # output tokens": {
-            "description": "min=1, mean=1, max=1, sum=114 (114)",
-            "tab": "General information",
-            "score": 1.0
-          }
-        }
-      },
-      "generation_config": {
-        "additional_details": {
-          "subject": [
-            "abstract_algebra",
-            "anatomy",
-            "astronomy",
-            "business_ethics",
-            "clinical_knowledge",
-            "college_biology",
-            "college_chemistry",
-            "college_computer_science",
-            "college_mathematics",
-            "college_medicine",
-            "college_physics",
-            "computer_security",
-            "conceptual_physics",
-            "econometrics",
-            "electrical_engineering",
-            "elementary_mathematics",
-            "formal_logic",
-            "global_facts",
-            "high_school_biology",
-            "high_school_chemistry",
-            "high_school_computer_science",
-            "high_school_european_history",
-            "high_school_geography",
-            "high_school_government_and_politics",
-            "high_school_macroeconomics",
-            "high_school_mathematics",
-            "high_school_microeconomics",
-            "high_school_physics",
-            "high_school_psychology",
-            "high_school_statistics",
-            "high_school_us_history",
-            "high_school_world_history",
-            "human_aging",
-            "human_sexuality",
-            "international_law",
-            "jurisprudence",
-            "logical_fallacies",
-            "machine_learning",
-            "management",
-            "marketing",
-            "medical_genetics",
-            "miscellaneous",
-            "moral_disputes",
-            "moral_scenarios",
-            "nutrition",
-            "philosophy",
-            "prehistory",
-            "professional_accounting",
-            "professional_law",
-            "professional_medicine",
-            "professional_psychology",
-            "public_relations",
-            "security_studies",
-            "sociology",
-            "us_foreign_policy",
-            "virology",
-            "world_religions"
-          ],
-          "method": "multiple_choice_joint",
-          "eval_split": "test",
-          "groups": [
-            "mmlu_abstract_algebra",
-            "mmlu_anatomy",
-            "mmlu_astronomy",
-            "mmlu_business_ethics",
-            "mmlu_clinical_knowledge",
-            "mmlu_college_biology",
-            "mmlu_college_chemistry",
-            "mmlu_college_computer_science",
-            "mmlu_college_mathematics",
-            "mmlu_college_medicine",
-            "mmlu_college_physics",
-            "mmlu_computer_security",
-            "mmlu_conceptual_physics",
-            "mmlu_econometrics",
-            "mmlu_electrical_engineering",
-            "mmlu_elementary_mathematics",
-            "mmlu_formal_logic",
-            "mmlu_global_facts",
-            "mmlu_high_school_biology",
-            "mmlu_high_school_chemistry",
-            "mmlu_high_school_computer_science",
-            "mmlu_high_school_european_history",
-            "mmlu_high_school_geography",
-            "mmlu_high_school_government_and_politics",
-            "mmlu_high_school_macroeconomics",
-            "mmlu_high_school_mathematics",
-            "mmlu_high_school_microeconomics",
-            "mmlu_high_school_physics",
-            "mmlu_high_school_psychology",
-            "mmlu_high_school_statistics",
-            "mmlu_high_school_us_history",
-            "mmlu_high_school_world_history",
-            "mmlu_human_aging",
-            "mmlu_human_sexuality",
-            "mmlu_international_law",
-            "mmlu_jurisprudence",
-            "mmlu_logical_fallacies",
-            "mmlu_machine_learning",
-            "mmlu_management",
-            "mmlu_marketing",
-            "mmlu_medical_genetics",
-            "mmlu_miscellaneous",
-            "mmlu_moral_disputes",
-            "mmlu_moral_scenarios",
-            "mmlu_nutrition",
-            "mmlu_philosophy",
-            "mmlu_prehistory",
-            "mmlu_professional_accounting",
-            "mmlu_professional_law",
-            "mmlu_professional_medicine",
-            "mmlu_professional_psychology",
-            "mmlu_public_relations",
-            "mmlu_security_studies",
-            "mmlu_sociology",
-            "mmlu_us_foreign_policy",
-            "mmlu_virology",
-            "mmlu_world_religions"
-          ]
-        }
-      }
-    },
-    {
-      "evaluation_name": "Abstract Algebra",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on Abstract Algebra",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.45,
-        "details": {
-          "description": "min=0.45, mean=0.45, max=0.45, sum=0.9 (2)",
-          "tab": "Accuracy",
-          "Abstract Algebra - Observed inference time (s)": {
-            "description": "min=1.48, mean=1.48, max=1.48, sum=2.959 (2)",
-            "tab": "Efficiency",
-            "score": 1.4797466564178468
-          },
-          "Abstract Algebra - # eval": {
-            "description": "min=100, mean=100, max=100, sum=200 (2)",
-            "tab": "General information",
-            "score": 100.0
-          },
-          "Abstract Algebra - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Abstract Algebra - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Abstract Algebra - # prompt tokens": {
-            "description": "min=402.44, mean=402.44, max=402.44, sum=804.88 (2)",
-            "tab": "General information",
-            "score": 402.44
-          },
-          "Abstract Algebra - # output tokens": {
-            "description": "min=1, mean=1, max=1, sum=2 (2)",
-            "tab": "General information",
-            "score": 1.0
-          }
-        }
-      },
-      "generation_config": {
-        "additional_details": {
-          "subject": "abstract_algebra",
-          "method": "multiple_choice_joint",
-          "eval_split": "test",
-          "groups": "mmlu_abstract_algebra"
-        }
-      }
-    },
-    {
-      "evaluation_name": "Anatomy",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on Anatomy",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.674,
-        "details": {
-          "description": "min=0.674, mean=0.674, max=0.674, sum=1.348 (2)",
-          "tab": "Accuracy",
-          "Anatomy - Observed inference time (s)": {
-            "description": "min=0.484, mean=0.484, max=0.484, sum=0.968 (2)",
-            "tab": "Efficiency",
-            "score": 0.4840934417865895
-          },
-          "Anatomy - # eval": {
-            "description": "min=135, mean=135, max=135, sum=270 (2)",
-            "tab": "General information",
-            "score": 135.0
-          },
-          "Anatomy - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Anatomy - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Anatomy - # prompt tokens": {
-            "description": "min=407.089, mean=407.089, max=407.089, sum=814.178 (2)",
-            "tab": "General information",
-            "score": 407.0888888888889
-          },
-          "Anatomy - # output tokens": {
-            "description": "min=1, mean=1, max=1, sum=2 (2)",
-            "tab": "General information",
-            "score": 1.0
-          }
-        }
-      },
-      "generation_config": {
-        "additional_details": {
-          "subject": "anatomy",
-          "method": "multiple_choice_joint",
-          "eval_split": "test",
-          "groups": "mmlu_anatomy"
-        }
-      }
-    },
-    {
-      "evaluation_name": "College Physics",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on College Physics",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.373,
-        "details": {
-          "description": "min=0.373, mean=0.373, max=0.373, sum=0.745 (2)",
-          "tab": "Accuracy",
-          "College Chemistry - Observed inference time (s)": {
-            "description": "min=1.226, mean=1.226, max=1.226, sum=2.452 (2)",
-            "tab": "Efficiency",
-            "score": 1.2259348821640015
-          },
-          "College Biology - Observed inference time (s)": {
-            "description": "min=0.438, mean=0.438, max=0.438, sum=0.875 (2)",
-            "tab": "Efficiency",
-            "score": 0.43758388525909847
-          },
-          "College Computer Science - Observed inference time (s)": {
-            "description": "min=0.412, mean=0.412, max=0.412, sum=0.825 (2)",
-            "tab": "Efficiency",
-            "score": 0.41238118410110475
-          },
-          "College Mathematics - Observed inference time (s)": {
-            "description": "min=0.443, mean=0.443, max=0.443, sum=0.886 (2)",
-            "tab": "Efficiency",
-            "score": 0.44315950393676756
-          },
-          "College Medicine - Observed inference time (s)": {
-            "description": "min=0.425, mean=0.425, max=0.425, sum=0.849 (2)",
-            "tab": "Efficiency",
-            "score": 0.4246950163317554
-
}, - "College Physics - Observed inference time (s)": { - "description": "min=0.511, mean=0.511, max=0.511, sum=1.021 (2)", - "tab": "Efficiency", - "score": 0.510722931693582 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=627.71, mean=627.71, max=627.71, sum=1255.42 (2)", - "tab": "General information", - "score": 627.71 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=550.799, mean=550.799, max=550.799, sum=1101.597 (2)", - "tab": "General information", - "score": 550.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=902.17, mean=902.17, max=902.17, sum=1804.34 (2)", - "tab": "General information", - "score": 902.17 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=658.31, mean=658.31, max=658.31, sum=1316.62 (2)", - "tab": "General information", - "score": 658.31 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=592.41, mean=592.41, max=592.41, sum=1184.821 (2)", - "tab": "General information", - "score": 592.4104046242775 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=551.029, mean=551.029, max=551.029, sum=1102.059 (2)", - "tab": "General information", - "score": 551.0294117647059 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.535, mean=1.535, max=1.535, sum=3.071 (2)", - "tab": "Efficiency", - "score": 1.5353856110572814 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=424.94, mean=424.94, max=424.94, sum=849.88 (2)", - "tab": "General information", - "score": 424.94 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - 
"details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.281 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=1.383, mean=1.383, max=1.383, sum=2.766 (2)", - "tab": "Efficiency", - "score": 1.382804548531248 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=687.175, mean=687.175, max=687.175, sum=1374.351 (2)", - "tab": "General information", - "score": 687.1754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34, - "details": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.984 (2)", - "tab": "Efficiency", - "score": 0.49177081823349 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=483.47, mean=483.47, max=483.47, sum=966.94 (2)", - "tab": "General information", - "score": 483.47 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=1.63 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.505, mean=0.505, max=0.505, sum=1.01 (2)", - "tab": 
"Efficiency", - "score": 0.5051956353364168 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=451.093, mean=451.093, max=451.093, sum=902.185 (2)", - "tab": "General information", - "score": 451.0925925925926 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.794, - "details": { - "description": "min=0.794, mean=0.794, max=0.794, sum=1.588 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=1.011 (2)", - "tab": "Efficiency", - "score": 0.5055920081123279 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=373.82, mean=373.82, max=373.82, sum=747.64 (2)", - "tab": "General information", - "score": 373.81993569131834 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.956 (2)", - "tab": "Efficiency", - "score": 0.4777693476747064 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.886 (2)", - "tab": 
"Efficiency", - "score": 0.4430855546437257 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.987 (2)", - "tab": "Efficiency", - "score": 0.4934647888372588 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.966 (2)", - "tab": "Efficiency", - "score": 0.4830952575004179 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1279.143, mean=1279.143, max=1279.143, sum=2558.287 (2)", - "tab": "General information", - "score": 1279.1433823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=796.496, mean=796.496, max=796.496, sum=1592.993 (2)", - "tab": "General information", - "score": 796.4964539007092 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1849.711, mean=1849.711, max=1849.711, sum=3699.421 (2)", - "tab": "General information", - "score": 1849.7105606258149 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=645.278, mean=645.278, max=645.278, sum=1290.556 (2)", - "tab": "General information", - "score": 645.2777777777778 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - 
} - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=1.633, mean=1.633, max=1.633, sum=3.266 (2)", - "tab": "Efficiency", - "score": 1.6332264852523803 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=473.19, mean=473.19, max=473.19, sum=946.38 (2)", - "tab": "General information", - "score": 473.19 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.4503253243471447 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=665.987, mean=665.987, max=665.987, sum=1331.974 (2)", - "tab": "General information", - "score": 665.9868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - 
} - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.34 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.821 (2)", - "tab": "Efficiency", - "score": 0.4105031824111938 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=644.6, mean=644.6, max=644.6, sum=1289.2 (2)", - "tab": "General information", - "score": 644.6 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.521, mean=0.521, max=0.521, sum=1.042 (2)", - "tab": "Efficiency", - "score": 0.5210292402303444 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.174, mean=487.174, max=487.174, sum=974.347 (2)", - "tab": "General information", - "score": 487.1735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.574, - "details": { - "description": "min=0.574, mean=0.574, max=0.574, sum=1.149 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.835 (2)", - "tab": "Efficiency", - "score": 0.41761813873940323 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=334.285, mean=334.285, max=334.285, sum=668.57 (2)", - "tab": "General information", - "score": 334.2851063829787 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.545, - "details": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.09 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.08 (2)", - "tab": "Efficiency", - "score": 0.5400767852520121 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=501.379, mean=501.379, max=501.379, sum=1002.759 (2)", - "tab": "General information", - "score": 501.37931034482756 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508, - "details": { - "description": "min=0.508, mean=0.508, max=0.508, sum=1.016 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Efficiency", - "score": 0.4338057312385115 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=613.386, mean=613.386, max=613.386, sum=1226.772 (2)", - "tab": "General information", - "score": 613.3862433862433 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532, - "details": { - "description": "min=0.532, mean=0.532, max=0.532, sum=1.063 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.512, mean=0.512, max=0.512, sum=1.024 (2)", - "tab": "Efficiency", - "score": 0.5122278436781869 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=718.984, mean=718.984, max=718.984, sum=1437.968 (2)", - "tab": "General information", - "score": 718.984126984127 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.526, mean=0.526, max=0.526, sum=1.052 (2)", - "tab": "Efficiency", - "score": 0.5259702259494412 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.803 (2)", - "tab": "Efficiency", - "score": 0.4016201167271055 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.797 (2)", - "tab": "Efficiency", - "score": 0.3984186482429504 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.298 (2)", - "tab": "Efficiency", - "score": 0.6488189350474964 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.449, mean=0.449, max=0.449, sum=0.897 (2)", - "tab": "Efficiency", - "score": 0.44867861752558236 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.883 (2)", - "tab": "Efficiency", - "score": 0.44147809675938104 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.912 (2)", - "tab": "Efficiency", - "score": 0.45610924195020625 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.854 (2)", - "tab": "Efficiency", - "score": 0.4269448068406847 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.805 (2)", - "tab": "Efficiency", - "score": 0.4023913435575341 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.861 (2)", - "tab": "Efficiency", - "score": 0.43034561738273164 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.856 (2)", - "tab": "Efficiency", - "score": 0.4278128755201987 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Efficiency", - "score": 0.42108922203381854 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.08 (2)", - "tab": "Efficiency", - "score": 0.5401732255430782 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.943 (2)", - "tab": "Efficiency", - "score": 0.47126107075043366 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=600.561, mean=600.561, max=600.561, sum=1201.123 (2)", - "tab": "General information", - "score": 600.5612903225806 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=572.798, mean=572.798, max=572.798, sum=1145.596 (2)", - "tab": "General information", - "score": 572.7980295566502 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.24, mean=988.24, max=988.24, sum=1976.48 (2)", - "tab": "General information", - "score": 988.24 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3089.109, mean=3089.109, max=3089.109, sum=6178.218 (2)", - "tab": "General information", - "score": 3089.109090909091 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=429.207, mean=429.207, max=429.207, sum=858.414 (2)", - "tab": "General information", - "score": 429.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=514.808, mean=514.808, max=514.808, sum=1029.617 (2)", - "tab": "General information", - "score": 514.8082901554404 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=423.815, mean=423.815, max=423.815, sum=847.631 (2)", - "tab": "General information", - "score": 423.81538461538463 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=584.13, mean=584.13, max=584.13, sum=1168.259 (2)", - "tab": "General information", - "score": 584.1296296296297 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=443.345, mean=443.345, max=443.345, sum=886.689 (2)", - "tab": "General information", - "score": 443.34453781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=622.775, mean=622.775, max=622.775, sum=1245.55 (2)", - "tab": "General information", - "score": 622.774834437086 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=558.873, mean=558.873, max=558.873, sum=1117.747 (2)", - "tab": "General information", - "score": 558.8733944954129 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=913.644, mean=913.644, max=913.644, sum=1827.287 (2)", - "tab": "General information", - "score": 913.6435185185185 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2477.446, mean=2477.446, max=2477.446, sum=4954.892 (2)", - "tab": "General information", - "score": 2477.4460784313724 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1585.553, mean=1585.553, max=1585.553, sum=3171.105 (2)", - "tab": "General information", - "score": 1585.5527426160338 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.803 (2)", - "tab": "Efficiency", - "score": 0.4013588674399885 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.711 (2)", - "tab": "Efficiency", - "score": 0.3556434161790455 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=353.152, mean=353.152, max=353.152, sum=706.305 (2)", - "tab": "General information", - "score": 353.15246636771303 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=394.748, mean=394.748, max=394.748, sum=789.496 (2)", - "tab": "General information", - "score": 394.7480916030534 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.404, mean=0.404, 
max=0.404, sum=0.808 (2)", - "tab": "Efficiency", - "score": 0.40404871081517746 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=720.182, mean=720.182, max=720.182, sum=1440.364 (2)", - "tab": "General information", - "score": 720.1818181818181 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.818 (2)", - "tab": "Efficiency", - "score": 0.4088362228650988 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=486.779, mean=486.779, max=486.779, sum=973.558 (2)", - "tab": "General information", - "score": 486.77914110429447 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.125 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.802 (2)", - "tab": "Efficiency", - "score": 0.40122431090899874 - }, - "Machine Learning - # 
eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=734.83, mean=734.83, max=734.83, sum=1469.661 (2)", - "tab": "General information", - "score": 734.8303571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.709 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.228 (2)", - "tab": "Efficiency", - "score": 0.6141544730917922 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.359, mean=315.359, max=315.359, sum=630.718 (2)", - "tab": "General information", - "score": 315.3592233009709 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.897, - "details": { - "description": "min=0.897, mean=0.897, max=0.897, sum=1.795 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.928 (2)", - "tab": "Efficiency", - "score": 0.46382204895345575 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=463.423, mean=463.423, max=463.423, sum=926.846 (2)", - "tab": "General information", - "score": 463.4230769230769 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.48 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.867 (2)", - "tab": "Efficiency", - "score": 0.4333249735832214 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=405.71, mean=405.71, max=405.71, sum=811.42 (2)", - "tab": "General information", - "score": 405.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.955 (2)", - "tab": "Efficiency", - "score": 0.477321812323988 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # 
prompt tokens": { - "description": "min=348.519, mean=348.519, max=348.519, sum=697.037 (2)", - "tab": "General information", - "score": 348.51851851851853 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.158 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.995 (2)", - "tab": "Efficiency", - "score": 0.4974138419752176 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.902 (2)", - "tab": "Efficiency", - "score": 0.45121243466212096 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=540.038, mean=540.038, max=540.038, sum=1080.075 (2)", - "tab": "General information", - "score": 540.0375722543353 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=745.516, mean=745.516, max=745.516, sum=1491.032 (2)", - "tab": "General information", - "score": 745.5162011173185 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.927 (2)", - "tab": "Efficiency", - "score": 0.46336324308432786 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=680.69, mean=680.69, max=680.69, sum=1361.379 (2)", - "tab": "General information", - "score": 680.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.904, - "details": { - "description": "min=0.904, mean=0.904, max=0.904, sum=1.809 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.786 (2)", - "tab": "Efficiency", - "score": 0.3928193273367705 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=602.145, mean=602.145, max=602.145, sum=1204.29 (2)", - "tab": "General information", - "score": 602.145061728395 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.535, mean=0.535, 
max=0.535, sum=1.069 (2)", - "tab": "Efficiency", - "score": 0.534747780453075 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=462.036, mean=462.036, max=462.036, sum=924.073 (2)", - "tab": "General information", - "score": 462.03636363636366 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.649 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.891 (2)", - "tab": "Efficiency", - "score": 0.44565339964263295 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1315.865, mean=1315.865, max=1315.865, sum=2631.731 (2)", - "tab": "General information", - "score": 1315.865306122449 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.861 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.884 (2)", - "tab": "Efficiency", - "score": 0.44217372296461416 - }, - "Sociology - # eval": { - "description": "min=201, 
mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=487.95, mean=487.95, max=487.95, sum=975.9 (2)", - "tab": "General information", - "score": 487.9502487562189 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.108 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.871 (2)", - "tab": "Efficiency", - "score": 0.435666641557073 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=395.349, mean=395.349, max=395.349, sum=790.699 (2)", - "tab": "General information", - "score": 395.34939759036143 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.766 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.821 (2)", - "tab": "Efficiency", - "score": 0.4106302637802927 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": 
{ - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=308.924, mean=308.924, max=308.924, sum=617.848 (2)", - "tab": "General information", - "score": 308.92397660818716 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.464, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json b/data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json deleted file mode 100644 index 58aa6a379..000000000 --- a/data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-large-2407/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Large 2 2407", - "id": "mistralai/mistral-large-2407", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.37, mean=0.8, max=0.969, sum=91.197 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.672, mean=0.798, max=1.025, sum=90.977 (114)", - "tab": "Efficiency", - "score": 0.798047748433812 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=318.924, mean=706.273, max=3099.109, sum=80515.178 (114)", - "tab": "General information", - "score": 
706.2734899593811 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.732, mean=0.732, max=0.732, sum=1.464 (2)", - "tab": "Efficiency", - "score": 0.7317730689048767 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=412.44, mean=412.44, max=412.44, sum=824.88 (2)", - "tab": "General information", - "score": 412.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.57 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.616 (2)", - "tab": "Efficiency", - "score": 0.807829690862585 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=417.089, mean=417.089, max=417.089, sum=834.178 (2)", - "tab": "General information", - "score": 417.0888888888889 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, 
mean=0.559, max=0.559, sum=1.118 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.838, mean=0.838, max=0.838, sum=1.676 (2)", - "tab": "Efficiency", - "score": 0.8380094933509826 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.768, mean=0.768, max=0.768, sum=1.535 (2)", - "tab": "Efficiency", - "score": 0.76766570409139 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Efficiency", - "score": 0.8529829049110412 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.771, mean=0.771, max=0.771, sum=1.542 (2)", - "tab": "Efficiency", - "score": 0.7712302732467652 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.344 (2)", - "tab": "Efficiency", - "score": 0.6721915785287846 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=1.347 (2)", - "tab": "Efficiency", - "score": 0.6735490116418577 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=637.71, mean=637.71, max=637.71, sum=1275.42 (2)", - "tab": "General information", - "score": 637.71 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=560.799, mean=560.799, max=560.799, sum=1121.597 (2)", - "tab": "General information", - "score": 560.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=912.17, mean=912.17, max=912.17, sum=1824.34 (2)", - "tab": "General information", - "score": 912.17 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General 
information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=668.31, mean=668.31, max=668.31, sum=1336.62 (2)", - "tab": "General information", - "score": 668.31 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=602.41, mean=602.41, max=602.41, sum=1204.821 (2)", - "tab": "General information", - "score": 602.4104046242775 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=561.029, mean=561.029, max=561.029, sum=1122.059 (2)", - "tab": "General information", - "score": 561.0294117647059 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.933, mean=0.933, max=0.933, sum=1.866 (2)", - "tab": "Efficiency", - "score": 0.9331179332733154 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - 
# prompt tokens": { - "description": "min=434.94, mean=434.94, max=434.94, sum=869.88 (2)", - "tab": "General information", - "score": 434.94 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.386 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)", - "tab": "Efficiency", - "score": 0.6842389587770429 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=697.175, mean=697.175, max=697.175, sum=1394.351 (2)", - "tab": "General information", - "score": 697.1754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.489 (2)", - "tab": "Efficiency", - "score": 0.744694242477417 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=493.47, mean=493.47, max=493.47, sum=986.94 (2)", - "tab": "General information", - "score": 493.47 - }, - "Global Facts - # output tokens": { - "description": "min=1, 
mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.503 (2)", - "tab": "Efficiency", - "score": 0.751495877901713 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=461.093, mean=461.093, max=461.093, sum=922.185 (2)", - "tab": "General information", - "score": 461.0925925925926 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.609 (2)", - "tab": "Efficiency", - "score": 0.8043544453439988 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=383.82, mean=383.82, max=383.82, sum=767.64 (2)", - "tab": "General information", - "score": 383.81993569131834 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.481 (2)", - "tab": "Efficiency", - "score": 0.7406316355747335 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.615 (2)", - "tab": "Efficiency", - "score": 0.8074929325293142 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.774, mean=0.774, max=0.774, sum=1.548 (2)", - "tab": "Efficiency", - "score": 0.7742255473851847 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.846 (2)", - "tab": "Efficiency", - "score": 0.9228381756084417 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1289.143, mean=1289.143, max=1289.143, sum=2578.287 (2)", - "tab": "General information", - "score": 1289.1433823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=806.496, mean=806.496, max=806.496, sum=1612.993 (2)", - "tab": "General information", - "score": 806.4964539007092 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1859.711, mean=1859.711, max=1859.711, sum=3719.421 (2)", - "tab": "General 
information", - "score": 1859.7105606258149 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=655.278, mean=655.278, max=655.278, sum=1310.556 (2)", - "tab": "General information", - "score": 655.2777777777778 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.756, mean=0.756, max=0.756, sum=1.512 (2)", - "tab": "Efficiency", - "score": 0.7560967636108399 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=483.19, mean=483.19, max=483.19, sum=966.38 (2)", - "tab": "General information", - "score": 483.19 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.921, - "details": { - "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=1.025, mean=1.025, 
max=1.025, sum=2.049 (2)", - "tab": "Efficiency", - "score": 1.0245175393004167 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=675.987, mean=675.987, max=675.987, sum=1351.974 (2)", - "tab": "General information", - "score": 675.9868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.846 (2)", - "tab": "Efficiency", - "score": 0.9228822708129882 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=654.6, mean=654.6, max=654.6, sum=1309.2 (2)", - "tab": "General information", - "score": 654.6 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.578 (2)", - "tab": "Efficiency", - "score": 0.7888300931678628 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General 
information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=497.174, mean=497.174, max=497.174, sum=994.347 (2)", - "tab": "General information", - "score": 497.1735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.643 (2)", - "tab": "Efficiency", - "score": 0.8212997264050422 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=344.285, mean=344.285, max=344.285, sum=688.57 (2)", - "tab": "General information", - "score": 344.2851063829787 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.586 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Efficiency", - "score": 0.701846879104088 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=511.379, mean=511.379, max=511.379, sum=1022.759 (2)", - "tab": "General information", - "score": 511.37931034482756 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.799, - "details": { - "description": "min=0.799, mean=0.799, max=0.799, sum=1.598 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.577 (2)", - "tab": "Efficiency", - "score": 0.7884082762652604 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=623.386, mean=623.386, max=623.386, sum=1246.772 (2)", - "tab": "General information", - "score": 623.3862433862433 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.526 (2)", - "tab": "Efficiency", - "score": 0.7629275567947872 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, 
sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=728.984, mean=728.984, max=728.984, sum=1457.968 (2)", - "tab": "General information", - "score": 728.984126984127 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.685, mean=0.685, max=0.685, sum=1.371 (2)", - "tab": "Efficiency", - "score": 0.6854658296031336 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.756, mean=0.756, max=0.756, sum=1.513 (2)", - "tab": "Efficiency", - "score": 0.7563052259642502 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.435 (2)", - "tab": "Efficiency", - "score": 0.7174343037605285 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.767, mean=0.767, max=0.767, sum=1.535 (2)", - "tab": "Efficiency", - "score": 0.7674274748021906 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Efficiency", - "score": 0.6998175286283397 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Efficiency", - "score": 0.8241880792410262 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.863, mean=0.863, max=0.863, sum=1.726 (2)", - "tab": "Efficiency", - "score": 0.8630072312477307 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.815, mean=0.815, max=0.815, sum=1.631 (2)", - "tab": "Efficiency", - "score": 0.8153338502954554 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.818, mean=0.818, max=0.818, sum=1.637 (2)", - "tab": "Efficiency", - "score": 0.8183944405627852 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.659 (2)", - "tab": "Efficiency", - "score": 0.8296057877951111 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.818, mean=0.818, max=0.818, sum=1.636 (2)", - "tab": "Efficiency", - "score": 0.8179746304083308 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.775, mean=0.775, max=0.775, sum=1.55 (2)", - "tab": "Efficiency", - "score": 0.7749874878812719 - }, - 
"High School US History - Observed inference time (s)": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.486 (2)", - "tab": "Efficiency", - "score": 0.7428295682458317 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.832, mean=0.832, max=0.832, sum=1.663 (2)", - "tab": "Efficiency", - "score": 0.8316668367587061 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=610.561, mean=610.561, max=610.561, sum=1221.123 (2)", - "tab": "General information", - "score": 610.5612903225806 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=582.798, mean=582.798, max=582.798, sum=1165.596 (2)", - "tab": "General information", - "score": 582.7980295566502 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=998.24, mean=998.24, max=998.24, sum=1996.48 (2)", - "tab": "General information", - "score": 998.24 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3099.109, mean=3099.109, max=3099.109, sum=6198.218 (2)", - "tab": "General information", - "score": 3099.109090909091 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - 
"High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=439.207, mean=439.207, max=439.207, sum=878.414 (2)", - "tab": "General information", - "score": 439.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=524.808, mean=524.808, max=524.808, sum=1049.617 (2)", - "tab": "General information", - "score": 524.8082901554404 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=433.815, mean=433.815, max=433.815, sum=867.631 (2)", - "tab": "General information", - "score": 433.81538461538463 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=594.13, mean=594.13, max=594.13, sum=1188.259 (2)", - "tab": "General information", - "score": 594.1296296296297 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School 
Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=453.345, mean=453.345, max=453.345, sum=906.689 (2)", - "tab": "General information", - "score": 453.34453781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=632.775, mean=632.775, max=632.775, sum=1265.55 (2)", - "tab": "General information", - "score": 632.774834437086 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=568.873, mean=568.873, max=568.873, sum=1137.747 (2)", - "tab": "General information", - "score": 568.8733944954129 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=923.644, mean=923.644, max=923.644, sum=1847.287 (2)", - "tab": "General information", - "score": 923.6435185185185 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2487.446, mean=2487.446, max=2487.446, sum=4974.892 (2)", - "tab": "General information", - "score": 2487.4460784313724 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1595.553, mean=1595.553, max=1595.553, sum=3191.105 (2)", - "tab": "General information", - "score": 1595.5527426160338 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.847 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.819, mean=0.819, max=0.819, sum=1.639 (2)", - "tab": "Efficiency", - "score": 0.8192698356816587 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", - "tab": "Efficiency", - "score": 0.732998116325786 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=363.152, mean=363.152, max=363.152, sum=726.305 (2)", - "tab": "General information", - "score": 363.15246636771303 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=404.748, mean=404.748, max=404.748, sum=809.496 (2)", - "tab": "General information", - "score": 404.7480916030534 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.851 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.705 (2)", - "tab": "Efficiency", - "score": 0.8524710600041161 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=730.182, mean=730.182, max=730.182, sum=1460.364 (2)", - "tab": "General information", - "score": 730.1818181818181 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.693 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.944, mean=0.944, max=0.944, sum=1.887 (2)", - "tab": "Efficiency", - "score": 0.9436116130805454 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=496.779, mean=496.779, max=496.779, sum=993.558 (2)", - "tab": "General information", - "score": 496.77914110429447 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.661, mean=0.661, max=0.661, sum=1.321 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.874, mean=0.874, max=0.874, sum=1.748 (2)", - "tab": "Efficiency", - "score": 0.8740715363195964 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=744.83, mean=744.83, max=744.83, sum=1489.661 (2)", - "tab": "General information", - "score": 744.8303571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.767 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Efficiency", - "score": 0.7901336544925727 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=325.359, mean=325.359, max=325.359, sum=650.718 (2)", - "tab": "General information", - "score": 325.3592233009709 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": 
[ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.768 (2)", - "tab": "Efficiency", - "score": 0.88404920977405 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=473.423, mean=473.423, max=473.423, sum=946.846 (2)", - "tab": "General information", - "score": 473.4230769230769 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Efficiency", - "score": 0.7701838827133178 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=415.71, mean=415.71, max=415.71, sum=831.42 (2)", - "tab": "General information", - "score": 415.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.936, - "details": { - "description": "min=0.936, mean=0.936, max=0.936, sum=1.872 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.419 (2)", - "tab": "Efficiency", - "score": 0.7095236696045975 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=358.519, mean=358.519, max=358.519, sum=717.037 (2)", - "tab": "General information", - "score": 358.51851851851853 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.839, - "details": { - "description": "min=0.839, mean=0.839, max=0.839, sum=1.678 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.721 (2)", - "tab": "Efficiency", - "score": 0.8607459598883039 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.831, mean=0.831, max=0.831, sum=1.663 (2)", - "tab": "Efficiency", - "score": 0.8314023547998354 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=550.038, mean=550.038, max=550.038, sum=1100.075 (2)", - "tab": "General information", - "score": 550.0375722543353 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=755.516, mean=755.516, max=755.516, sum=1511.032 (2)", - 
"tab": "General information", - "score": 755.5162011173185 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827, - "details": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.654 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.632 (2)", - "tab": "Efficiency", - "score": 0.8157819338094175 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=690.69, mean=690.69, max=690.69, sum=1381.379 (2)", - "tab": "General information", - "score": 690.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.927, mean=0.927, max=0.927, sum=1.854 (2)", - "tab": "Efficiency", - "score": 0.9269687445075424 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=612.145, mean=612.145, max=612.145, sum=1224.29 (2)", - "tab": "General information", - "score": 612.145061728395 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.764, - "details": { - "description": "min=0.764, mean=0.764, max=0.764, sum=1.527 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Efficiency", - "score": 0.7498581886291504 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=472.036, mean=472.036, max=472.036, sum=944.073 (2)", - "tab": "General information", - "score": 472.03636363636366 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.459 (2)", - "tab": "Efficiency", - "score": 0.7295293778789287 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1325.865, mean=1325.865, max=1325.865, sum=2651.731 (2)", - "tab": "General information", - "score": 1325.865306122449 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.501 (2)", - "tab": "Efficiency", - "score": 0.750605917688626 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=497.95, mean=497.95, max=497.95, sum=995.9 (2)", - "tab": "General information", - "score": 497.9502487562189 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.181 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Efficiency", - "score": 0.8238025544637657 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=405.349, mean=405.349, max=405.349, sum=810.699 (2)", - "tab": "General information", - "score": 405.34939759036143 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.668 (2)", - "tab": "Efficiency", - "score": 0.8341451960000378 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=318.924, mean=318.924, max=318.924, sum=637.848 (2)", - "tab": "General information", - "score": 318.92397660818716 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.24, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json b/data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json deleted file mode 100644 index 457d9ed2a..000000000 --- a/data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-small-2402/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Small 2402", - "id": "mistralai/mistral-small-2402", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687, - "details": { - "description": "min=0.215, mean=0.687, max=0.948, sum=78.352 (114)", - "tab": 
"Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.279, mean=0.486, max=1.477, sum=55.362 (114)", - "tab": "Efficiency", - "score": 0.4856315259373381 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=308.924, mean=696.273, max=3089.109, sum=79375.178 (114)", - "tab": "General information", - "score": 696.2734899593811 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - 
"mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=1.31, mean=1.31, max=1.31, sum=2.621 (2)", - "tab": "Efficiency", - "score": 1.3102962040901185 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=402.44, mean=402.44, max=402.44, sum=804.88 (2)", - "tab": "General information", - "score": 402.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674, - "details": { - "description": "min=0.674, mean=0.674, max=0.674, sum=1.348 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.719 (2)", - "tab": "Efficiency", - "score": 0.35931493441263834 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=407.089, mean=407.089, max=407.089, sum=814.178 (2)", - "tab": "General information", - "score": 407.0888888888889 - }, - "Anatomy 
- # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402, - "details": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=1.152, mean=1.152, max=1.152, sum=2.304 (2)", - "tab": "Efficiency", - "score": 1.151910934448242 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.716 (2)", - "tab": "Efficiency", - "score": 0.3582056214412053 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.59 (2)", - "tab": "Efficiency", - "score": 0.29487616300582886 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.896 (2)", - "tab": "Efficiency", - "score": 0.44812692165374757 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.734 (2)", - "tab": "Efficiency", - "score": 0.3668311620723305 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.75 (2)", - "tab": "Efficiency", - "score": 0.37511497852849024 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=627.71, mean=627.71, max=627.71, sum=1255.42 (2)", - "tab": "General information", - "score": 627.71 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=550.799, mean=550.799, max=550.799, sum=1101.597 (2)", - "tab": "General information", - "score": 550.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, 
max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=902.17, mean=902.17, max=902.17, sum=1804.34 (2)", - "tab": "General information", - "score": 902.17 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=658.31, mean=658.31, max=658.31, sum=1316.62 (2)", - "tab": "General information", - "score": 658.31 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=592.41, mean=592.41, max=592.41, sum=1184.821 (2)", - "tab": "General information", - "score": 592.4104046242775 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=551.029, mean=551.029, max=551.029, sum=1102.059 (2)", - "tab": "General information", - "score": 551.0294117647059 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.022, mean=1.022, max=1.022, sum=2.044 (2)", - "tab": "Efficiency", - "score": 1.0222336649894714 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=424.94, mean=424.94, max=424.94, sum=849.88 (2)", - "tab": "General information", - "score": 424.94 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.228 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=1.477, mean=1.477, max=1.477, sum=2.954 (2)", - "tab": "Efficiency", - "score": 1.4771089867541665 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=687.175, mean=687.175, max=687.175, sum=1374.351 (2)", - "tab": "General information", - "score": 687.1754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45, - "details": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.9 (2)", - "tab": 
"Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.728 (2)", - "tab": "Efficiency", - "score": 0.36384799242019655 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=483.47, mean=483.47, max=483.47, sum=966.94 (2)", - "tab": "General information", - "score": 483.47 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.871 (2)", - "tab": "Efficiency", - "score": 0.4353830130011947 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=451.093, mean=451.093, max=451.093, sum=902.185 (2)", - "tab": "General information", - "score": 451.0925925925926 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.765, - "details": { - "description": "min=0.765, mean=0.765, max=0.765, sum=1.531 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.877 (2)", - "tab": "Efficiency", - "score": 0.43847233306173344 - }, - "Philosophy - # eval": { 
- "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=373.82, mean=373.82, max=373.82, sum=747.64 (2)", - "tab": "General information", - "score": 373.81993569131834 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=1.536 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.919 (2)", - "tab": "Efficiency", - "score": 0.45927367666188407 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.447, mean=0.447, max=0.447, sum=0.895 (2)", - "tab": "Efficiency", - "score": 0.447448378759073 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.407953996390998 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.839 (2)", - "tab": "Efficiency", - "score": 0.41963181386586107 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1279.143, mean=1279.143, max=1279.143, sum=2558.287 (2)", - "tab": "General information", - "score": 1279.1433823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=796.496, mean=796.496, max=796.496, 
sum=1592.993 (2)", - "tab": "General information", - "score": 796.4964539007092 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1849.711, mean=1849.711, max=1849.711, sum=3699.421 (2)", - "tab": "General information", - "score": 1849.7105606258149 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=645.278, mean=645.278, max=645.278, sum=1290.556 (2)", - "tab": "General information", - "score": 645.2777777777778 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=1.347, mean=1.347, max=1.347, sum=2.693 (2)", - "tab": "Efficiency", - "score": 1.3467011404037477 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=473.19, mean=473.19, max=473.19, sum=946.38 (2)", - "tab": "General information", - "score": 473.19 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.539 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.689 (2)", - "tab": "Efficiency", - "score": 0.3447367345031939 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=665.987, mean=665.987, max=665.987, sum=1331.974 (2)", - "tab": "General information", - "score": 665.9868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.9 (2)", - "tab": "Efficiency", - "score": 0.4499172067642212 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=644.6, mean=644.6, max=644.6, sum=1289.2 (2)", - "tab": "General information", - "score": 644.6 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.766, - "details": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.532 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.873 (2)", - "tab": "Efficiency", - "score": 0.4363225082181535 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.174, mean=487.174, max=487.174, sum=974.347 (2)", - "tab": "General information", - "score": 487.1735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.685, - "details": { - "description": "min=0.685, mean=0.685, max=0.685, sum=1.37 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.781 (2)", - "tab": "Efficiency", - "score": 0.3906106086487466 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=334.285, mean=334.285, max=334.285, sum=668.57 (2)", - "tab": "General information", - "score": 334.2851063829787 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.628, - "details": { - "description": "min=0.628, mean=0.628, max=0.628, sum=1.255 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Efficiency", - "score": 0.4342194343435353 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=501.379, mean=501.379, max=501.379, sum=1002.759 (2)", - "tab": "General information", - "score": 501.37931034482756 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415, - "details": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.831 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.43446689244931336 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=613.386, mean=613.386, max=613.386, sum=1226.772 (2)", - "tab": "General information", - "score": 613.3862433862433 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.516, - "details": { - "description": "min=0.516, mean=0.516, max=0.516, sum=1.032 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.4434795303950234 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=718.984, mean=718.984, max=718.984, sum=1437.968 (2)", - "tab": "General information", - "score": 718.984126984127 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.713 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.749 (2)", - "tab": "Efficiency", - "score": 0.3742693070442446 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.3839088602019061 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.4230046820640564 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.911 (2)", - "tab": "Efficiency", - "score": 0.4556852485194351 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.885 (2)", - "tab": "Efficiency", - "score": 0.44265695533367116 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.96 (2)", - "tab": "Efficiency", - "score": 0.47987033666106704 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.731 (2)", - "tab": "Efficiency", - "score": 0.3655165384977292 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.433, 
mean=0.433, max=0.433, sum=0.865 (2)", - "tab": "Efficiency", - "score": 0.4325918674468994 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.41513349929777515 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.834 (2)", - "tab": "Efficiency", - "score": 0.41723605496993915 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.896 (2)", - "tab": "Efficiency", - "score": 0.44808799017459977 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.805 (2)", - "tab": "Efficiency", - "score": 0.4024901666023113 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.767 (2)", - "tab": "Efficiency", - "score": 0.3834606175329171 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.399, mean=0.399, max=0.399, sum=0.798 (2)", - "tab": "Efficiency", - "score": 0.39886615648551327 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=600.561, mean=600.561, max=600.561, sum=1201.123 (2)", - "tab": "General information", - "score": 600.5612903225806 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=572.798, mean=572.798, max=572.798, sum=1145.596 (2)", - "tab": "General information", - "score": 572.7980295566502 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.24, mean=988.24, max=988.24, sum=1976.48 (2)", - "tab": "General information", - "score": 988.24 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3089.109, mean=3089.109, max=3089.109, sum=6178.218 (2)", - "tab": "General information", - "score": 3089.109090909091 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=429.207, mean=429.207, max=429.207, sum=858.414 (2)", - "tab": "General information", - "score": 429.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=514.808, mean=514.808, max=514.808, sum=1029.617 (2)", - "tab": "General information", - "score": 514.8082901554404 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=423.815, mean=423.815, max=423.815, sum=847.631 (2)", - "tab": "General information", - "score": 423.81538461538463 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=584.13, mean=584.13, max=584.13, sum=1168.259 (2)", - "tab": "General information", - "score": 584.1296296296297 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=443.345, mean=443.345, max=443.345, sum=886.689 (2)", - "tab": "General information", - "score": 443.34453781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=622.775, mean=622.775, max=622.775, sum=1245.55 (2)", - "tab": "General information", - "score": 622.774834437086 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=558.873, mean=558.873, max=558.873, sum=1117.747 (2)", - "tab": "General information", - "score": 558.8733944954129 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=913.644, mean=913.644, max=913.644, sum=1827.287 (2)", - "tab": "General information", - "score": 913.6435185185185 - }, - "High School 
Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2477.446, mean=2477.446, max=2477.446, sum=4954.892 (2)", - "tab": "General information", - "score": 2477.4460784313724 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1585.553, mean=1585.553, max=1585.553, sum=3171.105 (2)", - "tab": "General information", - "score": 1585.5527426160338 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.649 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.664 (2)", - "tab": "Efficiency", - "score": 0.33194801304907007 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.716 (2)", - "tab": "Efficiency", - "score": 0.3579711095067381 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=353.152, mean=353.152, max=353.152, sum=706.305 (2)", - "tab": "General information", - "score": 353.15246636771303 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=394.748, mean=394.748, max=394.748, sum=789.496 (2)", - "tab": "General information", - "score": 394.7480916030534 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.755 (2)", - "tab": "Efficiency", - "score": 0.37766425668700665 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=720.182, mean=720.182, max=720.182, sum=1440.364 (2)", - "tab": "General information", - "score": 720.1818181818181 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.607 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.781 (2)", - "tab": "Efficiency", - "score": 0.3902764905449803 - }, - "Logical Fallacies - # eval": { - "description": 
"min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=486.779, mean=486.779, max=486.779, sum=973.558 (2)", - "tab": "General information", - "score": 486.77914110429447 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.125 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.785 (2)", - "tab": "Efficiency", - "score": 0.3927395024469921 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=734.83, mean=734.83, max=734.83, sum=1469.661 (2)", - "tab": "General information", - "score": 734.8303571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.573 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.035 (2)", - "tab": "Efficiency", - "score": 0.5177000564278909 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.359, mean=315.359, max=315.359, sum=630.718 (2)", - "tab": "General information", - "score": 315.3592233009709 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.906, - "details": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.812 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.85 (2)", - "tab": "Efficiency", - "score": 0.42478426195617414 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=463.423, mean=463.423, max=463.423, sum=926.846 (2)", - "tab": "General information", - "score": 463.4230769230769 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.557 (2)", - "tab": "Efficiency", - "score": 0.2786110520362854 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"Medical Genetics - # prompt tokens": { - "description": "min=405.71, mean=405.71, max=405.71, sum=811.42 (2)", - "tab": "General information", - "score": 405.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.844, - "details": { - "description": "min=0.844, mean=0.844, max=0.844, sum=1.688 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Efficiency", - "score": 0.3998657326436439 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=348.519, mean=348.519, max=348.519, sum=697.037 (2)", - "tab": "General information", - "score": 348.51851851851853 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.575, - "details": { - "description": "min=0.575, mean=0.575, max=0.575, sum=1.151 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.949 (2)", - "tab": "Efficiency", - "score": 0.4744071271378181 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.799 (2)", - "tab": "Efficiency", - "score": 0.39967524166213736 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral 
- "Moral Disputes - # prompt tokens": {
- "description": "min=540.038, mean=540.038, max=540.038, sum=1080.075 (2)",
- "tab": "General information",
- "score": 540.0375722543353
- },
- "Moral Disputes - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "Moral Scenarios - # eval": {
- "description": "min=895, mean=895, max=895, sum=1790 (2)",
- "tab": "General information",
- "score": 895.0
- },
- "Moral Scenarios - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Moral Scenarios - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Moral Scenarios - # prompt tokens": {
- "description": "min=745.516, mean=745.516, max=745.516, sum=1491.032 (2)",
- "tab": "General information",
- "score": 745.5162011173185
- },
- "Moral Scenarios - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "moral_scenarios",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_moral_scenarios"
- }
- }
- },
- {
- "evaluation_name": "Nutrition",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Nutrition",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.761,
- "details": {
- "description": "min=0.761, mean=0.761, max=0.761, sum=1.523 (2)",
- "tab": "Accuracy",
- "Nutrition - Observed inference time (s)": {
- "description": "min=0.421, mean=0.421, max=0.421, sum=0.843 (2)",
- "tab": "Efficiency",
- "score": 0.42128828927582385
- },
- "Nutrition - # eval": {
- "description": "min=306, mean=306, max=306, sum=612 (2)",
- "tab": "General information",
- "score": 306.0
- },
- "Nutrition - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Nutrition - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Nutrition - # prompt tokens": {
- "description": "min=680.69, mean=680.69, max=680.69, sum=1361.379 (2)",
- "tab": "General information",
- "score": 680.6895424836601
- },
- "Nutrition - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "nutrition",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_nutrition"
- }
- }
- },
- {
- "evaluation_name": "Prehistory",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Prehistory",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.802,
- "details": {
- "description": "min=0.802, mean=0.802, max=0.802, sum=1.605 (2)",
- "tab": "Accuracy",
- "Prehistory - Observed inference time (s)": {
- "description": "min=0.438, mean=0.438, max=0.438, sum=0.875 (2)",
- "tab": "Efficiency",
- "score": 0.43764398863286147
- },
- "Prehistory - # eval": {
- "description": "min=324, mean=324, max=324, sum=648 (2)",
- "tab": "General information",
- "score": 324.0
- },
- "Prehistory - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Prehistory - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Prehistory - # prompt tokens": {
- "description": "min=602.145, mean=602.145, max=602.145, sum=1204.29 (2)",
- "tab": "General information",
- "score": 602.145061728395
- },
- "Prehistory - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "prehistory",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_prehistory"
- }
- }
- },
- {
- "evaluation_name": "Public Relations",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Public Relations",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.773,
- "details": {
- "description": "min=0.773, mean=0.773, max=0.773, sum=1.545 (2)",
- "tab": "Accuracy",
- "Public Relations - Observed inference time (s)": {
- "description": "min=0.464, mean=0.464, max=0.464, sum=0.929 (2)",
- "tab": "Efficiency",
- "score": 0.464488469470631
- },
- "Public Relations - # eval": {
- "description": "min=110, mean=110, max=110, sum=220 (2)",
- "tab": "General information",
- "score": 110.0
- },
- "Public Relations - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Public Relations - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Public Relations - # prompt tokens": {
- "description": "min=462.036, mean=462.036, max=462.036, sum=924.073 (2)",
- "tab": "General information",
- "score": 462.03636363636366
- },
- "Public Relations - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "public_relations",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_public_relations"
- }
- }
- },
- {
- "evaluation_name": "Security Studies",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Security Studies",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.788,
- "details": {
- "description": "min=0.788, mean=0.788, max=0.788, sum=1.576 (2)",
- "tab": "Accuracy",
- "Security Studies - Observed inference time (s)": {
- "description": "min=0.431, mean=0.431, max=0.431, sum=0.862 (2)",
- "tab": "Efficiency",
- "score": 0.43111481179996414
- },
- "Security Studies - # eval": {
- "description": "min=245, mean=245, max=245, sum=490 (2)",
- "tab": "General information",
- "score": 245.0
- },
- "Security Studies - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Security Studies - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Security Studies - # prompt tokens": {
- "description": "min=1315.865, mean=1315.865, max=1315.865, sum=2631.731 (2)",
- "tab": "General information",
- "score": 1315.865306122449
- },
- "Security Studies - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "security_studies",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_security_studies"
- }
- }
- },
- {
- "evaluation_name": "Sociology",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Sociology",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.871,
- "details": {
- "description": "min=0.871, mean=0.871, max=0.871, sum=1.741 (2)",
- "tab": "Accuracy",
- "Sociology - Observed inference time (s)": {
- "description": "min=0.399, mean=0.399, max=0.399, sum=0.799 (2)",
- "tab": "Efficiency",
- "score": 0.3994969099908326
- },
- "Sociology - # eval": {
- "description": "min=201, mean=201, max=201, sum=402 (2)",
- "tab": "General information",
- "score": 201.0
- },
- "Sociology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Sociology - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Sociology - # prompt tokens": {
- "description": "min=487.95, mean=487.95, max=487.95, sum=975.9 (2)",
- "tab": "General information",
- "score": 487.9502487562189
- },
- "Sociology - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "sociology",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_sociology"
- }
- }
- },
- {
- "evaluation_name": "Virology",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Virology",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.542,
- "details": {
- "description": "min=0.542, mean=0.542, max=0.542, sum=1.084 (2)",
- "tab": "Accuracy",
- "Virology - Observed inference time (s)": {
- "description": "min=0.538, mean=0.538, max=0.538, sum=1.076 (2)",
- "tab": "Efficiency",
- "score": 0.5377652975450079
- },
- "Virology - # eval": {
- "description": "min=166, mean=166, max=166, sum=332 (2)",
- "tab": "General information",
- "score": 166.0
- },
- "Virology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Virology - truncated": {
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=395.349, mean=395.349, max=395.349, sum=790.699 (2)", - "tab": "General information", - "score": 395.34939759036143 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.404, mean=0.404, max=0.404, sum=0.809 (2)", - "tab": "Efficiency", - "score": 0.4042932554992319 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=308.924, mean=308.924, max=308.924, sum=617.848 (2)", - "tab": "General information", - "score": 308.92397660818716 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json b/data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json deleted file mode 100644 index c7ab33c35..000000000 --- a/data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mixtral-8x22b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": 
"documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral 8x22B", - "id": "mistralai/mixtral-8x22b", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.463, mean=0.778, max=0.974, sum=88.715 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.284, mean=0.555, max=4.852, sum=63.286 (114)", - "tab": "Efficiency", - "score": 0.5551394123775506 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=308.924, mean=696.273, max=3089.109, sum=79375.178 (114)", - "tab": "General information", - "score": 696.2734899593811 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - 
"mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.96 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.626 (2)", - "tab": "Efficiency", - "score": 0.31304038524627686 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=402.44, mean=402.44, max=402.44, sum=804.88 (2)", - "tab": "General information", - "score": 402.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - 
"description": "min=0.741, mean=0.741, max=0.741, sum=1.481 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.662 (2)", - "tab": "Efficiency", - "score": 0.3308721118503147 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=407.089, mean=407.089, max=407.089, sum=814.178 (2)", - "tab": "General information", - "score": 407.0888888888889 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.569, - "details": { - "description": "min=0.569, mean=0.569, max=0.569, sum=1.137 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.716 (2)", - "tab": "Efficiency", - "score": 0.35782508373260496 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.33555712799231213 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.819 (2)", - "tab": "Efficiency", - "score": 0.40926079750061034 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.765 (2)", - "tab": "Efficiency", - "score": 0.3824312686920166 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.33573296993454066 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.694 (2)", - "tab": "Efficiency", - "score": 0.34694373841379206 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=627.71, mean=627.71, max=627.71, sum=1255.42 (2)", - "tab": "General information", - "score": 627.71 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=550.799, mean=550.799, max=550.799, sum=1101.597 (2)", - "tab": "General information", - "score": 550.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=902.17, mean=902.17, max=902.17, sum=1804.34 (2)", - "tab": "General information", - "score": 902.17 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=658.31, mean=658.31, max=658.31, sum=1316.62 (2)", - "tab": "General information", - "score": 658.31 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=592.41, mean=592.41, max=592.41, sum=1184.821 (2)", - "tab": "General information", - "score": 592.4104046242775 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": 
"min=551.029, mean=551.029, max=551.029, sum=1102.059 (2)", - "tab": "General information", - "score": 551.0294117647059 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.689 (2)", - "tab": "Efficiency", - "score": 0.3443935012817383 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=424.94, mean=424.94, max=424.94, sum=849.88 (2)", - "tab": "General information", - "score": 424.94 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.719 (2)", - "tab": "Efficiency", - "score": 0.359416033092298 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=687.175, mean=687.175, max=687.175, sum=1374.351 (2)", - "tab": "General information", - "score": 687.1754385964912 - }, - "Econometrics - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.699 (2)", - "tab": "Efficiency", - "score": 0.34949236392974853 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=483.47, mean=483.47, max=483.47, sum=966.94 (2)", - "tab": "General information", - "score": 483.47 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.30799298153983223 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=451.093, mean=451.093, max=451.093, sum=902.185 (2)", - "tab": "General information", - "score": 451.0925925925926 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.685 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.4229524595561135 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=373.82, mean=373.82, max=373.82, sum=747.64 (2)", - "tab": "General information", - "score": 373.81993569131834 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.69 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.461, mean=0.461, max=0.461, sum=0.921 (2)", - "tab": "Efficiency", - "score": 0.4606352711425108 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Efficiency", - "score": 0.3601941665013631 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.156 (2)", - "tab": "Efficiency", - "score": 0.5780843218115815 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.718 (2)", - "tab": "Efficiency", - "score": 0.3589704905460083 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt 
tokens": { - "description": "min=1279.143, mean=1279.143, max=1279.143, sum=2558.287 (2)", - "tab": "General information", - "score": 1279.1433823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=796.496, mean=796.496, max=796.496, sum=1592.993 (2)", - "tab": "General information", - "score": 796.4964539007092 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1849.711, mean=1849.711, max=1849.711, sum=3699.421 (2)", - "tab": "General information", - "score": 1849.7105606258149 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=645.278, mean=645.278, max=645.278, sum=1290.556 (2)", - "tab": "General information", - "score": 645.2777777777778 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, 
sum=0.696 (2)", - "tab": "Efficiency", - "score": 0.3477613878250122 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=473.19, mean=473.19, max=473.19, sum=946.38 (2)", - "tab": "General information", - "score": 473.19 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.763 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.694 (2)", - "tab": "Efficiency", - "score": 0.34718117117881775 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=665.987, mean=665.987, max=665.987, sum=1331.974 (2)", - "tab": "General information", - "score": 665.9868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.48 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.345 (2)", - "tab": "Efficiency", - "score": 0.6724735307693481 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 
- },
- "Business Ethics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Business Ethics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Business Ethics - # prompt tokens": {
- "description": "min=644.6, mean=644.6, max=644.6, sum=1289.2 (2)",
- "tab": "General information",
- "score": 644.6
- },
- "Business Ethics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "business_ethics",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_business_ethics"
- }
- }
- },
- {
- "evaluation_name": "Clinical Knowledge",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Clinical Knowledge",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.819,
- "details": {
- "description": "min=0.819, mean=0.819, max=0.819, sum=1.638 (2)",
- "tab": "Accuracy",
- "Clinical Knowledge - Observed inference time (s)": {
- "description": "min=0.476, mean=0.476, max=0.476, sum=0.953 (2)",
- "tab": "Efficiency",
- "score": 0.4764475804454875
- },
- "Clinical Knowledge - # eval": {
- "description": "min=265, mean=265, max=265, sum=530 (2)",
- "tab": "General information",
- "score": 265.0
- },
- "Clinical Knowledge - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Clinical Knowledge - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Clinical Knowledge - # prompt tokens": {
- "description": "min=487.174, mean=487.174, max=487.174, sum=974.347 (2)",
- "tab": "General information",
- "score": 487.1735849056604
- },
- "Clinical Knowledge - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "clinical_knowledge",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_clinical_knowledge"
- }
- }
- },
- {
- "evaluation_name": "Conceptual Physics",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Conceptual Physics",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.796,
- "details": {
- "description": "min=0.796, mean=0.796, max=0.796, sum=1.591 (2)",
- "tab": "Accuracy",
- "Conceptual Physics - Observed inference time (s)": {
- "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)",
- "tab": "Efficiency",
- "score": 0.3271778279162468
- },
- "Conceptual Physics - # eval": {
- "description": "min=235, mean=235, max=235, sum=470 (2)",
- "tab": "General information",
- "score": 235.0
- },
- "Conceptual Physics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=334.285, mean=334.285, max=334.285, sum=668.57 (2)", - "tab": "General information", - "score": 334.2851063829787 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.766, - "details": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.531 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.158 (2)", - "tab": "Efficiency", - "score": 0.5787854655035611 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=501.379, mean=501.379, max=501.379, sum=1002.759 (2)", - "tab": "General information", - "score": 501.37931034482756 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.622, - "details": { - "description": "min=0.622, mean=0.622, max=0.622, sum=1.243 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=4.852, mean=4.852, max=4.852, sum=9.703 (2)", - "tab": "Efficiency", - "score": 4.851643589438584 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 
- },
- "Elementary Mathematics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Elementary Mathematics - # prompt tokens": {
- "description": "min=613.386, mean=613.386, max=613.386, sum=1226.772 (2)",
- "tab": "General information",
- "score": 613.3862433862433
- },
- "Elementary Mathematics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "elementary_mathematics",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_elementary_mathematics"
- }
- }
- },
- {
- "evaluation_name": "Formal Logic",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Formal Logic",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.627,
- "details": {
- "description": "min=0.627, mean=0.627, max=0.627, sum=1.254 (2)",
- "tab": "Accuracy",
- "Formal Logic - Observed inference time (s)": {
- "description": "min=4.348, mean=4.348, max=4.348, sum=8.696 (2)",
- "tab": "Efficiency",
- "score": 4.34797261631678
- },
- "Formal Logic - # eval": {
- "description": "min=126, mean=126, max=126, sum=252 (2)",
- "tab": "General information",
- "score": 126.0
- },
- "Formal Logic - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Formal Logic - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Formal Logic - # prompt tokens": {
- "description": "min=718.984, mean=718.984, max=718.984, sum=1437.968 (2)",
- "tab": "General information",
- "score": 718.984126984127
- },
- "Formal Logic - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "formal_logic",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_formal_logic"
- }
- }
- },
- {
- "evaluation_name": "High School World History",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on High School World History",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.895,
- "details": {
- "description": "min=0.895, mean=0.895, max=0.895, sum=1.789 (2)",
- "tab": "Accuracy",
- "High School Biology - Observed inference time (s)": {
- "description": "min=0.306, mean=0.306, max=0.306, sum=0.613 (2)",
- "tab": "Efficiency",
- "score": 0.30645533454033635
- },
- "High School Chemistry - Observed inference time (s)": {
- "description": "min=0.362, mean=0.362, max=0.362, sum=0.724 (2)",
- "tab": "Efficiency",
- "score": 0.3618842803785954
- },
- "High School Computer Science - Observed inference time (s)": {
- "description": "min=0.432, mean=0.432, max=0.432, sum=0.864 (2)",
- "tab": "Efficiency",
- "score": 0.43201621770858767
- },
- "High School European History - Observed inference time (s)": {
- "description": "min=0.874, mean=0.874, max=0.874, sum=1.747 (2)",
- "tab": "Efficiency",
- "score": 0.8736377629366788
- },
- "High School Geography - Observed inference time (s)": {
- "description": "min=0.373, mean=0.373, max=0.373, sum=0.746 (2)",
- "tab": "Efficiency",
- "score": 0.3727773331632518
- },
- "High School Government And Politics - Observed inference time (s)": {
- "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)",
- "tab": "Efficiency",
- "score": 0.380075985903567
- },
- "High School Macroeconomics - Observed inference time (s)": {
- "description": "min=0.313, mean=0.313, max=0.313, sum=0.626 (2)",
- "tab": "Efficiency",
- "score": 0.3130294726445125
- },
- "High School Mathematics - Observed inference time (s)": {
- "description": "min=0.454, mean=0.454, max=0.454, sum=0.909 (2)",
- "tab": "Efficiency",
- "score": 0.4543530375869186
- },
- "High School Microeconomics - Observed inference time (s)": {
- "description": "min=0.475, mean=0.475, max=0.475, sum=0.95 (2)",
- "tab": "Efficiency",
- "score": 0.4752031294237666
- },
- "High School Physics - Observed inference time (s)": {
- "description": "min=0.341, mean=0.341, max=0.341, sum=0.683 (2)",
- "tab": "Efficiency",
- "score": 0.3413255830474247
- },
- "High School Psychology - Observed inference time (s)": {
- "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)",
- "tab": "Efficiency",
- "score": 0.32982436013877936
- },
- "High School Statistics - Observed inference time (s)": {
- "description": "min=0.406, mean=0.406, max=0.406, sum=0.812 (2)",
- "tab": "Efficiency",
- "score": 0.4059625698460473
- },
- "High School US History - Observed inference time (s)": {
- "description": "min=0.744, mean=0.744, max=0.744, sum=1.488 (2)",
- "tab": "Efficiency",
- "score": 0.7440984506233066
- },
- "High School World History - Observed inference time (s)": {
- "description": "min=0.521, mean=0.521, max=0.521, sum=1.043 (2)",
- "tab": "Efficiency",
- "score": 0.5214709360388261
- },
- "High School Biology - # eval": {
- "description": "min=310, mean=310, max=310, sum=620 (2)",
- "tab": "General information",
- "score": 310.0
- },
- "High School Biology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Biology - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Biology - # prompt tokens": {
- "description": "min=600.561, mean=600.561, max=600.561, sum=1201.123 (2)",
- "tab": "General information",
- "score": 600.5612903225806
- },
- "High School Biology - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Chemistry - # eval": {
- "description": "min=203, mean=203, max=203, sum=406 (2)",
- "tab": "General information",
- "score": 203.0
- },
- "High School Chemistry - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Chemistry - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Chemistry - # prompt tokens": {
- "description": "min=572.798, mean=572.798, max=572.798, sum=1145.596 (2)",
- "tab": "General information",
- "score": 572.7980295566502
- },
- "High School Chemistry - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
"tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.24, mean=988.24, max=988.24, sum=1976.48 (2)", - "tab": "General information", - "score": 988.24 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3089.109, mean=3089.109, max=3089.109, sum=6178.218 (2)", - "tab": "General information", - "score": 3089.109090909091 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=429.207, mean=429.207, max=429.207, sum=858.414 (2)", - "tab": "General information", - "score": 429.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=514.808, mean=514.808, max=514.808, sum=1029.617 (2)", - "tab": "General information", - "score": 514.8082901554404 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=423.815, mean=423.815, max=423.815, sum=847.631 (2)", - "tab": "General information", - "score": 423.81538461538463 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=584.13, mean=584.13, max=584.13, sum=1168.259 (2)", - "tab": "General information", - "score": 584.1296296296297 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=443.345, mean=443.345, max=443.345, sum=886.689 (2)", - "tab": "General information", - "score": 443.34453781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=622.775, mean=622.775, max=622.775, sum=1245.55 (2)", - "tab": "General information", - "score": 622.774834437086 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=558.873, mean=558.873, max=558.873, sum=1117.747 (2)", - "tab": "General information", - "score": 
558.8733944954129 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=913.644, mean=913.644, max=913.644, sum=1827.287 (2)", - "tab": "General information", - "score": 913.6435185185185 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2477.446, mean=2477.446, max=2477.446, sum=4954.892 (2)", - "tab": "General information", - "score": 2477.4460784313724 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1585.553, mean=1585.553, max=1585.553, sum=3171.105 (2)", - "tab": "General information", - "score": 1585.5527426160338 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.771 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.3039867247166655 - }, - "Human Sexuality - 
Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.627 (2)", - "tab": "Efficiency", - "score": 0.3133269229918036 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=353.152, mean=353.152, max=353.152, sum=706.305 (2)", - "tab": "General information", - "score": 353.15246636771303 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=394.748, mean=394.748, max=394.748, sum=789.496 (2)", - "tab": "General information", - "score": 394.7480916030534 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.691 (2)", - "tab": "Efficiency", - "score": 0.34560450049471264 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=720.182, mean=720.182, max=720.182, sum=1440.364 (2)", - "tab": "General information", - "score": 720.1818181818181 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" 
- } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.755 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.713 (2)", - "tab": "Efficiency", - "score": 0.35657415360760836 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=486.779, mean=486.779, max=486.779, sum=973.558 (2)", - "tab": "General information", - "score": 486.77914110429447 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.661, mean=0.661, max=0.661, sum=1.321 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.751 (2)", - "tab": "Efficiency", - "score": 0.37532309123447966 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=734.83, mean=734.83, max=734.83, sum=1469.661 (2)", - "tab": "General information", - "score": 734.8303571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.767 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.567 (2)", - "tab": "Efficiency", - "score": 0.2837195535307949 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.359, mean=315.359, max=315.359, sum=630.718 (2)", - "tab": "General information", - "score": 315.3592233009709 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "details": { - "description": "min=0.915, mean=0.915, max=0.915, sum=1.829 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.955 (2)", - "tab": "Efficiency", - "score": 0.47738775534507555 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=463.423, mean=463.423, max=463.423, sum=926.846 (2)", - "tab": "General information", - "score": 463.4230769230769 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.715 (2)", - "tab": "Efficiency", - "score": 0.35768274068832395 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=405.71, mean=405.71, max=405.71, sum=811.42 (2)", - "tab": "General information", - "score": 405.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.899, - "details": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.798 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.29965735912931984 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=348.519, mean=348.519, max=348.519, sum=697.037 (2)", - "tab": "General information", - "score": 348.51851851851853 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.646, - "details": { - "description": 
"min=0.646, mean=0.646, max=0.646, sum=1.292 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.87 (2)", - "tab": "Efficiency", - "score": 0.43506465757513324 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.729 (2)", - "tab": "Efficiency", - "score": 0.36451081030861626 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=540.038, mean=540.038, max=540.038, sum=1080.075 (2)", - "tab": "General information", - "score": 540.0375722543353 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=745.516, mean=745.516, max=745.516, sum=1491.032 (2)", - "tab": "General information", - "score": 745.5162011173185 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.866, - "details": { - "description": "min=0.866, mean=0.866, max=0.866, sum=1.732 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.916 (2)", - "tab": "Efficiency", - "score": 0.4579993447447135 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=680.69, mean=680.69, max=680.69, sum=1361.379 (2)", - "tab": "General information", - "score": 680.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.4504210890075307 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=602.145, mean=602.145, max=602.145, sum=1204.29 (2)", - "tab": "General information", - "score": 602.145061728395 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.302 (2)", - "tab": "Efficiency", - "score": 0.6507512135939164 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=462.036, mean=462.036, max=462.036, sum=924.073 (2)", - "tab": "General information", - "score": 462.03636363636366 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.919 (2)", - "tab": "Efficiency", - "score": 0.4596467952339017 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1315.865, mean=1315.865, max=1315.865, sum=2631.731 (2)", - "tab": "General information", - "score": 1315.865306122449 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.841 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.495, mean=0.495, max=0.495, sum=0.989 (2)", - "tab": "Efficiency", - "score": 0.4945164248717958 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=487.95, mean=487.95, max=487.95, sum=975.9 (2)", - "tab": "General information", - "score": 487.9502487562189 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.3041278597820236 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=395.349, mean=395.349, max=395.349, sum=790.699 (2)", - "tab": "General information", - "score": 395.34939759036143 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.801 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.29729281252587747 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=308.924, mean=308.924, max=308.924, sum=617.848 (2)", - "tab": "General information", - "score": 308.92397660818716 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model 
outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -}
\ No newline at end of file
diff --git a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json b/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json
deleted file mode 100644
index 3ed7c6104..000000000
--- a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mixtral-8x7b-32kseqlen/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral 8x7B 32K seqlen", - "id": "mistralai/mixtral-8x7b-32kseqlen", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.38, mean=0.717, max=0.933, sum=81.767 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.303, mean=0.364, max=0.667, sum=41.491 (114)", - "tab": "Efficiency", - "score": 0.36396022974729103 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=308.924, mean=696.273, max=3089.109, sum=79375.178 (114)", - "tab": "General information", - "score": 696.2734899593811 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", -
"high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3324201321601868 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # 
prompt tokens": { - "description": "min=402.44, mean=402.44, max=402.44, sum=804.88 (2)", - "tab": "General information", - "score": 402.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.393 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.676 (2)", - "tab": "Efficiency", - "score": 0.33777406480577254 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=407.089, mean=407.089, max=407.089, sum=814.178 (2)", - "tab": "General information", - "score": 407.0888888888889 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.773 (2)", - "tab": "Efficiency", - "score": 0.386492395401001 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.733 (2)", - "tab": "Efficiency", - "score": 0.3663763701915741 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.735 (2)", - "tab": "Efficiency", - "score": 0.36740577936172486 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.712 (2)", - "tab": "Efficiency", - "score": 0.35591145277023317 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.695 (2)", - "tab": 
"Efficiency", - "score": 0.347429724787012 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.361, mean=0.361, max=0.361, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.3606654686086318 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=627.71, mean=627.71, max=627.71, sum=1255.42 (2)", - "tab": "General information", - "score": 627.71 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=550.799, mean=550.799, max=550.799, sum=1101.597 (2)", - "tab": "General information", - "score": 550.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=902.17, mean=902.17, max=902.17, sum=1804.34 (2)", - "tab": "General information", - "score": 902.17 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=658.31, mean=658.31, max=658.31, sum=1316.62 (2)", - "tab": "General information", - "score": 658.31 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=592.41, mean=592.41, max=592.41, sum=1184.821 (2)", - "tab": "General information", - "score": 592.4104046242775 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=551.029, mean=551.029, max=551.029, sum=1102.059 (2)", - "tab": "General information", - "score": 551.0294117647059 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.34211899518966676 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=424.94, mean=424.94, max=424.94, sum=849.88 (2)", - "tab": "General information", - "score": 424.94 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.605, - "details": { - "description": "min=0.605, mean=0.605, max=0.605, sum=1.211 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.708 (2)", - "tab": "Efficiency", - "score": 0.3541024630529839 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=687.175, mean=687.175, max=687.175, sum=1374.351 (2)", - "tab": "General information", - "score": 687.1754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.335 (2)", - "tab": "Efficiency", - "score": 0.667280240058899 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=483.47, mean=483.47, max=483.47, sum=966.94 (2)", - "tab": "General information", - "score": 483.47 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.338, 
mean=0.338, max=0.338, sum=0.677 (2)", - "tab": "Efficiency", - "score": 0.3384844925668504 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=451.093, mean=451.093, max=451.093, sum=902.185 (2)", - "tab": "General information", - "score": 451.0925925925926 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.797, - "details": { - "description": "min=0.797, mean=0.797, max=0.797, sum=1.595 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.322712682067773 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=373.82, mean=373.82, max=373.82, sum=747.64 (2)", - "tab": "General information", - "score": 373.81993569131834 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.559 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.832 (2)", - "tab": "Efficiency", - "score": 0.41612808669314666 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.346, 
mean=0.346, max=0.346, sum=0.691 (2)", - "tab": "Efficiency", - "score": 0.34556762884694636 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.879 (2)", - "tab": "Efficiency", - "score": 0.4395133182309286 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.3276863078665889 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1279.143, mean=1279.143, max=1279.143, sum=2558.287 (2)", - "tab": "General information", - "score": 1279.1433823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=796.496, mean=796.496, max=796.496, sum=1592.993 (2)", - "tab": "General information", - "score": 796.4964539007092 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1849.711, mean=1849.711, max=1849.711, sum=3699.421 (2)", - "tab": "General information", - "score": 1849.7105606258149 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=645.278, mean=645.278, max=645.278, sum=1290.556 (2)", - "tab": "General information", - "score": 645.2777777777778 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.637 (2)", - "tab": "Efficiency", - "score": 0.3183705282211304 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=473.19, mean=473.19, max=473.19, sum=946.38 (2)", - "tab": "General information", - "score": 473.19 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.658 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.36493434560926336 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=665.987, mean=665.987, max=665.987, sum=1331.974 (2)", - "tab": "General information", - "score": 665.9868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.44 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.3650094985961914 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=644.6, mean=644.6, max=644.6, sum=1289.2 (2)", - "tab": "General information", - "score": 644.6 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.57 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.33542148392155485 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.174, mean=487.174, max=487.174, sum=974.347 (2)", - "tab": "General information", - "score": 487.1735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.681, - "details": { - "description": "min=0.681, mean=0.681, max=0.681, sum=1.362 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.3393338994776949 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=334.285, mean=334.285, max=334.285, sum=668.57 (2)", - "tab": "General information", - "score": 334.2851063829787 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.676, - "details": { - "description": "min=0.676, mean=0.676, max=0.676, sum=1.352 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.35104844159093396 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=501.379, mean=501.379, max=501.379, sum=1002.759 (2)", - "tab": "General information", - "score": 501.37931034482756 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.476, - "details": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.86 (2)", - "tab": "Efficiency", - "score": 0.4298846198137475 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=613.386, mean=613.386, max=613.386, sum=1226.772 (2)", - "tab": "General information", - "score": 613.3862433862433 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532, - "details": { - "description": "min=0.532, mean=0.532, max=0.532, sum=1.063 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.741 (2)", - "tab": "Efficiency", - "score": 0.37032828255305217 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=718.984, mean=718.984, max=718.984, sum=1437.968 (2)", - "tab": "General information", - "score": 718.984126984127 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.657 (2)", - "tab": "Efficiency", - "score": 0.3284358686016452 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.652 (2)", - "tab": "Efficiency", - "score": 0.32620196624342446 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.969 (2)", - "tab": "Efficiency", - "score": 0.48452038288116456 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.641, mean=0.641, max=0.641, sum=1.283 (2)", - "tab": "Efficiency", - "score": 0.6413424491882325 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.653 (2)", - "tab": "Efficiency", - "score": 0.3266212759595929 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.675 (2)", - "tab": "Efficiency", - "score": 0.33742881191826857 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)", - "tab": "Efficiency", - "score": 0.3271804552811843 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.3277335458331638 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.3291829443779312 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.33715188266425733 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.792 (2)", - "tab": "Efficiency", - "score": 0.39586829351722647 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.753 (2)", - "tab": "Efficiency", - "score": 0.37643481846208926 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.531, mean=0.531, max=0.531, sum=1.062 (2)", - "tab": "Efficiency", - "score": 0.531247288573022 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Efficiency", - "score": 0.44013202341297003 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # 
prompt tokens": { - "description": "min=600.561, mean=600.561, max=600.561, sum=1201.123 (2)", - "tab": "General information", - "score": 600.5612903225806 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=572.798, mean=572.798, max=572.798, sum=1145.596 (2)", - "tab": "General information", - "score": 572.7980295566502 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.24, mean=988.24, max=988.24, sum=1976.48 (2)", - "tab": "General information", - "score": 988.24 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3089.109, mean=3089.109, max=3089.109, sum=6178.218 (2)", - "tab": "General information", - "score": 3089.109090909091 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=429.207, mean=429.207, max=429.207, sum=858.414 (2)", - "tab": "General information", - "score": 429.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=514.808, mean=514.808, max=514.808, sum=1029.617 (2)", - "tab": "General information", - "score": 514.8082901554404 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=423.815, mean=423.815, max=423.815, sum=847.631 (2)", - "tab": "General information", - "score": 423.81538461538463 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=584.13, mean=584.13, max=584.13, sum=1168.259 (2)", - "tab": "General information", - "score": 584.1296296296297 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=443.345, mean=443.345, max=443.345, sum=886.689 (2)", - "tab": "General information", - "score": 443.34453781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=622.775, mean=622.775, max=622.775, sum=1245.55 (2)", - "tab": "General information", - "score": 622.774834437086 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=558.873, mean=558.873, max=558.873, sum=1117.747 (2)", - "tab": "General information", - "score": 558.8733944954129 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=913.644, mean=913.644, max=913.644, sum=1827.287 (2)", - "tab": "General information", - "score": 913.6435185185185 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2477.446, mean=2477.446, max=2477.446, sum=4954.892 (2)", - "tab": "General information", - "score": 2477.4460784313724 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1585.553, mean=1585.553, max=1585.553, sum=3171.105 (2)", - "tab": "General information", - "score": 1585.5527426160338 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.30348238068311206 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.30424233429304515 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=353.152, mean=353.152, max=353.152, sum=706.305 (2)", - "tab": "General information", - "score": 353.15246636771303 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=394.748, mean=394.748, max=394.748, sum=789.496 (2)", - "tab": "General information", - "score": 394.7480916030534 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.719 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, 
sum=0.708 (2)", - "tab": "Efficiency", - "score": 0.354031091879222 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=720.182, mean=720.182, max=720.182, sum=1440.364 (2)", - "tab": "General information", - "score": 720.1818181818181 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.767, - "details": { - "description": "min=0.767, mean=0.767, max=0.767, sum=1.534 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.668 (2)", - "tab": "Efficiency", - "score": 0.3338228237409533 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=486.779, mean=486.779, max=486.779, sum=973.558 (2)", - "tab": "General information", - "score": 486.77914110429447 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.018 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.34853318120752064 - }, - "Machine Learning - # eval": { 
- "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=734.83, mean=734.83, max=734.83, sum=1469.661 (2)", - "tab": "General information", - "score": 734.8303571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.689 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.651 (2)", - "tab": "Efficiency", - "score": 0.32549439124690677 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.359, mean=315.359, max=315.359, sum=630.718 (2)", - "tab": "General information", - "score": 315.3592233009709 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923, - "details": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.846 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.631 (2)", - "tab": "Efficiency", - "score": 0.315602661198021 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=463.423, mean=463.423, max=463.423, sum=926.846 (2)", - "tab": "General information", - "score": 463.4230769230769 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.632 (2)", - "tab": "Efficiency", - "score": 0.3161799097061157 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=405.71, mean=405.71, max=405.71, sum=811.42 (2)", - "tab": "General information", - "score": 405.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.881, - "details": { - "description": "min=0.881, mean=0.881, max=0.881, sum=1.762 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.32256904598396857 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # 
prompt tokens": { - "description": "min=348.519, mean=348.519, max=348.519, sum=697.037 (2)", - "tab": "General information", - "score": 348.51851851851853 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.887 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.3035011126126857 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.34521307439111465 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=540.038, mean=540.038, max=540.038, sum=1080.075 (2)", - "tab": "General information", - "score": 540.0375722543353 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=745.516, mean=745.516, max=745.516, sum=1491.032 (2)", - "tab": "General information", - "score": 745.5162011173185 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.706 (2)", - "tab": "Efficiency", - "score": 0.3528824195363163 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=680.69, mean=680.69, max=680.69, sum=1361.379 (2)", - "tab": "General information", - "score": 680.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.32980361028953836 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=602.145, mean=602.145, max=602.145, sum=1204.29 (2)", - "tab": "General information", - "score": 602.145061728395 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.364 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, 
sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.32145483710549094 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=462.036, mean=462.036, max=462.036, sum=924.073 (2)", - "tab": "General information", - "score": 462.03636363636366 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.792, - "details": { - "description": "min=0.792, mean=0.792, max=0.792, sum=1.584 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.783 (2)", - "tab": "Efficiency", - "score": 0.3913051323014863 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1315.865, mean=1315.865, max=1315.865, sum=2631.731 (2)", - "tab": "General information", - "score": 1315.865306122449 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.741 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.652 (2)", - "tab": "Efficiency", - "score": 0.326159788008353 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, 
max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=487.95, mean=487.95, max=487.95, sum=975.9 (2)", - "tab": "General information", - "score": 487.9502487562189 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506, - "details": { - "description": "min=0.506, mean=0.506, max=0.506, sum=1.012 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.34297854210956985 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=395.349, mean=395.349, max=395.349, sum=790.699 (2)", - "tab": "General information", - "score": 395.34939759036143 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.633 (2)", - "tab": "Efficiency", - "score": 0.3165940499445151 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=308.924, mean=308.924, max=308.924, sum=617.848 (2)", - "tab": "General information", - "score": 308.92397660818716 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json b/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json deleted file mode 100644 index e5aec6b67..000000000 --- a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_open-mistral-nemo-2407/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral NeMo 2402", - "id": "mistralai/open-mistral-nemo-2407", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653, - "details": { - "description": "min=0.29, mean=0.653, max=0.912, sum=74.476 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.57, mean=0.852, max=1.185, sum=97.097 (114)", - "tab": "Efficiency", - "score": 0.8517321572873682 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=275.181, mean=627.375, max=2825.394, sum=71520.789 (114)", - "tab": "General information", 
- "score": 627.3753397392697 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Efficiency", - "score": 0.6429726719856262 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=377.89, mean=377.89, max=377.89, sum=755.78 (2)", - "tab": "General information", - "score": 377.89 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.607, - "details": { - "description": "min=0.607, mean=0.607, max=0.607, sum=1.215 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.784, mean=0.784, max=0.784, sum=1.569 (2)", - "tab": "Efficiency", - "score": 0.7843294850102177 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=334.711, mean=334.711, max=334.711, sum=669.422 (2)", - "tab": "General information", - "score": 334.7111111111111 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.373, - "details": { - 
"description": "min=0.373, mean=0.373, max=0.373, sum=0.745 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.635, mean=0.635, max=0.635, sum=1.27 (2)", - "tab": "Efficiency", - "score": 0.6347627878189087 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.487 (2)", - "tab": "Efficiency", - "score": 0.7433112810055414 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.764, mean=0.764, max=0.764, sum=1.529 (2)", - "tab": "Efficiency", - "score": 0.7643197441101074 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)", - "tab": "Efficiency", - "score": 0.8069064331054687 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", - "tab": "Efficiency", - "score": 0.9125060442555157 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.792, mean=0.792, max=0.792, sum=1.584 (2)", - "tab": "Efficiency", - "score": 0.7920899648292392 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=563.78, mean=563.78, max=563.78, sum=1127.56 (2)", - "tab": "General information", - "score": 563.78 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=471.931, mean=471.931, max=471.931, sum=943.861 (2)", - "tab": "General information", - "score": 471.93055555555554 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=844.21, mean=844.21, max=844.21, sum=1688.42 (2)", - "tab": "General information", - "score": 844.21 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - 
"tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=609.39, mean=609.39, max=609.39, sum=1218.78 (2)", - "tab": "General information", - "score": 609.39 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=497.538, mean=497.538, max=497.538, sum=995.075 (2)", - "tab": "General information", - "score": 497.53757225433526 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=512.941, mean=512.941, max=512.941, sum=1025.882 (2)", - "tab": "General information", - "score": 512.9411764705883 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.011, mean=1.011, max=1.011, sum=2.023 (2)", - "tab": "Efficiency", - "score": 1.0114419960975647 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"Computer Security - # prompt tokens": { - "description": "min=395.27, mean=395.27, max=395.27, sum=790.54 (2)", - "tab": "General information", - "score": 395.27 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.123 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.531 (2)", - "tab": "Efficiency", - "score": 0.7657254641516167 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=631.851, mean=631.851, max=631.851, sum=1263.702 (2)", - "tab": "General information", - "score": 631.8508771929825 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.683 (2)", - "tab": "Efficiency", - "score": 0.8416926956176758 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=433.39, mean=433.39, max=433.39, sum=866.78 (2)", - "tab": "General information", - "score": 433.39 - }, - "Global Facts - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.593 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.674 (2)", - "tab": "Efficiency", - "score": 0.8370662177050555 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=399.019, mean=399.019, max=399.019, sum=798.037 (2)", - "tab": "General information", - "score": 399.01851851851853 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.755 (2)", - "tab": "Efficiency", - "score": 0.8774675686643054 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=331.354, mean=331.354, max=331.354, sum=662.707 (2)", - "tab": "General information", - "score": 331.35369774919616 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.588, - "details": { - "description": "min=0.588, mean=0.588, max=0.588, sum=1.176 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.462 (2)", - "tab": "Efficiency", - "score": 0.7308363747947356 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.649 (2)", - "tab": "Efficiency", - "score": 0.824517419152226 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.625 (2)", - "tab": "Efficiency", - "score": 0.8123439646761917 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.757, mean=0.757, max=0.757, sum=1.515 (2)", - "tab": "Efficiency", - "score": 0.757308129391639 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1091.357, mean=1091.357, max=1091.357, sum=2182.713 (2)", - "tab": "General information", - "score": 1091.3566176470588 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=749.039, mean=749.039, max=749.039, sum=1498.078 (2)", - "tab": "General information", - "score": 749.0390070921986 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1710.472, mean=1710.472, max=1710.472, sum=3420.944 (2)", - 
"tab": "General information", - "score": 1710.4719687092568 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=584.748, mean=584.748, max=584.748, sum=1169.497 (2)", - "tab": "General information", - "score": 584.7483660130719 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Efficiency", - "score": 0.8529575586318969 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=430.83, mean=430.83, max=430.83, sum=861.66 (2)", - "tab": "General information", - "score": 430.83 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.382 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": 
"min=0.947, mean=0.947, max=0.947, sum=1.895 (2)", - "tab": "Efficiency", - "score": 0.9474252227105593 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=587.053, mean=587.053, max=587.053, sum=1174.105 (2)", - "tab": "General information", - "score": 587.0526315789474 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.719, mean=0.719, max=0.719, sum=1.438 (2)", - "tab": "Efficiency", - "score": 0.7189487242698669 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=597.68, mean=597.68, max=597.68, sum=1195.36 (2)", - "tab": "General information", - "score": 597.68 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.472 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.643 (2)", - "tab": "Efficiency", - "score": 0.8215559176678927 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 
(2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=382.989, mean=382.989, max=382.989, sum=765.977 (2)", - "tab": "General information", - "score": 382.98867924528304 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.647, - "details": { - "description": "min=0.647, mean=0.647, max=0.647, sum=1.294 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.576 (2)", - "tab": "Efficiency", - "score": 0.7878646302730479 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=301.336, mean=301.336, max=301.336, sum=602.672 (2)", - "tab": "General information", - "score": 301.336170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531, - "details": { - "description": "min=0.531, mean=0.531, max=0.531, sum=1.062 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.658, mean=0.658, max=0.658, sum=1.316 (2)", - "tab": "Efficiency", - "score": 0.6578493726664576 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - 
"Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=464.697, mean=464.697, max=464.697, sum=929.393 (2)", - "tab": "General information", - "score": 464.6965517241379 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439, - "details": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.423 (2)", - "tab": "Efficiency", - "score": 0.7115525694751235 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=580.741, mean=580.741, max=580.741, sum=1161.481 (2)", - "tab": "General information", - "score": 580.7407407407408 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405, - "details": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.81 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.185, mean=1.185, max=1.185, sum=2.37 (2)", - "tab": "Efficiency", - "score": 1.1852161146345592 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.929, mean=619.929, max=619.929, sum=1239.857 (2)", - "tab": "General information", - "score": 619.9285714285714 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.97, mean=0.97, max=0.97, sum=1.94 (2)", - "tab": "Efficiency", - "score": 0.9699527340550577 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.503 (2)", - "tab": "Efficiency", - "score": 0.751325937327493 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.764, mean=0.764, max=0.764, sum=1.528 (2)", - "tab": "Efficiency", - "score": 0.7637556600570679 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", - "tab": "Efficiency", - "score": 0.7959829893979159 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.373 (2)", - "tab": "Efficiency", - "score": 0.686434592863526 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.674 (2)", - "tab": "Efficiency", - "score": 0.8370978684005342 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.045, mean=1.045, max=1.045, sum=2.09 (2)", - "tab": "Efficiency", - "score": 1.045194720610594 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.839, mean=0.839, max=0.839, sum=1.677 (2)", - "tab": "Efficiency", - "score": 0.8386335717307196 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Efficiency", - "score": 0.9010114108814913 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Efficiency", - "score": 0.9301499767808725 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.975, mean=0.975, max=0.975, sum=1.95 (2)", - "tab": "Efficiency", - "score": 0.9747656953444175 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.798, mean=0.798, max=0.798, sum=1.595 (2)", - "tab": "Efficiency", - 
"score": 0.7976611223485734 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.112, mean=1.112, max=1.112, sum=2.225 (2)", - "tab": "Efficiency", - "score": 1.1124158618496913 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.883 (2)", - "tab": "Efficiency", - "score": 0.9417288112237987 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=500.994, mean=500.994, max=500.994, sum=1001.987 (2)", - "tab": "General information", - "score": 500.9935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=507.995, mean=507.995, max=507.995, sum=1015.99 (2)", - "tab": "General information", - "score": 507.9950738916256 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=888.78, mean=888.78, max=888.78, sum=1777.56 (2)", - "tab": "General information", - "score": 888.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2825.394, mean=2825.394, max=2825.394, sum=5650.788 (2)", - "tab": "General information", - "score": 2825.3939393939395 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.207, mean=372.207, max=372.207, sum=744.414 (2)", - "tab": "General information", - "score": 372.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=471.202, mean=471.202, max=471.202, sum=942.404 (2)", - "tab": "General information", - "score": 471.2020725388601 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=379.21, mean=379.21, max=379.21, sum=758.421 (2)", - "tab": "General information", - "score": 379.2102564102564 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=555.807, mean=555.807, max=555.807, sum=1111.615 (2)", - "tab": "General information", - "score": 555.8074074074074 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=405.95, mean=405.95, max=405.95, sum=811.899 (2)", - "tab": "General information", - "score": 405.9495798319328 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=584.272, mean=584.272, max=584.272, sum=1168.543 (2)", - "tab": "General information", - "score": 584.2715231788079 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=487.532, mean=487.532, max=487.532, sum=975.064 (2)", - "tab": "General information", - "score": 487.5321100917431 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=831.926, mean=831.926, max=831.926, sum=1663.852 (2)", - "tab": "General information", - "score": 831.925925925926 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2271.559, mean=2271.559, max=2271.559, sum=4543.118 (2)", - "tab": "General information", - "score": 2271.5588235294117 - }, - "High School US History - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1458.937, mean=1458.937, max=1458.937, sum=2917.873 (2)", - "tab": "General information", - "score": 1458.9367088607594 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.405 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.703 (2)", - "tab": "Efficiency", - "score": 0.8512581602874892 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.139 (2)", - "tab": "Efficiency", - "score": 0.569578381895109 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=320.296, mean=320.296, max=320.296, sum=640.592 (2)", - "tab": "General information", - "score": 320.29596412556054 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=345.45, mean=345.45, max=345.45, sum=690.901 (2)", - "tab": "General information", - "score": 345.4503816793893 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.537 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)", - "tab": "Efficiency", - "score": 0.7790698473118554 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=649.017, mean=649.017, max=649.017, sum=1298.033 (2)", - "tab": "General information", - "score": 649.0165289256198 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.583 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=1.077, mean=1.077, max=1.077, sum=2.154 (2)", - "tab": "Efficiency", - "score": 1.0772201810146402 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.595, mean=449.595, max=449.595, sum=899.19 (2)", - "tab": "General information", - "score": 449.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402, - "details": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.123, mean=1.123, max=1.123, sum=2.246 (2)", - "tab": "Efficiency", - "score": 1.1229032427072525 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=684.696, mean=684.696, max=684.696, sum=1369.393 (2)", - "tab": "General information", - "score": 684.6964285714286 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.571 (2)", - "tab": "Efficiency", - "score": 0.7855723436596325 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=286.272, mean=286.272, max=286.272, sum=572.544 (2)", - "tab": "General information", - "score": 286.2718446601942 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.719 (2)", - "tab": "Efficiency", - "score": 0.8593697160737127 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=408.308, mean=408.308, max=408.308, sum=816.615 (2)", - "tab": "General information", - "score": 408.3076923076923 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.658 (2)", - "tab": "Efficiency", - "score": 0.8288634467124939 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=332.56, mean=332.56, max=332.56, sum=665.12 (2)", - "tab": "General information", - "score": 332.56 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Efficiency", - "score": 0.8490832494440967 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=307.041, mean=307.041, max=307.041, sum=614.082 (2)", - "tab": "General information", - "score": 307.04086845466156 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.381, - "details": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.762 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.615 (2)", - "tab": "Efficiency", - "score": 0.8076560903835848 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.735 (2)", - "tab": "Efficiency", - "score": 0.8676496551023515 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=480.821, mean=480.821, max=480.821, sum=961.642 (2)", - "tab": "General information", - "score": 480.8208092485549 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - 
"description": "min=672.035, mean=672.035, max=672.035, sum=1344.069 (2)", - "tab": "General information", - "score": 672.0346368715084 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.807 (2)", - "tab": "Efficiency", - "score": 0.9033067834143546 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=590.154, mean=590.154, max=590.154, sum=1180.307 (2)", - "tab": "General information", - "score": 590.1535947712418 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.765, - "details": { - "description": "min=0.765, mean=0.765, max=0.765, sum=1.531 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Efficiency", - "score": 0.8491357167561849 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=526.04, mean=526.04, max=526.04, sum=1052.08 (2)", - "tab": "General information", - "score": 526.0401234567901 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.436 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=1.141, mean=1.141, max=1.141, sum=2.281 (2)", - "tab": "Efficiency", - "score": 1.1407060449773616 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=409.045, mean=409.045, max=409.045, sum=818.091 (2)", - "tab": "General information", - "score": 409.04545454545456 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.771, - "details": { - "description": "min=0.771, mean=0.771, max=0.771, sum=1.543 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.958, mean=0.958, max=0.958, sum=1.915 (2)", - "tab": "Efficiency", - "score": 0.9576426525505222 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1186.502, mean=1186.502, max=1186.502, sum=2373.004 (2)", - "tab": "General information", - "score": 1186.5020408163266 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.453 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.781, mean=0.781, max=0.781, sum=1.562 (2)", - "tab": "Efficiency", - "score": 0.781044238835425 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=455.348, mean=455.348, max=455.348, sum=910.697 (2)", - "tab": "General information", - "score": 455.3482587064677 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=1.059, mean=1.059, max=1.059, sum=2.118 (2)", - "tab": "Efficiency", - "score": 1.0589684750660355 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.127, mean=336.127, max=336.127, sum=672.253 (2)", - "tab": "General information", - "score": 336.1265060240964 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", - "tab": "Efficiency", - "score": 0.8906254336150766 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=275.181, mean=275.181, max=275.181, sum=550.363 (2)", - "tab": "General information", - "score": 275.1812865497076 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.215, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json b/data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json deleted file mode 100644 index e429d6dbc..000000000 --- a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0125/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-3.5 Turbo 0125", - "id": "openai/gpt-3.5-turbo-0125", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.673, - "details": { - "description": "min=0.307, mean=0.673, max=0.922, sum=76.686 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.396, mean=0.476, max=1.242, sum=54.283 (114)", - "tab": "Efficiency", - "score": 0.4761648045252673 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=275.561, mean=614.852, max=2798.073, sum=70093.086 (114)", - "tab": "General information", - "score": 614.851634217556 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - 
"mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31, - "details": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.62 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Efficiency", - "score": 0.4701289844512939 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.44, mean=373.44, max=373.44, sum=746.88 (2)", - "tab": "General information", - "score": 373.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.393 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.844 (2)", - "tab": "Efficiency", - "score": 0.42177006050392435 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.978, mean=353.978, 
max=353.978, sum=707.956 (2)", - "tab": "General information", - "score": 353.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.471, - "details": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.941 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.856 (2)", - "tab": "Efficiency", - "score": 0.42796642541885377 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.949 (2)", - "tab": "Efficiency", - "score": 0.47431788014041054 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Efficiency", - "score": 0.5200183248519897 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.897 (2)", - "tab": "Efficiency", - "score": 0.4484861779212952 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.4230213785447137 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.4148852918662277 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.4, mean=549.4, max=549.4, sum=1098.8 (2)", - "tab": "General information", - "score": 549.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.917, mean=473.917, max=473.917, sum=947.833 (2)", - "tab": "General information", - "score": 473.9166666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)", - "tab": "General information", - "score": 828.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.52, mean=594.52, max=594.52, sum=1189.04 (2)", - "tab": "General information", - "score": 594.52 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.728, mean=502.728, max=502.728, sum=1005.457 (2)", - "tab": "General information", - "score": 502.728323699422 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.608, mean=503.608, max=503.608, sum=1007.216 (2)", - "tab": "General information", - "score": 503.6078431372549 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.44357073068618774 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.54, mean=378.54, max=378.54, sum=757.08 (2)", - "tab": "General information", - "score": 378.54 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474, - "details": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.947 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.836 (2)", - "tab": "Efficiency", - "score": 0.4179882564042744 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.43, mean=614.43, max=614.43, sum=1228.86 (2)", - "tab": "General information", - "score": 614.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.863 (2)", - "tab": "Efficiency", - "score": 0.4315228652954102 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.611 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.017 (2)", - "tab": "Efficiency", - "score": 0.5086877279811435 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.639, mean=394.639, max=394.639, sum=789.278 (2)", - "tab": "General information", - "score": 394.6388888888889 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=1.492 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.944 
(2)", - "tab": "Efficiency", - "score": 0.4717828660149283 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "description": "min=0.722, mean=0.722, max=0.722, sum=1.444 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.971 (2)", - "tab": "Efficiency", - "score": 0.4853776947540395 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.42316425692105125 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.883 (2)", - "tab": "Efficiency", - "score": 0.4417385995932011 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.843 (2)", - "tab": "Efficiency", - "score": 0.42156751132478903 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.585, mean=1094.585, max=1094.585, sum=2189.169 (2)", - "tab": "General information", - "score": 1094.5845588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"Professional Accounting - # prompt tokens": { - "description": "min=658.592, mean=658.592, max=658.592, sum=1317.184 (2)", - "tab": "General information", - "score": 658.5921985815603 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.787, mean=1637.787, max=1637.787, sum=3275.574 (2)", - "tab": "General information", - "score": 1637.7868318122555 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.114, mean=575.114, max=575.114, sum=1150.229 (2)", - "tab": "General information", - "score": 575.1143790849674 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.911 (2)", - "tab": "Efficiency", - "score": 0.4557087206840515 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Efficiency", - "score": 0.42091869994213704 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.691, mean=579.691, max=579.691, sum=1159.382 (2)", - "tab": "General information", - "score": 579.6907894736842 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.906 (2)", - "tab": "Efficiency", - "score": 0.4530529642105103 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.837 (2)", - "tab": "Efficiency", - "score": 0.41833644812961795 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.947, mean=397.947, max=397.947, sum=795.894 (2)", - "tab": "General information", - "score": 397.94716981132075 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.634, mean=0.634, max=0.634, sum=1.268 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.616 (2)", - "tab": "Efficiency", - "score": 0.8081990150695152 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.838, mean=304.838, max=304.838, sum=609.677 (2)", - "tab": "General information", - "score": 304.83829787234043 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical 
Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669, - "details": { - "description": "min=0.669, mean=0.669, max=0.669, sum=1.338 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=1.242, mean=1.242, max=1.242, sum=2.485 (2)", - "tab": "Efficiency", - "score": 1.2423763686213 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=440.641, mean=440.641, max=440.641, sum=881.283 (2)", - "tab": "General information", - "score": 440.6413793103448 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534, - "details": { - "description": "min=0.534, mean=0.534, max=0.534, sum=1.069 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.4359189442225865 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.862, mean=531.862, max=531.862, sum=1063.725 (2)", - "tab": "General information", - "score": 531.8624338624338 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.861 (2)", - "tab": "Efficiency", - "score": 0.43056895051683697 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=606.762, mean=606.762, max=606.762, sum=1213.524 (2)", - "tab": "General information", - "score": 606.7619047619048 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.819, - "details": { - "description": "min=0.819, mean=0.819, max=0.819, sum=1.637 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.143 (2)", - "tab": "Efficiency", - "score": 0.5715394450772193 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.981 (2)", - "tab": "Efficiency", - "score": 0.49073645046779085 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.865 (2)", - "tab": "Efficiency", - "score": 0.43273836851119996 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.977 (2)", - "tab": "Efficiency", - "score": 0.48863930413217255 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.4360258868246367 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.967 (2)", - "tab": "Efficiency", - "score": 0.4836950153884492 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.843 (2)", - "tab": "Efficiency", - "score": 0.4215013412328867 - 
}, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.979 (2)", - "tab": "Efficiency", - "score": 0.48968876291204383 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.864 (2)", - "tab": "Efficiency", - "score": 0.4320918882594389 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.932 (2)", - "tab": "Efficiency", - "score": 0.4659363955061957 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.4434620769745713 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.862 (2)", - "tab": "Efficiency", - "score": 0.43081507749027675 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.971 (2)", - "tab": "Efficiency", - "score": 0.4857361819229874 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.882 (2)", - "tab": "Efficiency", - "score": 0.44100493620216596 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.677, mean=513.677, max=513.677, sum=1027.355 (2)", - "tab": "General information", - "score": 513.6774193548387 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.714, mean=496.714, max=496.714, sum=993.429 (2)", - "tab": "General information", - "score": 496.7142857142857 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2798.073, mean=2798.073, max=2798.073, sum=5596.145 (2)", - "tab": "General information", - "score": 2798.072727272727 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.045, mean=372.045, max=372.045, sum=744.091 (2)", - "tab": "General information", - "score": 372.04545454545456 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=371.562, mean=371.562, max=371.562, sum=743.123 (2)", - "tab": "General information", - "score": 371.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics 
- # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.374, mean=532.374, max=532.374, sum=1064.748 (2)", - "tab": "General information", - "score": 532.3740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.025, mean=399.025, max=399.025, sum=798.05 (2)", - "tab": "General information", - "score": 399.02521008403363 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.464, mean=560.464, max=560.464, sum=1120.927 (2)", - "tab": "General information", - "score": 560.4635761589404 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.246, mean=495.246, max=495.246, sum=990.492 (2)", - "tab": "General information", - "score": 495.24587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.699, mean=795.699, max=795.699, sum=1591.398 (2)", - "tab": 
"General information", - "score": 795.699074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.27, mean=1428.27, max=1428.27, sum=2856.54 (2)", - "tab": "General information", - "score": 1428.2700421940929 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.557 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.42309954027423946 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.833 (2)", - "tab": "Efficiency", - "score": 0.4166541681944869 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.906, mean=319.906, max=319.906, sum=639.812 (2)", - "tab": "General information", - "score": 319.90582959641256 - }, - "Human 
Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.183, mean=341.183, max=341.183, sum=682.366 (2)", - "tab": "General information", - "score": 341.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.525, mean=0.525, max=0.525, sum=1.05 (2)", - "tab": "Efficiency", - "score": 0.5249163257189033 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.851, mean=639.851, max=639.851, sum=1279.702 (2)", - "tab": "General information", - "score": 639.8512396694215 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.504, mean=0.504, max=0.504, sum=1.008 (2)", - "tab": "Efficiency", - "score": 
0.5038382904661214 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.595, mean=449.595, max=449.595, sum=899.19 (2)", - "tab": "General information", - "score": 449.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455, - "details": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.911 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.875 (2)", - "tab": "Efficiency", - "score": 0.4374160830463682 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.917 (2)", - "tab": "Efficiency", - "score": 0.4584047493425388 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General 
information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.796, mean=283.796, max=283.796, sum=567.592 (2)", - "tab": "General information", - "score": 283.79611650485435 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Efficiency", - "score": 0.4209032700611995 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.979 (2)", - "tab": "Efficiency", - "score": 0.48938191413879395 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=341, mean=341, max=341, sum=682 (2)", - "tab": "General information", - "score": 341.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.4205615121590528 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.925, mean=299.925, max=299.925, sum=599.849 (2)", - "tab": "General information", - "score": 299.92464878671774 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.355, - "details": { - "description": "min=0.355, mean=0.355, max=0.355, sum=0.711 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.43890244423309505 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.843 (2)", - "tab": "Efficiency", - "score": 0.4216500338229387 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.145, mean=476.145, max=476.145, sum=952.289 (2)", - "tab": "General information", - "score": 476.1445086705202 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.748, mean=0.748, max=0.748, sum=1.497 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.876 (2)", - "tab": "Efficiency", - "score": 0.4378981278612723 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.817, mean=586.817, max=586.817, sum=1173.634 (2)", - "tab": "General information", - "score": 586.8169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=1.469 (2)", - "tab": "Accuracy", - 
"Prehistory - Observed inference time (s)": { - "description": "min=0.462, mean=0.462, max=0.462, sum=0.924 (2)", - "tab": "Efficiency", - "score": 0.4620003163078685 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.559, mean=514.559, max=514.559, sum=1029.117 (2)", - "tab": "General information", - "score": 514.5586419753087 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.455 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.685, mean=0.685, max=0.685, sum=1.371 (2)", - "tab": "Efficiency", - "score": 0.6854934020475908 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.077 (2)", - "tab": "Efficiency", - "score": 0.5387308393205915 
- }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.721 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.856 (2)", - "tab": "Efficiency", - "score": 0.42779283025371495 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.522, mean=445.522, max=445.522, sum=891.045 (2)", - "tab": "General information", - "score": 445.5223880597015 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536, - "details": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.791 (2)", - "tab": "Efficiency", - "score": 0.39562296723744955 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, 
sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.09, mean=343.09, max=343.09, sum=686.181 (2)", - "tab": "General information", - "score": 343.0903614457831 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.827 (2)", - "tab": "Efficiency", - "score": 0.41344076848169514 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=275.561, mean=275.561, max=275.561, sum=551.123 (2)", - "tab": "General information", - "score": 275.56140350877195 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.493, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json b/data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json deleted file mode 100644 index 92faf2169..000000000 --- a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0613/1770835937.459157", - "retrieved_timestamp": 
"1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-3.5 Turbo 0613", - "id": "openai/gpt-3.5-turbo-0613", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689, - "details": { - "description": "min=0.33, mean=0.689, max=0.922, sum=78.524 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.171, mean=0.411, max=0.659, sum=46.797 (114)", - "tab": "Efficiency", - "score": 0.41050392458578394 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.561, mean=607.852, max=2791.073, sum=69295.086 (114)", - "tab": "General information", - "score": 607.851634217556 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", 
- "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.177, mean=0.177, max=0.177, sum=0.353 (2)", - "tab": "Efficiency", - "score": 0.17670444011688233 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=366.44, mean=366.44, max=366.44, sum=732.88 (2)", - "tab": "General information", - "score": 366.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.319 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.896 (2)", - "tab": "Efficiency", - "score": 0.448052688881203 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)", - "tab": "General information", - "score": 346.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461, - "details": { - "description": "min=0.461, mean=0.461, max=0.461, sum=0.922 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.349 (2)", - "tab": "Efficiency", - "score": 0.17441444158554076 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.871 (2)", - "tab": "Efficiency", - "score": 0.43541959755950504 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.508, mean=0.508, max=0.508, sum=1.015 (2)", - "tab": "Efficiency", - "score": 0.5075832653045654 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.419, mean=0.419, max=0.419, sum=0.839 (2)", - "tab": "Efficiency", - "score": 0.41928773641586303 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.777 (2)", - "tab": "Efficiency", - "score": 0.3885422951913293 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.74 (2)", - "tab": "Efficiency", - "score": 0.3700263453464882 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=542.4, mean=542.4, max=542.4, sum=1084.8 (2)", - "tab": "General information", - "score": 542.4 - }, - "College Chemistry - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=466.917, mean=466.917, max=466.917, sum=933.833 (2)", - "tab": "General information", - "score": 466.9166666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=821.39, mean=821.39, max=821.39, sum=1642.78 (2)", - "tab": "General information", - "score": 821.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=587.52, mean=587.52, max=587.52, sum=1175.04 (2)", - "tab": "General information", - "score": 587.52 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=495.728, mean=495.728, max=495.728, sum=991.457 (2)", - "tab": "General information", - "score": 495.728323699422 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=496.608, mean=496.608, max=496.608, sum=993.216 (2)", - "tab": "General information", - "score": 496.6078431372549 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.342 (2)", - "tab": "Efficiency", - "score": 0.17102816104888915 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=371.54, mean=371.54, max=371.54, sum=743.08 (2)", - "tab": "General information", - "score": 371.54 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.176, mean=0.176, max=0.176, sum=0.353 (2)", - "tab": "Efficiency", - "score": 0.1764866866563496 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=607.43, mean=607.43, max=607.43, sum=1214.86 (2)", - "tab": "General information", - "score": 
607.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37, - "details": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.74 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.879 (2)", - "tab": "Efficiency", - "score": 0.4393133974075317 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=392.71, mean=392.71, max=392.71, sum=785.42 (2)", - "tab": "General information", - "score": 392.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.611 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.747 (2)", - "tab": "Efficiency", - "score": 0.37349939346313477 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.639, mean=387.639, max=387.639, sum=775.278 (2)", - "tab": "General information", - "score": 387.6388888888889 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.518 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.763 (2)", - "tab": "Efficiency", - "score": 0.3817227730030415 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.084, mean=322.084, max=322.084, sum=644.167 (2)", - "tab": "General information", - "score": 322.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.732, - "details": { - "description": "min=0.732, mean=0.732, max=0.732, sum=1.464 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.969 (2)", - "tab": "Efficiency", - "score": 0.48464199637665467 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.4387922709715282 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=1.012 (2)", - "tab": "Efficiency", - "score": 0.5061173195012079 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.935 (2)", - "tab": "Efficiency", - "score": 0.4675601058536106 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1087.585, mean=1087.585, max=1087.585, sum=2175.169 (2)", - "tab": "General information", - "score": 1087.5845588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=651.592, mean=651.592, max=651.592, sum=1303.184 (2)", - "tab": "General information", - "score": 651.5921985815603 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1630.787, mean=1630.787, max=1630.787, sum=3261.574 (2)", - "tab": "General information", - "score": 1630.7868318122555 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=568.114, mean=568.114, max=568.114, sum=1136.229 (2)", - "tab": "General information", - "score": 568.1143790849674 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed 
inference time (s)": { - "description": "min=0.177, mean=0.177, max=0.177, sum=0.353 (2)", - "tab": "Efficiency", - "score": 0.17667593240737914 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=415.79, mean=415.79, max=415.79, sum=831.58 (2)", - "tab": "General information", - "score": 415.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763, - "details": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.526 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.885 (2)", - "tab": "Efficiency", - "score": 0.44235374111878245 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=572.691, mean=572.691, max=572.691, sum=1145.382 (2)", - "tab": "General information", - "score": 572.6907894736842 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.573, mean=0.573, max=0.573, sum=1.147 (2)", - "tab": "Efficiency", - "score": 0.5733751010894775 - }, - "Business Ethics - # eval": { - "description": "min=100, 
mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)", - "tab": "General information", - "score": 562.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=1.555 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.837 (2)", - "tab": "Efficiency", - "score": 0.4183455800110439 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=390.947, mean=390.947, max=390.947, sum=781.894 (2)", - "tab": "General information", - "score": 390.94716981132075 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.613, - "details": { - "description": "min=0.613, mean=0.613, max=0.613, sum=1.226 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.771 (2)", - "tab": "Efficiency", - "score": 0.3856722780998717 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=297.838, mean=297.838, max=297.838, sum=595.677 (2)", - "tab": "General information", - "score": 297.83829787234043 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.648, - "details": { - "description": "min=0.648, mean=0.648, max=0.648, sum=1.297 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.867 (2)", - "tab": "Efficiency", - "score": 0.43367810249328614 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=433.641, mean=433.641, max=433.641, sum=867.283 (2)", - "tab": "General information", - "score": 433.6413793103448 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.771 (2)", - "tab": "Efficiency", - "score": 0.3857186824556381 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, 
mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=524.862, mean=524.862, max=524.862, sum=1049.725 (2)", - "tab": "General information", - "score": 524.8624338624338 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397, - "details": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.822 (2)", - "tab": "Efficiency", - "score": 0.4109457277116321 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=599.762, mean=599.762, max=599.762, sum=1199.524 (2)", - "tab": "General information", - "score": 599.7619047619048 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.713 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.777 (2)", - "tab": "Efficiency", - "score": 0.38858610660799087 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.792 (2)", - "tab": "Efficiency", - "score": 0.39599566624082366 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.495, mean=0.495, max=0.495, sum=0.99 (2)", 
- "tab": "Efficiency", - "score": 0.495233371257782 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.077 (2)", - "tab": "Efficiency", - "score": 0.5386766448165431 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.735 (2)", - "tab": "Efficiency", - "score": 0.36738430129157174 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Efficiency", - "score": 0.38988350463037047 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.3604950317969689 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.777 (2)", - "tab": "Efficiency", - "score": 0.38829568756951227 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.743 (2)", - "tab": "Efficiency", - "score": 0.37170837205999036 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Efficiency", - "score": 0.3798852077383079 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.79 (2)", - "tab": "Efficiency", - "score": 0.3950107355730249 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.954 (2)", - "tab": "Efficiency", - "score": 0.4768963897669757 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.528, mean=0.528, max=0.528, sum=1.056 (2)", - "tab": "Efficiency", - "score": 0.5277850253909242 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.034 (2)", - "tab": "Efficiency", - "score": 0.5169116002094897 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.677, mean=506.677, max=506.677, sum=1013.355 (2)", - "tab": "General information", - "score": 506.6774193548387 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=489.714, mean=489.714, max=489.714, sum=979.429 (2)", - "tab": "General information", - "score": 489.7142857142857 - }, - "High School Chemistry - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)", - "tab": "General information", - "score": 860.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2791.073, mean=2791.073, max=2791.073, sum=5582.145 (2)", - "tab": "General information", - "score": 2791.072727272727 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.045, mean=365.045, max=365.045, sum=730.091 (2)", - "tab": "General information", - "score": 365.04545454545456 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=458.824, mean=458.824, max=458.824, sum=917.648 (2)", - "tab": "General information", - "score": 458.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School 
Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=364.562, mean=364.562, max=364.562, sum=729.123 (2)", - "tab": "General information", - "score": 364.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=525.374, mean=525.374, max=525.374, sum=1050.748 (2)", - "tab": "General information", - "score": 525.3740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=392.025, mean=392.025, max=392.025, sum=784.05 (2)", - "tab": "General information", - "score": 392.02521008403363 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.464, mean=553.464, max=553.464, sum=1106.927 (2)", - "tab": "General information", - "score": 553.4635761589404 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.246, mean=488.246, max=488.246, 
sum=976.492 (2)", - "tab": "General information", - "score": 488.24587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=788.699, mean=788.699, max=788.699, sum=1577.398 (2)", - "tab": "General information", - "score": 788.699074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)", - "tab": "General information", - "score": 2210.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1421.27, mean=1421.27, max=1421.27, sum=2842.54 (2)", - "tab": "General information", - "score": 1421.2700421940929 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.573 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Efficiency", - 
"score": 0.3799830274197018 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.783 (2)", - "tab": "Efficiency", - "score": 0.3914412269155488 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=312.906, mean=312.906, max=312.906, sum=625.812 (2)", - "tab": "General information", - "score": 312.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.183, mean=334.183, max=334.183, sum=668.366 (2)", - "tab": "General information", - "score": 334.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.556, mean=0.556, max=0.556, sum=1.113 (2)", - "tab": "Efficiency", - "score": 0.5563427140890074 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=632.851, mean=632.851, max=632.851, sum=1265.702 (2)", - "tab": "General information", - "score": 632.8512396694215 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.583 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.811 (2)", - "tab": "Efficiency", - "score": 0.4053135386273905 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.595, mean=442.595, max=442.595, sum=885.19 (2)", - "tab": "General information", - "score": 442.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455, - "details": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.911 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Efficiency", - "score": 0.45983841376645224 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)", - "tab": "General information", - "score": 661.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.689 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.773 (2)", - "tab": "Efficiency", - "score": 0.38629551535671197 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.796, mean=276.796, max=276.796, sum=553.592 (2)", - "tab": "General information", - "score": 276.79611650485435 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.781 (2)", - "tab": "Efficiency", - "score": 0.3906826453331189 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.218, mean=397.218, max=397.218, sum=794.436 (2)", - "tab": "General information", - "score": 397.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.756 (2)", - "tab": "Efficiency", - "score": 0.3778671717643738 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=334, mean=334, max=334, sum=668 (2)", - "tab": "General information", - "score": 334.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.785 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.735 (2)", - "tab": "Efficiency", - "score": 0.36739401007368494 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=292.925, mean=292.925, max=292.925, sum=585.849 (2)", - "tab": "General information", - "score": 292.92464878671774 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.404, 
- "details": { - "description": "min=0.404, mean=0.404, max=0.404, sum=0.809 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.773 (2)", - "tab": "Efficiency", - "score": 0.38658536858641346 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.609, mean=0.609, max=0.609, sum=1.217 (2)", - "tab": "Efficiency", - "score": 0.6085127204490107 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.145, mean=469.145, max=469.145, sum=938.289 (2)", - "tab": "General information", - "score": 469.1445086705202 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)", - "tab": "General information", - "score": 649.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.758, - "details": { - "description": "min=0.758, mean=0.758, max=0.758, sum=1.516 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.319 (2)", - "tab": "Efficiency", - "score": 0.6593383916842392 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.817, mean=579.817, max=579.817, sum=1159.634 (2)", - "tab": "General information", - "score": 579.8169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.787, - "details": { - "description": "min=0.787, mean=0.787, max=0.787, sum=1.574 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.828 (2)", - "tab": "Efficiency", - "score": 0.4140352636207769 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=507.559, mean=507.559, max=507.559, sum=1015.117 (2)", - "tab": "General information", - "score": 507.55864197530866 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.746 (2)", - "tab": "Efficiency", - "score": 0.3731096332723444 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=398.318, mean=398.318, max=398.318, sum=796.636 (2)", - "tab": "General information", - "score": 398.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.496, mean=0.496, max=0.496, sum=0.993 (2)", - "tab": "Efficiency", - "score": 0.4963450723764848 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)", - "tab": "General information", - "score": 1157.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.741 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.753 (2)", - "tab": "Efficiency", - "score": 0.3763423120204489 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=438.522, mean=438.522, max=438.522, sum=877.045 (2)", - "tab": "General information", - "score": 438.5223880597015 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542, - "details": { - "description": "min=0.542, mean=0.542, max=0.542, sum=1.084 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.775 (2)", - "tab": "Efficiency", - "score": 0.3873033107045185 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.09, mean=336.09, max=336.09, sum=672.181 (2)", - "tab": "General information", - "score": 336.0903614457831 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.673 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.807 (2)", - "tab": "Efficiency", - "score": 0.4032876603087487 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.561, mean=268.561, max=268.561, sum=537.123 (2)", - "tab": "General information", - "score": 268.56140350877195 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms 
on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json b/data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json deleted file mode 100644 index 6ccc418f3..000000000 --- a/data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 0613", - "id": "openai/gpt-4-0613", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.54, mean=0.824, max=0.99, sum=93.978 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.364, mean=0.447, max=0.579, sum=51.005 (114)", - "tab": "Efficiency", - "score": 0.4474144183932911 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.561, mean=607.852, max=2791.073, sum=69295.086 (114)", - "tab": "General information", - "score": 607.851634217556 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - 
"high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "details": { - "description": "min=0.63, mean=0.63, max=0.63, sum=1.26 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", - "tab": "Efficiency", - "score": 0.39332568168640136 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=366.44, mean=366.44, max=366.44, sum=732.88 (2)", - "tab": "General 
information", - "score": 366.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.09 (2)", - "tab": "Efficiency", - "score": 0.5451150911825674 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)", - "tab": "General information", - "score": 346.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627, - "details": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.255 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.778 (2)", - "tab": "Efficiency", - "score": 0.3888898015022278 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.866 (2)", - "tab": "Efficiency", - "score": 0.43280420700709027 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.984 (2)", - "tab": "Efficiency", - "score": 0.49212974786758423 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.871 (2)", - "tab": "Efficiency", - "score": 0.4354128074645996 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.861 (2)", - "tab": "Efficiency", - "score": 0.4306242893196944 - }, - "College Physics - Observed inference time (s)": { - 
"description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.41519686287524654 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=542.4, mean=542.4, max=542.4, sum=1084.8 (2)", - "tab": "General information", - "score": 542.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=466.917, mean=466.917, max=466.917, sum=933.833 (2)", - "tab": "General information", - "score": 466.9166666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=821.39, mean=821.39, max=821.39, sum=1642.78 (2)", - "tab": "General information", - "score": 821.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=587.52, mean=587.52, max=587.52, sum=1175.04 (2)", - "tab": "General information", - "score": 587.52 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=495.728, mean=495.728, max=495.728, sum=991.457 (2)", - "tab": "General information", - "score": 495.728323699422 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=496.608, mean=496.608, max=496.608, sum=993.216 (2)", - "tab": "General information", - "score": 496.6078431372549 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.746 (2)", - "tab": "Efficiency", - "score": 0.3729291558265686 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=371.54, mean=371.54, max=371.54, sum=743.08 (2)", - "tab": "General information", - "score": 371.54 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, 
sum=1.368 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.729 (2)", - "tab": "Efficiency", - "score": 0.36447873241023016 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=607.43, mean=607.43, max=607.43, sum=1214.86 (2)", - "tab": "General information", - "score": 607.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62, - "details": { - "description": "min=0.62, mean=0.62, max=0.62, sum=1.24 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Efficiency", - "score": 0.4758000469207764 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=392.71, mean=392.71, max=392.71, sum=785.42 (2)", - "tab": "General information", - "score": 392.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.43886900389636 - }, - 
"Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.639, mean=387.639, max=387.639, sum=775.278 (2)", - "tab": "General information", - "score": 387.6388888888889 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.717 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.807 (2)", - "tab": "Efficiency", - "score": 0.40341131480177117 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.084, mean=322.084, max=322.084, sum=644.167 (2)", - "tab": "General information", - "score": 322.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.891, - "details": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.966 (2)", - "tab": "Efficiency", - "score": 0.48306868356816907 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.888 (2)", - "tab": "Efficiency", - "score": 0.44407470006469296 - }, 
- "Professional Law - Observed inference time (s)": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Efficiency", - "score": 0.578451920053017 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.469, mean=0.469, max=0.469, sum=0.938 (2)", - "tab": "Efficiency", - "score": 0.4690242421393301 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1087.585, mean=1087.585, max=1087.585, sum=2175.169 (2)", - "tab": "General information", - "score": 1087.5845588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=651.592, mean=651.592, max=651.592, sum=1303.184 (2)", - "tab": "General information", - "score": 651.5921985815603 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1630.787, mean=1630.787, max=1630.787, sum=3261.574 (2)", - "tab": "General information", - "score": 1630.7868318122555 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=568.114, mean=568.114, max=568.114, sum=1136.229 (2)", - "tab": "General information", - "score": 568.1143790849674 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.43441893100738527 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=415.79, mean=415.79, max=415.79, sum=831.58 (2)", - "tab": "General information", - "score": 415.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.944 (2)", - "tab": "Efficiency", - "score": 0.4718977307018481 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=572.691, mean=572.691, max=572.691, sum=1145.382 (2)", - "tab": "General information", - "score": 572.6907894736842 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": 
"Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.953 (2)", - "tab": "Efficiency", - "score": 0.4765148901939392 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)", - "tab": "General information", - "score": 562.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.829 (2)", - "tab": "Efficiency", - "score": 0.414557883424579 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=390.947, mean=390.947, max=390.947, sum=781.894 (2)", - "tab": "General information", - "score": 390.94716981132075 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.767 (2)", - "tab": "Efficiency", - "score": 0.3836827186827964 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=297.838, mean=297.838, max=297.838, sum=595.677 (2)", - "tab": "General information", - "score": 297.83829787234043 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.572 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.399, mean=0.399, max=0.399, sum=0.798 (2)", - "tab": "Efficiency", - "score": 0.39915286919166304 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=433.641, mean=433.641, max=433.641, sum=867.283 (2)", - "tab": "General information", - "score": 433.6413793103448 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.845 (2)", - "tab": "Efficiency", - "score": 0.4225258120784053 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=524.862, mean=524.862, max=524.862, sum=1049.725 (2)", - "tab": "General information", - "score": 524.8624338624338 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.973 (2)", - "tab": "Efficiency", - "score": 0.48647683007376535 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=599.762, mean=599.762, max=599.762, sum=1199.524 (2)", - "tab": "General information", - "score": 599.7619047619048 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "details": { - "description": "min=0.945, mean=0.945, max=0.945, sum=1.89 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.4360047817230225 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.827 (2)", - "tab": "Efficiency", - "score": 0.41338158710836775 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.001 (2)", - "tab": "Efficiency", - "score": 0.5002665758132935 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.158 (2)", - "tab": "Efficiency", - "score": 0.578774525902488 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.829 (2)", - "tab": "Efficiency", - "score": 0.4142996747084338 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.86 (2)", - "tab": "Efficiency", - "score": 0.43005221001224814 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.832 (2)", - "tab": "Efficiency", - "score": 0.4160928750649477 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.4231933620240953 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.948 (2)", - "tab": "Efficiency", - "score": 0.4740273321376127 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.462, mean=0.462, max=0.462, sum=0.924 (2)", - "tab": "Efficiency", - "score": 0.4620048778736039 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.813 (2)", - "tab": "Efficiency", - "score": 0.40661886022725235 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.926 (2)", - "tab": "Efficiency", - "score": 0.46296725780875597 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.546, mean=0.546, max=0.546, sum=1.091 (2)", - "tab": "Efficiency", - "score": 0.5456923538563299 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.033 (2)", - "tab": "Efficiency", - "score": 0.5166646488608188 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=506.677, mean=506.677, max=506.677, sum=1013.355 (2)", - "tab": "General information", - "score": 506.6774193548387 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=489.714, mean=489.714, max=489.714, sum=979.429 (2)", - "tab": "General information", - "score": 489.7142857142857 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)", - "tab": "General information", - "score": 860.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2791.073, mean=2791.073, max=2791.073, sum=5582.145 (2)", - "tab": "General information", - "score": 2791.072727272727 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.045, mean=365.045, max=365.045, sum=730.091 (2)", - "tab": "General information", - "score": 365.04545454545456 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=458.824, mean=458.824, max=458.824, sum=917.648 (2)", - "tab": "General information", - "score": 458.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=364.562, mean=364.562, max=364.562, sum=729.123 (2)", - "tab": "General information", - "score": 364.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=525.374, mean=525.374, max=525.374, sum=1050.748 (2)", - "tab": "General information", - "score": 525.3740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=392.025, mean=392.025, max=392.025, sum=784.05 (2)", - "tab": "General information", - "score": 392.02521008403363 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.464, mean=553.464, max=553.464, sum=1106.927 (2)", - "tab": "General information", - "score": 553.4635761589404 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.246, mean=488.246, max=488.246, sum=976.492 (2)", - "tab": "General information", - "score": 488.24587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=788.699, mean=788.699, max=788.699, sum=1577.398 (2)", - "tab": "General information", - "score": 788.699074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)", - "tab": "General information", - "score": 2210.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1421.27, mean=1421.27, max=1421.27, sum=2842.54 (2)", - "tab": "General information", - "score": 1421.2700421940929 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.908, mean=0.908, max=0.908, sum=1.817 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.812 (2)", - "tab": "Efficiency", - "score": 0.4058152218036053 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.932 (2)", - "tab": "Efficiency", - "score": 0.46620041541470825 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=312.906, mean=312.906, max=312.906, sum=625.812 (2)", - "tab": "General information", - "score": 312.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.183, mean=334.183, max=334.183, sum=668.366 (2)", - "tab": "General information", - "score": 334.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.461, mean=0.461, 
max=0.461, sum=0.922 (2)", - "tab": "Efficiency", - "score": 0.4608367139642889 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=632.851, mean=632.851, max=632.851, sum=1265.702 (2)", - "tab": "General information", - "score": 632.8512396694215 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.864 (2)", - "tab": "Efficiency", - "score": 0.4321035870745138 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.595, mean=442.595, max=442.595, sum=885.19 (2)", - "tab": "General information", - "score": 442.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.518 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.926 (2)", - "tab": "Efficiency", - "score": 0.46302694933755056 - }, - "Machine Learning - # 
eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)", - "tab": "General information", - "score": 661.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.864 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.891 (2)", - "tab": "Efficiency", - "score": 0.4455798760201167 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.796, mean=276.796, max=276.796, sum=553.592 (2)", - "tab": "General information", - "score": 276.79611650485435 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=1.923 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.843 (2)", - "tab": "Efficiency", - "score": 0.4213859372668796 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.218, mean=397.218, max=397.218, sum=794.436 (2)", - "tab": "General information", - "score": 397.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.823 (2)", - "tab": "Efficiency", - "score": 0.41135803937911986 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=334, mean=334, max=334, sum=668 (2)", - "tab": "General information", - "score": 334.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.898 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.4505587230088001 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # 
prompt tokens": { - "description": "min=292.925, mean=292.925, max=292.925, sum=585.849 (2)", - "tab": "General information", - "score": 292.92464878671774 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.856 (2)", - "tab": "Efficiency", - "score": 0.4281756044123214 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.89 (2)", - "tab": "Efficiency", - "score": 0.44513606945229645 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.145, mean=469.145, max=469.145, sum=938.289 (2)", - "tab": "General information", - "score": 469.1445086705202 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)", - "tab": "General information", - "score": 649.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.892, - "details": { - "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.892 (2)", - "tab": "Efficiency", - "score": 0.4460979816960354 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.817, mean=579.817, max=579.817, sum=1159.634 (2)", - "tab": "General information", - "score": 579.8169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.852 (2)", - "tab": "Efficiency", - "score": 0.42610209665180726 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=507.559, mean=507.559, max=507.559, sum=1015.117 (2)", - "tab": "General information", - "score": 507.55864197530866 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.496, mean=0.496, 
max=0.496, sum=0.992 (2)", - "tab": "Efficiency", - "score": 0.49601870450106533 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=398.318, mean=398.318, max=398.318, sum=796.636 (2)", - "tab": "General information", - "score": 398.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.941 (2)", - "tab": "Efficiency", - "score": 0.47064581306613223 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)", - "tab": "General information", - "score": 1157.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.861 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.86 (2)", - "tab": "Efficiency", - "score": 0.42976075143956427 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, 
max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=438.522, mean=438.522, max=438.522, sum=877.045 (2)", - "tab": "General information", - "score": 438.5223880597015 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Efficiency", - "score": 0.42023470890091125 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.09, mean=336.09, max=336.09, sum=672.181 (2)", - "tab": "General information", - "score": 336.0903614457831 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.4507097779658803 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.561, mean=268.561, max=268.561, sum=537.123 (2)", - "tab": "General information", - "score": 268.56140350877195 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json b/data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json deleted file mode 100644 index 610be9719..000000000 --- a/data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-1106-preview/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 Turbo 1106 preview", - "id": "openai/gpt-4-1106-preview", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.093, mean=0.796, max=0.979, sum=90.688 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.397, mean=0.537, max=0.852, sum=61.247 (114)", - "tab": "Efficiency", - "score": 0.5372507053364665 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.561, mean=607.852, max=2791.073, sum=69295.086 (114)", - "tab": "General information", - "score": 607.851634217556 - 
}, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.85 (2)", - "tab": "Efficiency", - "score": 0.42504594564437864 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=366.44, mean=366.44, max=366.44, sum=732.88 (2)", - "tab": "General information", - "score": 366.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.615 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.569, mean=0.569, max=0.569, sum=1.138 (2)", - "tab": "Efficiency", - "score": 0.5691532982720269 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)", - "tab": "General information", - "score": 346.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402, - "details": { - "description": "min=0.402, 
mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.913 (2)", - "tab": "Efficiency", - "score": 0.456736900806427 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.888 (2)", - "tab": "Efficiency", - "score": 0.44404302537441254 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.516, mean=0.516, max=0.516, sum=1.033 (2)", - "tab": "Efficiency", - "score": 0.516348373889923 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.534, mean=0.534, max=0.534, sum=1.067 (2)", - "tab": "Efficiency", - "score": 0.5335026264190674 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.982 (2)", - "tab": "Efficiency", - "score": 0.4908691348368033 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.499 (2)", - "tab": "Efficiency", - "score": 0.7497045245825076 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=542.4, mean=542.4, max=542.4, sum=1084.8 (2)", - "tab": "General information", - "score": 542.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=466.917, mean=466.917, max=466.917, sum=933.833 (2)", - "tab": "General information", - "score": 466.9166666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=821.39, mean=821.39, max=821.39, sum=1642.78 (2)", - "tab": "General information", - "score": 821.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - 
"score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=587.52, mean=587.52, max=587.52, sum=1175.04 (2)", - "tab": "General information", - "score": 587.52 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=495.728, mean=495.728, max=495.728, sum=991.457 (2)", - "tab": "General information", - "score": 495.728323699422 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=496.608, mean=496.608, max=496.608, sum=993.216 (2)", - "tab": "General information", - "score": 496.6078431372549 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.884 (2)", - "tab": "Efficiency", - "score": 0.4418716287612915 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt 
tokens": { - "description": "min=371.54, mean=371.54, max=371.54, sum=743.08 (2)", - "tab": "General information", - "score": 371.54 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.351 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.515, mean=0.515, max=0.515, sum=1.03 (2)", - "tab": "Efficiency", - "score": 0.5149402095560442 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=607.43, mean=607.43, max=607.43, sum=1214.86 (2)", - "tab": "General information", - "score": 607.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.973 (2)", - "tab": "Efficiency", - "score": 0.4863955807685852 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=392.71, mean=392.71, max=392.71, sum=785.42 (2)", - "tab": "General information", - "score": 392.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.462 (2)", - "tab": "Efficiency", - "score": 0.7311423023541769 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.639, mean=387.639, max=387.639, sum=775.278 (2)", - "tab": "General information", - "score": 387.6388888888889 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.973 (2)", - "tab": "Efficiency", - "score": 0.4863421380328212 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.084, mean=322.084, max=322.084, sum=644.167 (2)", - "tab": "General information", - "score": 322.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", 
- "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.887, - "details": { - "description": "min=0.887, mean=0.887, max=0.887, sum=1.775 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.551, mean=0.551, max=0.551, sum=1.103 (2)", - "tab": "Efficiency", - "score": 0.5514215528964996 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.079 (2)", - "tab": "Efficiency", - "score": 0.5395518828791084 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.616, mean=0.616, max=0.616, sum=1.232 (2)", - "tab": "Efficiency", - "score": 0.6162493903447317 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.563, mean=0.563, max=0.563, sum=1.126 (2)", - "tab": "Efficiency", - "score": 0.5629562961509804 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1087.585, mean=1087.585, max=1087.585, sum=2175.169 (2)", - "tab": "General information", - "score": 1087.5845588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=651.592, mean=651.592, max=651.592, sum=1303.184 (2)", - "tab": "General information", - "score": 651.5921985815603 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1630.787, mean=1630.787, max=1630.787, sum=3261.574 (2)", - "tab": "General information", - "score": 
1630.7868318122555 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=568.114, mean=568.114, max=568.114, sum=1136.229 (2)", - "tab": "General information", - "score": 568.1143790849674 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Efficiency", - "score": 0.39724321842193605 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=415.79, mean=415.79, max=415.79, sum=831.58 (2)", - "tab": "General information", - "score": 415.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.038 
(2)", - "tab": "Efficiency", - "score": 0.5192367622726842 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=572.691, mean=572.691, max=572.691, sum=1145.382 (2)", - "tab": "General information", - "score": 572.6907894736842 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.495, mean=0.495, max=0.495, sum=0.99 (2)", - "tab": "Efficiency", - "score": 0.49495640993118284 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)", - "tab": "General information", - "score": 562.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.642, mean=0.642, max=0.642, sum=1.284 (2)", - "tab": "Efficiency", - "score": 0.6421918509141454 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - 
"score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=390.947, mean=390.947, max=390.947, sum=781.894 (2)", - "tab": "General information", - "score": 390.94716981132075 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.787 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.586, mean=0.586, max=0.586, sum=1.172 (2)", - "tab": "Efficiency", - "score": 0.5859095319788507 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=297.838, mean=297.838, max=297.838, sum=595.677 (2)", - "tab": "General information", - "score": 297.83829787234043 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.545 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.507, mean=0.507, max=0.507, sum=1.014 (2)", - "tab": "Efficiency", - "score": 0.5071375830420133 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=433.641, mean=433.641, max=433.641, sum=867.283 (2)", - "tab": "General information", - "score": 433.6413793103448 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638, - "details": { - "description": "min=0.638, mean=0.638, max=0.638, sum=1.275 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.972 (2)", - "tab": "Efficiency", - "score": 0.48600239034682985 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=524.862, mean=524.862, max=524.862, sum=1049.725 (2)", - "tab": "General information", - "score": 524.8624338624338 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.651, - "details": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.302 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.983 (2)", - "tab": "Efficiency", - "score": 0.4912937557886517 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=599.762, mean=599.762, max=599.762, sum=1199.524 (2)", - "tab": "General information", - "score": 599.7619047619048 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.958, - "details": { - "description": "min=0.958, mean=0.958, max=0.958, sum=1.916 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.144 (2)", - "tab": "Efficiency", - "score": 0.5719813362244637 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.312 (2)", - "tab": "Efficiency", - "score": 0.6560086276143643 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.568, mean=0.568, max=0.568, sum=1.137 (2)", - "tab": "Efficiency", - "score": 0.5683712005615235 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.28 (2)", - "tab": "Efficiency", - "score": 0.6399081995992949 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.943 (2)", - "tab": "Efficiency", - "score": 0.47148694173254146 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Efficiency", - "score": 0.420210268831006 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.89 (2)", - "tab": "Efficiency", - "score": 0.4451567802673731 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Efficiency", - "score": 0.43410645679191306 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.312 (2)", - "tab": "Efficiency", - "score": 0.6560712812327537 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.574, mean=0.574, max=0.574, sum=1.148 (2)", - "tab": "Efficiency", - "score": 0.5739512143545593 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.892 (2)", - "tab": "Efficiency", - "score": 0.4460442779261038 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.586, mean=0.586, max=0.586, sum=1.171 (2)", - "tab": "Efficiency", - "score": 0.5855172486216934 - }, - "High 
School US History - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.158 (2)", - "tab": "Efficiency", - "score": 0.5790434245969734 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.285 (2)", - "tab": "Efficiency", - "score": 0.6425194448559596 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.677, mean=506.677, max=506.677, sum=1013.355 (2)", - "tab": "General information", - "score": 506.6774193548387 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=489.714, mean=489.714, max=489.714, sum=979.429 (2)", - "tab": "General information", - "score": 489.7142857142857 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)", - "tab": "General information", - "score": 860.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2791.073, mean=2791.073, max=2791.073, sum=5582.145 (2)", - "tab": "General information", - "score": 2791.072727272727 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High 
School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.045, mean=365.045, max=365.045, sum=730.091 (2)", - "tab": "General information", - "score": 365.04545454545456 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=458.824, mean=458.824, max=458.824, sum=917.648 (2)", - "tab": "General information", - "score": 458.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=364.562, mean=364.562, max=364.562, sum=729.123 (2)", - "tab": "General information", - "score": 364.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=525.374, mean=525.374, max=525.374, sum=1050.748 (2)", - "tab": "General information", - "score": 525.3740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School 
Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=392.025, mean=392.025, max=392.025, sum=784.05 (2)", - "tab": "General information", - "score": 392.02521008403363 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.464, mean=553.464, max=553.464, sum=1106.927 (2)", - "tab": "General information", - "score": 553.4635761589404 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.246, mean=488.246, max=488.246, sum=976.492 (2)", - "tab": "General information", - "score": 488.24587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=788.699, mean=788.699, max=788.699, sum=1577.398 (2)", - "tab": "General information", - "score": 788.699074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)", - "tab": "General information", - "score": 2210.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1421.27, mean=1421.27, max=1421.27, sum=2842.54 (2)", - "tab": "General information", - "score": 1421.2700421940929 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.908, mean=0.908, max=0.908, sum=1.817 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.944 (2)", - "tab": "Efficiency", - "score": 0.47213134316585526 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.515, mean=0.515, max=0.515, sum=1.03 (2)", - "tab": "Efficiency", - "score": 0.5152236923916649 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=312.906, mean=312.906, max=312.906, sum=625.812 (2)", - "tab": "General information", - "score": 312.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.183, mean=334.183, max=334.183, sum=668.366 (2)", - "tab": "General information", - "score": 334.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.851 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.523, mean=0.523, max=0.523, sum=1.046 (2)", - "tab": "Efficiency", - "score": 0.5229926621618349 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=632.851, mean=632.851, max=632.851, sum=1265.702 (2)", - "tab": "General information", - "score": 632.8512396694215 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.699, mean=0.699, max=0.699, sum=1.398 (2)", - "tab": "Efficiency", - "score": 0.6990647155083031 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.595, mean=442.595, max=442.595, sum=885.19 (2)", - "tab": "General information", - "score": 442.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.723, mean=0.723, max=0.723, sum=1.446 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.606, mean=0.606, max=0.606, sum=1.211 (2)", - "tab": "Efficiency", - "score": 0.6055374975715365 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)", - "tab": "General information", - "score": 661.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.576, mean=0.576, max=0.576, sum=1.152 (2)", - "tab": "Efficiency", - "score": 0.5760108475546235 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.796, mean=276.796, max=276.796, sum=553.592 (2)", - "tab": "General information", - "score": 276.79611650485435 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.863 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.495, mean=0.495, max=0.495, sum=0.991 (2)", - "tab": "Efficiency", - "score": 0.49540983204148775 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.218, mean=397.218, max=397.218, sum=794.436 (2)", - "tab": "General information", - "score": 397.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.541, mean=0.541, max=0.541, sum=1.082 (2)", - "tab": "Efficiency", - "score": 0.5407642388343811 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=334, mean=334, max=334, sum=668 (2)", - "tab": "General information", - "score": 334.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.946, - "details": { - "description": "min=0.946, mean=0.946, max=0.946, sum=1.893 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.947 (2)", - "tab": "Efficiency", - "score": 0.4736132238103055 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=292.925, mean=292.925, max=292.925, sum=585.849 (2)", - "tab": "General information", - "score": 292.92464878671774 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.631 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.45068276686475456 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.09 (2)", - "tab": "Efficiency", - "score": 0.5448215519249773 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.145, mean=469.145, max=469.145, sum=938.289 (2)", - "tab": "General information", - "score": 469.1445086705202 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)", - 
"tab": "General information", - "score": 649.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.879, - "details": { - "description": "min=0.879, mean=0.879, max=0.879, sum=1.758 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.882 (2)", - "tab": "Efficiency", - "score": 0.4411514296251185 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.817, mean=579.817, max=579.817, sum=1159.634 (2)", - "tab": "General information", - "score": 579.8169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.833 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.978 (2)", - "tab": "Efficiency", - "score": 0.4891524300163175 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=507.559, mean=507.559, max=507.559, sum=1015.117 (2)", - "tab": "General information", - "score": 507.55864197530866 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=1.564 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Efficiency", - "score": 0.46012504534287885 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=398.318, mean=398.318, max=398.318, sum=796.636 (2)", - "tab": "General information", - "score": 398.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - "details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.682 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.546, mean=0.546, max=0.546, sum=1.093 (2)", - "tab": "Efficiency", - "score": 0.546490309189777 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)", - "tab": "General information", - "score": 1157.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "details": { - "description": "min=0.925, mean=0.925, max=0.925, sum=1.851 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.882 (2)", - "tab": "Efficiency", - "score": 0.4410626805243801 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=438.522, mean=438.522, max=438.522, sum=877.045 (2)", - "tab": "General information", - "score": 438.5223880597015 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.181 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Efficiency", - "score": 0.851962562066963 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.09, mean=336.09, max=336.09, sum=672.181 (2)", - "tab": "General information", - "score": 336.0903614457831 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - 
] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.708 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Efficiency", - "score": 0.5664703581068251 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.561, mean=268.561, max=268.561, sum=537.123 (2)", - "tab": "General information", - "score": 268.56140350877195 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json b/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json deleted file mode 100644 index a348a9fb9..000000000 --- a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 Turbo 2024-04-09", - "id": "openai/gpt-4-turbo-2024-04-09", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "description": "min=0.515, mean=0.813, max=0.974, sum=92.65 (114)", - 
"tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.479, mean=0.617, max=0.934, sum=70.3 (114)", - "tab": "Efficiency", - "score": 0.6166649052297876 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=275.561, mean=614.852, max=2798.073, sum=70093.086 (114)", - "tab": "General information", - "score": 614.851634217556 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - 
"mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.08 (2)", - "tab": "Efficiency", - "score": 0.539907853603363 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.44, mean=373.44, max=373.44, sum=746.88 (2)", - "tab": "General information", - "score": 373.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Efficiency", - "score": 0.5299274744810881 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.978, mean=353.978, max=353.978, sum=707.956 (2)", - "tab": "General information", - "score": 353.97777777777776 - }, - "Anatomy - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539, - "details": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.078 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.549, mean=0.549, max=0.549, sum=1.099 (2)", - "tab": "Efficiency", - "score": 0.5493535542488098 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.199 (2)", - "tab": "Efficiency", - "score": 0.5995734184980392 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.382 (2)", - "tab": "Efficiency", - "score": 0.6911867094039917 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.609, mean=0.609, max=0.609, sum=1.219 (2)", - "tab": "Efficiency", - "score": 0.6092576813697815 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.34 (2)", - "tab": "Efficiency", - "score": 0.6697626251705809 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.412 (2)", - "tab": "Efficiency", - "score": 0.7058592660754335 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.4, mean=549.4, max=549.4, sum=1098.8 (2)", - "tab": "General information", - "score": 549.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.917, mean=473.917, max=473.917, sum=947.833 (2)", - "tab": "General information", - "score": 473.9166666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", 
- "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)", - "tab": "General information", - "score": 828.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.52, mean=594.52, max=594.52, sum=1189.04 (2)", - "tab": "General information", - "score": 594.52 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.728, mean=502.728, max=502.728, sum=1005.457 (2)", - "tab": "General information", - "score": 502.728323699422 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.608, mean=503.608, max=503.608, sum=1007.216 (2)", - "tab": "General information", - "score": 503.6078431372549 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.061 (2)", - "tab": "Efficiency", - "score": 0.5303381824493408 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.54, mean=378.54, max=378.54, sum=757.08 (2)", - "tab": "General information", - "score": 378.54 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.351 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.144 (2)", - "tab": "Efficiency", - "score": 0.5721135453173989 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.43, mean=614.43, max=614.43, sum=1228.86 (2)", - "tab": "General information", - "score": 614.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", - "tab": "Accuracy", - "Global Facts - 
Observed inference time (s)": { - "description": "min=0.479, mean=0.479, max=0.479, sum=0.958 (2)", - "tab": "Efficiency", - "score": 0.47900029182434084 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.079 (2)", - "tab": "Efficiency", - "score": 0.5393155504156042 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.639, mean=394.639, max=394.639, sum=789.278 (2)", - "tab": "General information", - "score": 394.6388888888889 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.543, mean=0.543, max=0.543, sum=1.087 (2)", - "tab": "Efficiency", - "score": 0.5434573969273705 - }, - "Philosophy - # eval": { - "description": "min=311, 
mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.873, - "details": { - "description": "min=0.873, mean=0.873, max=0.873, sum=1.745 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Efficiency", - "score": 0.5794552100055358 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.18 (2)", - "tab": "Efficiency", - "score": 0.5898241354218612 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.639, mean=0.639, max=0.639, sum=1.278 (2)", - "tab": "Efficiency", - "score": 0.6388053317424371 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.342 (2)", - "tab": "Efficiency", - "score": 0.6712259284031936 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.585, mean=1094.585, max=1094.585, sum=2189.169 (2)", - "tab": "General information", - "score": 1094.5845588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.592, mean=658.592, max=658.592, sum=1317.184 (2)", - 
"tab": "General information", - "score": 658.5921985815603 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.787, mean=1637.787, max=1637.787, sum=3275.574 (2)", - "tab": "General information", - "score": 1637.7868318122555 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.114, mean=575.114, max=575.114, sum=1150.229 (2)", - "tab": "General information", - "score": 575.1143790849674 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.558, mean=0.558, max=0.558, sum=1.115 (2)", - "tab": "Efficiency", - "score": 0.557673556804657 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - 
"subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.666, mean=0.666, max=0.666, sum=1.332 (2)", - "tab": "Efficiency", - "score": 0.6662032525790366 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.691, mean=579.691, max=579.691, sum=1159.382 (2)", - "tab": "General information", - "score": 579.6907894736842 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.598, mean=0.598, max=0.598, sum=1.196 (2)", - "tab": "Efficiency", - "score": 0.5981367039680481 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.591, mean=0.591, max=0.591, sum=1.183 (2)", - "tab": "Efficiency", - "score": 0.5912713131814633 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.947, mean=397.947, max=397.947, sum=795.894 (2)", - "tab": "General information", - "score": 397.94716981132075 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.787 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.685, mean=0.685, max=0.685, sum=1.369 (2)", - "tab": "Efficiency", - "score": 0.684603402969685 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.838, mean=304.838, max=304.838, sum=609.677 (2)", - "tab": "General information", - "score": 304.83829787234043 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.503 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.297 (2)", - "tab": "Efficiency", - "score": 0.6487039006989578 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=440.641, mean=440.641, max=440.641, sum=881.283 (2)", - "tab": "General information", - "score": 440.6413793103448 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.439 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.708, mean=0.708, max=0.708, sum=1.417 (2)", - "tab": "Efficiency", - "score": 0.708430844009238 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.862, mean=531.862, max=531.862, sum=1063.725 (2)", - "tab": "General information", - "score": 531.8624338624338 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.413 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.635, mean=0.635, max=0.635, sum=1.27 (2)", - "tab": "Efficiency", - "score": 0.6347800322941372 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=606.762, mean=606.762, max=606.762, sum=1213.524 (2)", - "tab": "General information", - "score": 606.7619047619048 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=1.348 (2)", - "tab": "Efficiency", - "score": 0.6741217144073979 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.346 (2)", - "tab": "Efficiency", - "score": 0.6728476491467706 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.626, mean=0.626, max=0.626, sum=1.252 (2)", - "tab": "Efficiency", - "score": 0.6261640882492066 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.747, mean=0.747, max=0.747, sum=1.495 (2)", - "tab": "Efficiency", - "score": 0.7474224538514108 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.335 (2)", - "tab": "Efficiency", - "score": 0.6672574221485793 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.683, mean=0.683, max=0.683, sum=1.366 (2)", - "tab": "Efficiency", - "score": 0.6831059715290762 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.613, mean=0.613, max=0.613, sum=1.226 (2)", - "tab": "Efficiency", - "score": 0.6132381714307344 - }, - "High School Mathematics - Observed inference time (s)": { - "description": 
"min=0.594, mean=0.594, max=0.594, sum=1.188 (2)", - "tab": "Efficiency", - "score": 0.5939316025486698 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=1.169 (2)", - "tab": "Efficiency", - "score": 0.5845635728675778 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Efficiency", - "score": 0.9341671135251886 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.482 (2)", - "tab": "Efficiency", - "score": 0.7410666920723171 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.439 (2)", - "tab": "Efficiency", - "score": 0.7196061655327126 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Efficiency", - "score": 0.7454434785188413 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Efficiency", - "score": 0.6665283818788166 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.677, mean=513.677, max=513.677, sum=1027.355 (2)", - "tab": "General information", - "score": 513.6774193548387 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.714, mean=496.714, max=496.714, sum=993.429 (2)", - "tab": "General information", - "score": 496.7142857142857 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2798.073, mean=2798.073, max=2798.073, sum=5596.145 (2)", - "tab": "General information", - "score": 2798.072727272727 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.045, mean=372.045, max=372.045, sum=744.091 (2)", - "tab": "General information", - "score": 372.04545454545456 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=371.562, mean=371.562, max=371.562, sum=743.123 (2)", - "tab": "General information", - "score": 371.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.374, mean=532.374, max=532.374, sum=1064.748 (2)", - "tab": "General information", - "score": 532.3740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.025, mean=399.025, max=399.025, sum=798.05 (2)", - "tab": "General information", - "score": 399.02521008403363 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.464, mean=560.464, max=560.464, sum=1120.927 (2)", - "tab": "General information", - "score": 560.4635761589404 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.246, mean=495.246, max=495.246, sum=990.492 (2)", - "tab": "General information", - "score": 495.24587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.699, mean=795.699, max=795.699, sum=1591.398 (2)", - "tab": "General information", - "score": 795.699074074074 - }, - "High School 
Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.27, mean=1428.27, max=1428.27, sum=2856.54 (2)", - "tab": "General information", - "score": 1428.2700421940929 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.313 (2)", - "tab": "Efficiency", - "score": 0.6564141239286003 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.613, mean=0.613, max=0.613, sum=1.226 (2)", - "tab": "Efficiency", - "score": 0.6131143715545422 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.906, mean=319.906, max=319.906, sum=639.812 (2)", - "tab": "General information", - "score": 319.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 
(2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.183, mean=341.183, max=341.183, sum=682.366 (2)", - "tab": "General information", - "score": 341.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.884 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.63, mean=0.63, max=0.63, sum=1.26 (2)", - "tab": "Efficiency", - "score": 0.6297830116650289 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.851, mean=639.851, max=639.851, sum=1279.702 (2)", - "tab": "General information", - "score": 639.8512396694215 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=1.171 (2)", - "tab": "Efficiency", - "score": 0.585445927695994 - }, - "Logical Fallacies - # eval": { - "description": "min=163, 
mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.595, mean=449.595, max=449.595, sum=899.19 (2)", - "tab": "General information", - "score": 449.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.482 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.436 (2)", - "tab": "Efficiency", - "score": 0.718035706451961 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.767 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.592, mean=0.592, max=0.592, sum=1.184 (2)", - "tab": "Efficiency", - "score": 0.5921963488013999 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, 
mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.796, mean=283.796, max=283.796, sum=567.592 (2)", - "tab": "General information", - "score": 283.79611650485435 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.897 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.588, mean=0.588, max=0.588, sum=1.176 (2)", - "tab": "Efficiency", - "score": 0.5880082672477788 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Efficiency", - "score": 0.5201336288452149 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical 
Genetics - # prompt tokens": { - "description": "min=341, mean=341, max=341, sum=682 (2)", - "tab": "General information", - "score": 341.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "details": { - "description": "min=0.945, mean=0.945, max=0.945, sum=1.89 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.565, mean=0.565, max=0.565, sum=1.13 (2)", - "tab": "Efficiency", - "score": 0.5650817577561809 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.925, mean=299.925, max=299.925, sum=599.849 (2)", - "tab": "General information", - "score": 299.92464878671774 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.803, - "details": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.607 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.564, mean=0.564, max=0.564, sum=1.129 (2)", - "tab": "Efficiency", - "score": 0.5643301023913256 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.599, mean=0.599, max=0.599, sum=1.197 (2)", - "tab": "Efficiency", - "score": 0.5985688052363902 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # 
prompt tokens": { - "description": "min=476.145, mean=476.145, max=476.145, sum=952.289 (2)", - "tab": "General information", - "score": 476.1445086705202 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.892, - "details": { - "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.532, mean=0.532, max=0.532, sum=1.063 (2)", - "tab": "Efficiency", - "score": 0.5316595968857311 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.817, mean=586.817, max=586.817, sum=1173.634 (2)", - "tab": "General information", - "score": 586.8169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, 
max=0.54, sum=1.079 (2)", - "tab": "Efficiency", - "score": 0.5397091279795141 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.559, mean=514.559, max=514.559, sum=1029.117 (2)", - "tab": "General information", - "score": 514.5586419753087 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.168 (2)", - "tab": "Efficiency", - "score": 0.5840315688740123 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.529, mean=0.529, max=0.529, sum=1.058 (2)", - "tab": "Efficiency", - "score": 0.529095221538933 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - 
"tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "details": { - "description": "min=0.915, mean=0.915, max=0.915, sum=1.831 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Efficiency", - "score": 0.5199050891458692 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.522, mean=445.522, max=445.522, sum=891.045 (2)", - "tab": "General information", - "score": 445.5223880597015 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602, - "details": { - "description": "min=0.602, mean=0.602, max=0.602, sum=1.205 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.523, mean=0.523, max=0.523, sum=1.045 (2)", - "tab": "Efficiency", - "score": 0.5226844951330897 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.09, mean=343.09, max=343.09, sum=686.181 (2)", - "tab": "General information", - "score": 343.0903614457831 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.494, mean=0.494, max=0.494, sum=0.988 (2)", - "tab": "Efficiency", - "score": 0.49407080739562276 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=275.561, mean=275.561, max=275.561, sum=551.123 (2)", - "tab": "General information", - "score": 275.56140350877195 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.351, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json b/data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json deleted file mode 100644 index 76ba53d53..000000000 --- a/data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-05-13/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - 
"source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o 2024-05-13", - "id": "openai/gpt-4o-2024-05-13", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.47, mean=0.842, max=0.979, sum=95.957 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.314, mean=0.37, max=0.515, sum=42.144 (114)", - "tab": "Efficiency", - "score": 0.3696883367683005 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=267.936, mean=612.332, max=2793.83, sum=69805.818 (114)", - "tab": "General information", - "score": 612.3317391408493 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - 
"mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.761 (2)", - "tab": "Efficiency", - "score": 0.38067533016204835 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=374.53, mean=374.53, max=374.53, sum=749.06 (2)", - "tab": "General information", - "score": 374.53 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - 
"description": "min=0.911, mean=0.911, max=0.911, sum=1.822 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.666 (2)", - "tab": "Efficiency", - "score": 0.3328125264909532 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=350.6, mean=350.6, max=350.6, sum=701.2 (2)", - "tab": "General information", - "score": 350.6 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.373 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.473, mean=0.473, max=0.473, sum=0.947 (2)", - "tab": "Efficiency", - "score": 0.4733888053894043 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.855 (2)", - "tab": "Efficiency", - "score": 0.4276181277301576 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.734 (2)", - "tab": "Efficiency", - "score": 0.36701245784759523 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3324534225463867 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.3647800649521668 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.699 (2)", - "tab": "Efficiency", - "score": 0.3492975866093355 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=552.07, mean=552.07, max=552.07, sum=1104.14 (2)", - "tab": "General information", - "score": 552.07 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College 
Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=468.056, mean=468.056, max=468.056, sum=936.111 (2)", - "tab": "General information", - "score": 468.05555555555554 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)", - "tab": "General information", - "score": 828.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.44, mean=594.44, max=594.44, sum=1188.88 (2)", - "tab": "General information", - "score": 594.44 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=499.566, mean=499.566, max=499.566, sum=999.133 (2)", - "tab": "General information", - "score": 499.5664739884393 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=502.412, mean=502.412, 
max=502.412, sum=1004.824 (2)", - "tab": "General information", - "score": 502.4117647058824 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Efficiency", - "score": 0.35994538068771365 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=373.42, mean=373.42, max=373.42, sum=746.84 (2)", - "tab": "General information", - "score": 373.42 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.386 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.709 (2)", - "tab": "Efficiency", - "score": 0.3544190766518576 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=613.228, mean=613.228, max=613.228, sum=1226.456 (2)", - "tab": "General information", - "score": 613.2280701754386 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.28 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.653 (2)", - "tab": "Efficiency", - "score": 0.3264468240737915 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.69, mean=399.69, max=399.69, sum=799.38 (2)", - "tab": "General information", - "score": 399.69 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.815 (2)", - "tab": "Efficiency", - "score": 0.40749982330534196 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=391.231, mean=391.231, max=391.231, sum=782.463 (2)", - "tab": "General information", - "score": 391.23148148148147 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.963 (2)", - "tab": "Efficiency", - "score": 0.48153685373508665 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=327.92, mean=327.92, max=327.92, sum=655.839 (2)", - "tab": "General information", - "score": 327.91961414790995 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.81 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.3862454724662444 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.694 (2)", - "tab": "Efficiency", - "score": 0.3472177982330322 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.947 (2)", - "tab": "Efficiency", - "score": 0.47372100343915596 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.661 (2)", - "tab": "Efficiency", - "score": 0.330327843528947 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1071.18, 
mean=1071.18, max=1071.18, sum=2142.36 (2)", - "tab": "General information", - "score": 1071.1801470588234 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=657.206, mean=657.206, max=657.206, sum=1314.411 (2)", - "tab": "General information", - "score": 657.2056737588653 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1629.344, mean=1629.344, max=1629.344, sum=3258.687 (2)", - "tab": "General information", - "score": 1629.3435462842242 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=574.518, mean=574.518, max=574.518, sum=1149.036 (2)", - "tab": "General information", - "score": 574.5179738562091 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - 
"score": 0.335811505317688 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=421.71, mean=421.71, max=421.71, sum=843.42 (2)", - "tab": "General information", - "score": 421.71 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.34870150528456034 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=577.349, mean=577.349, max=577.349, sum=1154.697 (2)", - "tab": "General information", - "score": 577.3486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.3450936794281006 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=565.7, mean=565.7, max=565.7, sum=1131.4 (2)", - "tab": "General information", - "score": 565.7 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.789 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.662 (2)", - "tab": "Efficiency", - "score": 0.33114023748433813 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=400.985, mean=400.985, max=400.985, sum=801.97 (2)", - "tab": "General information", - "score": 400.98490566037736 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.821 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.34625059391589874 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual 
Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.677, mean=304.677, max=304.677, sum=609.353 (2)", - "tab": "General information", - "score": 304.67659574468087 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.717 (2)", - "tab": "Efficiency", - "score": 0.35874251661629514 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=439.228, mean=439.228, max=439.228, sum=878.455 (2)", - "tab": "General information", - "score": 439.22758620689655 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.481 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.360492156926917 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { 
- "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=532.683, mean=532.683, max=532.683, sum=1065.365 (2)", - "tab": "General information", - "score": 532.6825396825396 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.683, mean=0.683, max=0.683, sum=1.365 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.915 (2)", - "tab": "Efficiency", - "score": 0.4577372566102043 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=604.492, mean=604.492, max=604.492, sum=1208.984 (2)", - "tab": "General information", - "score": 604.4920634920635 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "details": { - "description": "min=0.945, mean=0.945, max=0.945, sum=1.89 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.844 (2)", - "tab": "Efficiency", - "score": 0.42223084818932316 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.703 (2)", - "tab": "Efficiency", - "score": 0.3515606560730582 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Efficiency", - "score": 0.39000784397125243 - }, - "High School European History - Observed inference time (s)": { - 
"description": "min=0.515, mean=0.515, max=0.515, sum=1.029 (2)", - "tab": "Efficiency", - "score": 0.5147185542366721 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.34874117615247013 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.85 (2)", - "tab": "Efficiency", - "score": 0.4252293505199215 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.3419678932581192 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.3482617440047088 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.819 (2)", - "tab": "Efficiency", - "score": 0.4096046676154898 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.731 (2)", - "tab": "Efficiency", - "score": 0.36535484427647874 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.3435875463923183 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.3434795880759204 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.508, mean=0.508, max=0.508, sum=1.016 (2)", - "tab": "Efficiency", - "score": 0.5077870616725847 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.891 (2)", - "tab": "Efficiency", - "score": 0.44530287473010616 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=504.874, mean=504.874, max=504.874, sum=1009.748 (2)", - "tab": "General information", - "score": 504.8741935483871 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=495.34, mean=495.34, max=495.34, sum=990.68 (2)", - "tab": "General information", - "score": 495.3399014778325 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - 
"High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=865.8, mean=865.8, max=865.8, sum=1731.6 (2)", - "tab": "General information", - "score": 865.8 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2793.83, mean=2793.83, max=2793.83, sum=5587.661 (2)", - "tab": "General information", - "score": 2793.830303030303 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.783, mean=372.783, max=372.783, sum=745.566 (2)", - "tab": "General information", - "score": 372.7828282828283 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=463.01, mean=463.01, max=463.01, sum=926.021 (2)", - "tab": "General information", - "score": 463.0103626943005 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High 
School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=371.451, mean=371.451, max=371.451, sum=742.903 (2)", - "tab": "General information", - "score": 371.4512820512821 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.456, mean=532.456, max=532.456, sum=1064.911 (2)", - "tab": "General information", - "score": 532.4555555555555 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=398.739, mean=398.739, max=398.739, sum=797.479 (2)", - "tab": "General information", - "score": 398.73949579831935 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.238, mean=560.238, max=560.238, sum=1120.477 (2)", - "tab": "General information", - "score": 560.2384105960265 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=492.917, mean=492.917, max=492.917, sum=985.835 (2)", - "tab": "General information", - "score": 492.91743119266056 - }, - "High School Psychology - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=787.574, mean=787.574, max=787.574, sum=1575.148 (2)", - "tab": "General information", - "score": 787.574074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2220.005, mean=2220.005, max=2220.005, sum=4440.01 (2)", - "tab": "General information", - "score": 2220.0049019607845 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1424.439, mean=1424.439, max=1424.439, sum=2848.878 (2)", - "tab": "General information", - "score": 1424.4388185654009 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.908, mean=0.908, max=0.908, sum=1.817 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.793 (2)", - "tab": "Efficiency", - "score": 0.39673851637562296 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, 
max=0.372, sum=0.744 (2)", - "tab": "Efficiency", - "score": 0.37223931305281077 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=316.453, mean=316.453, max=316.453, sum=632.906 (2)", - "tab": "General information", - "score": 316.4529147982063 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=335.695, mean=335.695, max=335.695, sum=671.389 (2)", - "tab": "General information", - "score": 335.69465648854964 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.336965306731295 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.504, mean=639.504, max=639.504, sum=1279.008 (2)", - "tab": "General information", - "score": 639.5041322314049 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.767 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.3214270746781051 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=445.84, mean=445.84, max=445.84, sum=891.681 (2)", - "tab": "General information", - "score": 445.840490797546 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=1.536 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.657 (2)", - "tab": "Efficiency", - "score": 0.3284116280930383 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=666.205, mean=666.205, max=666.205, sum=1332.411 (2)", - "tab": "General information", - "score": 666.2053571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.883 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.64 (2)", - "tab": "Efficiency", - "score": 0.32008614354920617 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=279.485, mean=279.485, max=279.485, sum=558.971 (2)", - "tab": "General information", - "score": 279.4854368932039 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.936, - "details": { - "description": "min=0.936, mean=0.936, max=0.936, sum=1.872 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.675 (2)", - "tab": "Efficiency", - "score": 0.3374974228378035 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=399.85, mean=399.85, max=399.85, sum=799.701 (2)", - "tab": "General information", - "score": 399.85042735042737 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.33016372203826905 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=343.23, mean=343.23, max=343.23, sum=686.46 (2)", - "tab": "General information", - "score": 343.23 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.954, - "details": { - "description": "min=0.954, mean=0.954, max=0.954, sum=1.908 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.335910246898997 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=296.479, mean=296.479, max=296.479, sum=592.958 (2)", - "tab": "General information", - "score": 296.47892720306515 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - "details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.683 (2)", 
- "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.667 (2)", - "tab": "Efficiency", - "score": 0.3332573719796418 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.3436078146183291 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=474.835, mean=474.835, max=474.835, sum=949.671 (2)", - "tab": "General information", - "score": 474.83526011560696 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=655.068, mean=655.068, max=655.068, sum=1310.136 (2)", - "tab": "General information", - "score": 655.068156424581 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.899, - "details": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.797 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.737 (2)", - "tab": "Efficiency", - "score": 0.36828617722380397 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=581.997, mean=581.997, max=581.997, sum=1163.993 (2)", - "tab": "General information", - "score": 581.9967320261438 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "description": "min=0.938, mean=0.938, max=0.938, sum=1.877 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.635 (2)", - "tab": "Efficiency", - "score": 0.31765871430620735 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=513.944, mean=513.944, max=513.944, sum=1027.889 (2)", - "tab": "General information", - "score": 513.9444444444445 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.699 (2)", - "tab": "Efficiency", - "score": 0.3496434450149536 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=402.918, mean=402.918, max=402.918, sum=805.836 (2)", - "tab": "General information", - "score": 402.91818181818184 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - 
"evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.837, - "details": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.673 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.3501845612817881 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1166.686, mean=1166.686, max=1166.686, sum=2333.371 (2)", - "tab": "General information", - "score": 1166.6857142857143 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.881 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.346723644294549 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=444.269, mean=444.269, max=444.269, sum=888.537 (2)", - "tab": "General information", - "score": 444.2686567164179 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.628 (2)", - "tab": "Efficiency", - "score": 0.3142197634800371 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=334.434, mean=334.434, max=334.434, sum=668.867 (2)", - "tab": "General information", - "score": 334.43373493975906 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.664 (2)", - "tab": "Efficiency", - "score": 0.3320118307370191 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=267.936, mean=267.936, max=267.936, sum=535.871 (2)", - "tab": "General information", - "score": 267.9356725146199 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model 
outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json b/data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json deleted file mode 100644 index 2d538eb02..000000000 --- a/data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-08-06/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o 2024-08-06", - "id": "openai/gpt-4o-2024-08-06", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.481, mean=0.843, max=0.984, sum=96.141 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.301, mean=0.459, max=0.88, sum=52.346 (114)", - "tab": "Efficiency", - "score": 0.45917774780314197 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=267.936, mean=612.332, max=2793.83, sum=69805.818 (114)", - "tab": "General information", - "score": 612.3317391408493 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - 
"high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Efficiency", - "score": 0.3350093102455139 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=374.53, 
mean=374.53, max=374.53, sum=749.06 (2)", - "tab": "General information", - "score": 374.53 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.822 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.508, mean=0.508, max=0.508, sum=1.015 (2)", - "tab": "Efficiency", - "score": 0.5075124228442157 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=350.6, mean=350.6, max=350.6, sum=701.2 (2)", - "tab": "General information", - "score": 350.6 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.373 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.818 (2)", - "tab": "Efficiency", - "score": 0.4090025806427002 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.82 (2)", - "tab": "Efficiency", - "score": 0.40991874204741585 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.869, mean=0.869, max=0.869, sum=1.739 (2)", - "tab": "Efficiency", - "score": 0.8693285202980041 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.964 (2)", - "tab": "Efficiency", - "score": 0.4821875333786011 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.791 (2)", - "tab": "Efficiency", - "score": 0.3955839837906678 - }, - "College Physics - 
Observed inference time (s)": { - "description": "min=0.531, mean=0.531, max=0.531, sum=1.062 (2)", - "tab": "Efficiency", - "score": 0.5307925659067491 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=552.07, mean=552.07, max=552.07, sum=1104.14 (2)", - "tab": "General information", - "score": 552.07 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=468.056, mean=468.056, max=468.056, sum=936.111 (2)", - "tab": "General information", - "score": 468.05555555555554 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)", - "tab": "General information", - "score": 828.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.44, mean=594.44, max=594.44, sum=1188.88 (2)", - "tab": "General information", - "score": 594.44 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=499.566, mean=499.566, max=499.566, sum=999.133 (2)", - "tab": "General information", - "score": 499.5664739884393 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=502.412, mean=502.412, max=502.412, sum=1004.824 (2)", - "tab": "General information", - "score": 502.4117647058824 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.004 (2)", - "tab": "Efficiency", - "score": 0.5020688962936402 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=373.42, mean=373.42, max=373.42, sum=746.84 (2)", - "tab": "General information", - "score": 373.42 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": 
"min=0.711, mean=0.711, max=0.711, sum=1.421 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.89 (2)", - "tab": "Efficiency", - "score": 0.44516249497731525 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=613.228, mean=613.228, max=613.228, sum=1226.456 (2)", - "tab": "General information", - "score": 613.2280701754386 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.3012181663513184 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.69, mean=399.69, max=399.69, sum=799.38 (2)", - "tab": "General information", - "score": 399.69 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=1.815 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.776 (2)", - "tab": "Efficiency", - 
"score": 0.3880515495936076 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=391.231, mean=391.231, max=391.231, sum=782.463 (2)", - "tab": "General information", - "score": 391.23148148148147 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.788 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.965 (2)", - "tab": "Efficiency", - "score": 0.48272855795464714 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=327.92, mean=327.92, max=327.92, sum=655.839 (2)", - "tab": "General information", - "score": 327.91961414790995 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.899, - "details": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.797 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.897 (2)", - "tab": "Efficiency", - "score": 0.4483548367724699 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.419, mean=0.419, max=0.419, sum=0.839 (2)", - "tab": "Efficiency", - 
"score": 0.4192587585313946 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.462, mean=0.462, max=0.462, sum=0.924 (2)", - "tab": "Efficiency", - "score": 0.462134175381418 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.036 (2)", - "tab": "Efficiency", - "score": 0.5180651210491953 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1071.18, mean=1071.18, max=1071.18, sum=2142.36 (2)", - "tab": "General information", - "score": 1071.1801470588234 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=657.206, mean=657.206, max=657.206, sum=1314.411 (2)", - "tab": "General information", - "score": 657.2056737588653 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1629.344, mean=1629.344, max=1629.344, sum=3258.687 (2)", - "tab": "General information", - "score": 1629.3435462842242 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=574.518, mean=574.518, max=574.518, sum=1149.036 (2)", - "tab": "General information", - "score": 574.5179738562091 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.512, mean=0.512, max=0.512, sum=1.025 (2)", - "tab": "Efficiency", - "score": 0.5122887134552002 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=421.71, mean=421.71, max=421.71, sum=843.42 (2)", - "tab": "General information", - "score": 421.71 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.947, - "details": { - "description": "min=0.947, mean=0.947, max=0.947, sum=1.895 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.4347311226945174 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=577.349, mean=577.349, max=577.349, sum=1154.697 (2)", - "tab": "General information", - "score": 577.3486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, 
- { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Efficiency", - "score": 0.5199928903579711 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=565.7, mean=565.7, max=565.7, sum=1131.4 (2)", - "tab": "General information", - "score": 565.7 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.789 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.613 (2)", - "tab": "Efficiency", - "score": 0.3066561905842907 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=400.985, mean=400.985, max=400.985, sum=801.97 (2)", - "tab": "General information", - "score": 400.98490566037736 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923, - "details": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.847 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.763 (2)", - "tab": "Efficiency", - "score": 0.3812521427235705 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.677, mean=304.677, max=304.677, sum=609.353 (2)", - "tab": "General information", - "score": 304.67659574468087 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.586 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.874 (2)", - "tab": "Efficiency", - "score": 0.4368692447399271 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=439.228, mean=439.228, max=439.228, sum=878.455 (2)", - "tab": "General information", - "score": 439.22758620689655 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.775, mean=0.775, max=0.775, sum=1.55 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.747 (2)", - "tab": "Efficiency", - "score": 0.37356801449306426 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=532.683, mean=532.683, max=532.683, sum=1065.365 (2)", - "tab": "General information", - "score": 532.6825396825396 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.349 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.3414205180274116 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=604.492, mean=604.492, max=604.492, sum=1208.984 (2)", - "tab": "General information", - "score": 604.4920634920635 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.511, mean=0.511, max=0.511, sum=1.021 (2)", - "tab": "Efficiency", - "score": 0.5105965960410334 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.676 (2)", - "tab": "Efficiency", - "score": 0.3379564614131533 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Efficiency", - "score": 0.3969814705848694 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.594, mean=0.594, max=0.594, sum=1.189 (2)", - "tab": "Efficiency", - "score": 0.5944608587207216 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.706 (2)", - "tab": "Efficiency", - "score": 0.3532402262543187 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)", - "tab": "Efficiency", - "score": 0.8798744147305662 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.501, mean=0.501, max=0.501, sum=1.003 (2)", - "tab": "Efficiency", - "score": 0.501340057911017 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.944 (2)", - "tab": "Efficiency", - "score": 0.4721549925980745 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.812 (2)", - "tab": "Efficiency", - "score": 0.4058714473948759 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.968 (2)", - "tab": "Efficiency", - "score": 0.48384577075377205 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.532, mean=0.532, max=0.532, sum=1.063 (2)", - "tab": "Efficiency", - "score": 0.5316181160988064 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.036 (2)", - "tab": "Efficiency", - "score": 0.5179998201352579 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.573, mean=0.573, max=0.573, sum=1.147 (2)", - "tab": "Efficiency", - "score": 0.5734734535217285 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.461, mean=0.461, max=0.461, sum=0.923 (2)", - "tab": "Efficiency", - "score": 0.4614185592796229 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=504.874, mean=504.874, max=504.874, sum=1009.748 (2)", - "tab": "General information", - "score": 504.8741935483871 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=495.34, mean=495.34, max=495.34, sum=990.68 (2)", - "tab": "General information", - "score": 495.3399014778325 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=865.8, mean=865.8, max=865.8, sum=1731.6 (2)", - "tab": "General information", - "score": 865.8 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2793.83, mean=2793.83, max=2793.83, sum=5587.661 (2)", - "tab": "General information", - "score": 2793.830303030303 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.783, mean=372.783, max=372.783, sum=745.566 (2)", - "tab": "General information", - "score": 372.7828282828283 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, 
mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=463.01, mean=463.01, max=463.01, sum=926.021 (2)", - "tab": "General information", - "score": 463.0103626943005 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=371.451, mean=371.451, max=371.451, sum=742.903 (2)", - "tab": "General information", - "score": 371.4512820512821 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.456, mean=532.456, max=532.456, sum=1064.911 (2)", - "tab": "General information", - "score": 532.4555555555555 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=398.739, mean=398.739, max=398.739, sum=797.479 (2)", - "tab": "General information", - "score": 398.73949579831935 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.238, mean=560.238, max=560.238, sum=1120.477 (2)", - "tab": "General information", - "score": 560.2384105960265 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=492.917, mean=492.917, max=492.917, sum=985.835 (2)", - "tab": "General information", - "score": 492.91743119266056 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=787.574, mean=787.574, max=787.574, sum=1575.148 (2)", - "tab": "General information", - "score": 787.574074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2220.005, mean=2220.005, max=2220.005, sum=4440.01 (2)", - "tab": "General information", - "score": 2220.0049019607845 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1424.439, mean=1424.439, max=1424.439, sum=2848.878 (2)", - "tab": "General information", - "score": 1424.4388185654009 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.807 (2)", - "tab": "Efficiency", - "score": 0.4033327327180871 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Efficiency", - "score": 0.3971163625935562 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=316.453, mean=316.453, max=316.453, sum=632.906 (2)", - "tab": "General information", - "score": 316.4529147982063 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=335.695, mean=335.695, max=335.695, sum=671.389 (2)", - "tab": "General information", - "score": 335.69465648854964 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.884 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.875 (2)", - 
"tab": "Efficiency", - "score": 0.4373398063596615 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.504, mean=639.504, max=639.504, sum=1279.008 (2)", - "tab": "General information", - "score": 639.5041322314049 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.804 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.89 (2)", - "tab": "Efficiency", - "score": 0.44485992888000114 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=445.84, mean=445.84, max=445.84, sum=891.681 (2)", - "tab": "General information", - "score": 445.840490797546 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=1.554 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.829 (2)", - "tab": "Efficiency", - "score": 0.41432228897299084 - }, - "Machine Learning - # eval": { - "description": 
"min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=666.205, mean=666.205, max=666.205, sum=1332.411 (2)", - "tab": "General information", - "score": 666.2053571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Efficiency", - "score": 0.4598746878429524 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=279.485, mean=279.485, max=279.485, sum=558.971 (2)", - "tab": "General information", - "score": 279.4854368932039 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.481, mean=0.481, max=0.481, sum=0.962 (2)", - "tab": "Efficiency", - "score": 0.4812224573559231 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 
5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=399.85, mean=399.85, max=399.85, sum=799.701 (2)", - "tab": "General information", - "score": 399.85042735042737 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.98, - "details": { - "description": "min=0.98, mean=0.98, max=0.98, sum=1.96 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.85 (2)", - "tab": "Efficiency", - "score": 0.42490904808044433 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=343.23, mean=343.23, max=343.23, sum=686.46 (2)", - "tab": "General information", - "score": 343.23 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.958, - "details": { - "description": "min=0.958, mean=0.958, max=0.958, sum=1.916 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.915 (2)", - "tab": "Efficiency", - "score": 0.457414278734385 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - 
"description": "min=296.479, mean=296.479, max=296.479, sum=592.958 (2)", - "tab": "General information", - "score": 296.47892720306515 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.604 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.727 (2)", - "tab": "Efficiency", - "score": 0.3637407087866282 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.462, mean=0.462, max=0.462, sum=0.924 (2)", - "tab": "Efficiency", - "score": 0.46217673823820143 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=474.835, mean=474.835, max=474.835, sum=949.671 (2)", - "tab": "General information", - "score": 474.83526011560696 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=655.068, mean=655.068, max=655.068, sum=1310.136 (2)", - "tab": "General information", - "score": 655.068156424581 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.81 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.847 (2)", - "tab": "Efficiency", - "score": 0.42327408541261763 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=581.997, mean=581.997, max=581.997, sum=1163.993 (2)", - "tab": "General information", - "score": 581.9967320261438 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.935, - "details": { - "description": "min=0.935, mean=0.935, max=0.935, sum=1.87 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.972 (2)", - "tab": "Efficiency", - "score": 0.48604018452726766 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=513.944, mean=513.944, max=513.944, sum=1027.889 (2)", - "tab": "General information", - "score": 513.9444444444445 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=1.564 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.944 
(2)", - "tab": "Efficiency", - "score": 0.47211467786268757 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=402.918, mean=402.918, max=402.918, sum=805.836 (2)", - "tab": "General information", - "score": 402.91818181818184 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.665 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.905 (2)", - "tab": "Efficiency", - "score": 0.45247335336646255 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1166.686, mean=1166.686, max=1166.686, sum=2333.371 (2)", - "tab": "General information", - "score": 1166.6857142857143 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "details": { - "description": "min=0.945, mean=0.945, max=0.945, sum=1.891 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.479, mean=0.479, max=0.479, sum=0.958 (2)", - "tab": "Efficiency", - "score": 0.4788183940583794 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, 
sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=444.269, mean=444.269, max=444.269, sum=888.537 (2)", - "tab": "General information", - "score": 444.2686567164179 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.473, mean=0.473, max=0.473, sum=0.945 (2)", - "tab": "Efficiency", - "score": 0.47254319794206734 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=334.434, mean=334.434, max=334.434, sum=668.867 (2)", - "tab": "General information", - "score": 334.43373493975906 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.766 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.815 (2)", - "tab": "Efficiency", - "score": 0.4075693944741411 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=267.936, mean=267.936, max=267.936, sum=535.871 (2)", - "tab": "General information", - "score": 267.9356725146199 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json b/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json deleted file mode 100644 index 7753003a8..000000000 --- a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-mini-2024-07-18/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o mini 2024-07-18", - "id": "openai/gpt-4o-mini-2024-07-18", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.767, - "details": { - "description": "min=0.419, mean=0.767, max=0.959, sum=87.464 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.26, mean=0.334, max=0.733, sum=38.043 (114)", - "tab": "Efficiency", - "score": 0.3337143530055209 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=267.936, mean=612.332, max=2793.83, sum=69805.818 (114)", - "tab": "General information", - "score": 
612.3317391408493 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42, - "details": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.584 (2)", - "tab": "Efficiency", - "score": 0.29186195611953736 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=374.53, mean=374.53, max=374.53, sum=749.06 (2)", - "tab": "General information", - "score": 374.53 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.541 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.564 (2)", - "tab": "Efficiency", - "score": 0.282137664159139 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=350.6, mean=350.6, max=350.6, sum=701.2 (2)", - "tab": "General information", - "score": 350.6 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, mean=0.559, max=0.559, 
sum=1.118 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.30902551651000976 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.31521839068995583 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.641 (2)", - "tab": "Efficiency", - "score": 0.3206118988990784 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.31047542572021486 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.625 (2)", - "tab": "Efficiency", - "score": 0.31259707081524624 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.573 (2)", - "tab": "Efficiency", - "score": 0.2866650983399036 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=552.07, mean=552.07, max=552.07, sum=1104.14 (2)", - "tab": "General information", - "score": 552.07 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=468.056, mean=468.056, max=468.056, sum=936.111 (2)", - "tab": "General information", - "score": 468.05555555555554 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)", - "tab": "General information", - "score": 828.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - 
}, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.44, mean=594.44, max=594.44, sum=1188.88 (2)", - "tab": "General information", - "score": 594.44 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=499.566, mean=499.566, max=499.566, sum=999.133 (2)", - "tab": "General information", - "score": 499.5664739884393 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=502.412, mean=502.412, max=502.412, sum=1004.824 (2)", - "tab": "General information", - "score": 502.4117647058824 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.29681269884109496 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - 
"description": "min=373.42, mean=373.42, max=373.42, sum=746.84 (2)", - "tab": "General information", - "score": 373.42 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.298 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.29936775199153964 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=613.228, mean=613.228, max=613.228, sum=1226.456 (2)", - "tab": "General information", - "score": 613.2280701754386 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45, - "details": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.9 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.539 (2)", - "tab": "Efficiency", - "score": 0.269585702419281 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.69, mean=399.69, max=399.69, sum=799.38 (2)", - "tab": "General information", - "score": 399.69 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", 
- "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.3047747744454278 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=391.231, mean=391.231, max=391.231, sum=782.463 (2)", - "tab": "General information", - "score": 391.23148148148147 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.543 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.578 (2)", - "tab": "Efficiency", - "score": 0.28879288308490125 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=327.92, mean=327.92, max=327.92, sum=655.839 (2)", - "tab": "General information", - "score": 327.91961414790995 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": "Efficiency", - "score": 0.30609772924114675 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.624 (2)", - "tab": "Efficiency", - "score": 0.31189272336080565 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)", - "tab": "Efficiency", - "score": 0.32692549234885127 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.858 (2)", - "tab": "Efficiency", - "score": 0.42903122792836107 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1071.18, mean=1071.18, max=1071.18, sum=2142.36 (2)", - "tab": "General information", - "score": 1071.1801470588234 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=657.206, mean=657.206, max=657.206, sum=1314.411 (2)", - "tab": "General information", - "score": 657.2056737588653 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1629.344, mean=1629.344, max=1629.344, sum=3258.687 (2)", - "tab": "General information", - "score": 1629.3435462842242 - 
}, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=574.518, mean=574.518, max=574.518, sum=1149.036 (2)", - "tab": "General information", - "score": 574.5179738562091 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.29943873405456545 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=421.71, mean=421.71, max=421.71, sum=843.42 (2)", - "tab": "General information", - "score": 421.71 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.697 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": 
"Efficiency", - "score": 0.30577954336216573 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=577.349, mean=577.349, max=577.349, sum=1154.697 (2)", - "tab": "General information", - "score": 577.3486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.3009026026725769 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=565.7, mean=565.7, max=565.7, sum=1131.4 (2)", - "tab": "General information", - "score": 565.7 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.585 (2)", - "tab": "Efficiency", - "score": 0.29226316685946485 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - 
"Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=400.985, mean=400.985, max=400.985, sum=801.97 (2)", - "tab": "General information", - "score": 400.98490566037736 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.583 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Efficiency", - "score": 0.26024563261803163 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.677, mean=304.677, max=304.677, sum=609.353 (2)", - "tab": "General information", - "score": 304.67659574468087 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.462 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.287484780673323 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, 
max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=439.228, mean=439.228, max=439.228, sum=878.455 (2)", - "tab": "General information", - "score": 439.22758620689655 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.651, - "details": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.302 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": "Efficiency", - "score": 0.305813713679238 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=532.683, mean=532.683, max=532.683, sum=1065.365 (2)", - "tab": "General information", - "score": 532.6825396825396 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.556, - "details": { - "description": "min=0.556, mean=0.556, max=0.556, sum=1.111 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.355, mean=0.355, max=0.355, sum=0.711 (2)", - "tab": "Efficiency", - "score": 0.3554064962599013 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=604.492, mean=604.492, max=604.492, sum=1208.984 (2)", - "tab": "General information", - "score": 604.4920634920635 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.765 (2)", - "tab": "Efficiency", - "score": 0.3826789717520437 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.3358421137767472 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.714 (2)", - "tab": "Efficiency", - "score": 0.3572020483016968 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.883 (2)", - "tab": "Efficiency", - "score": 0.44169029033545293 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.663 (2)", - "tab": "Efficiency", - "score": 0.33136808029328935 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.62 (2)", - "tab": "Efficiency", - "score": 0.31024189563612864 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.30249478022257487 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.273, mean=0.273, max=0.273, sum=0.546 (2)", - "tab": "Efficiency", - "score": 0.2731299541614674 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.34336654078058837 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.554 (2)", - "tab": "Efficiency", - "score": 0.27723274167799794 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.3419263616614385 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.41491677584471526 - }, - "High School US History - 
Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.735 (2)", - "tab": "Efficiency", - "score": 0.3674813041500017 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.678 (2)", - "tab": "Efficiency", - "score": 0.33923840120371884 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=504.874, mean=504.874, max=504.874, sum=1009.748 (2)", - "tab": "General information", - "score": 504.8741935483871 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=495.34, mean=495.34, max=495.34, sum=990.68 (2)", - "tab": "General information", - "score": 495.3399014778325 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=865.8, mean=865.8, max=865.8, sum=1731.6 (2)", - "tab": "General information", - "score": 865.8 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2793.83, mean=2793.83, max=2793.83, sum=5587.661 (2)", - "tab": "General information", - "score": 2793.830303030303 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - 
"description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.783, mean=372.783, max=372.783, sum=745.566 (2)", - "tab": "General information", - "score": 372.7828282828283 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=463.01, mean=463.01, max=463.01, sum=926.021 (2)", - "tab": "General information", - "score": 463.0103626943005 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=371.451, mean=371.451, max=371.451, sum=742.903 (2)", - "tab": "General information", - "score": 371.4512820512821 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.456, mean=532.456, max=532.456, sum=1064.911 (2)", - "tab": "General information", - "score": 532.4555555555555 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=398.739, mean=398.739, max=398.739, sum=797.479 (2)", - "tab": "General information", - "score": 398.73949579831935 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.238, mean=560.238, max=560.238, sum=1120.477 (2)", - "tab": "General information", - "score": 560.2384105960265 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=492.917, mean=492.917, max=492.917, sum=985.835 (2)", - "tab": "General information", - "score": 492.91743119266056 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=787.574, mean=787.574, max=787.574, sum=1575.148 (2)", - "tab": "General information", - "score": 787.574074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2220.005, mean=2220.005, max=2220.005, sum=4440.01 (2)", - "tab": "General information", - "score": 2220.0049019607845 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1424.439, mean=1424.439, max=1424.439, sum=2848.878 (2)", - "tab": "General information", - "score": 1424.4388185654009 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=1.725 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.30522876897734913 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.606 (2)", - "tab": "Efficiency", - "score": 0.30280636285097545 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=316.453, mean=316.453, max=316.453, sum=632.906 (2)", - "tab": "General information", - "score": 316.4529147982063 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=335.695, mean=335.695, max=335.695, sum=671.389 (2)", - "tab": "General information", - "score": 335.69465648854964 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.851 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.685 (2)", - "tab": "Efficiency", - "score": 0.3425306268959991 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.504, mean=639.504, max=639.504, sum=1279.008 (2)", - "tab": "General information", - "score": 639.5041322314049 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.29739713961361375 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=445.84, mean=445.84, max=445.84, sum=891.681 (2)", - "tab": "General information", - "score": 445.840490797546 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - "details": { - "description": "min=0.616, mean=0.616, max=0.616, sum=1.232 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.2970866986683437 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=666.205, mean=666.205, max=666.205, sum=1332.411 (2)", - "tab": "General information", - "score": 666.2053571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.689 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.611 (2)", - "tab": "Efficiency", - "score": 0.3053626088262762 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=279.485, mean=279.485, max=279.485, sum=558.971 (2)", - "tab": "General information", - "score": 279.4854368932039 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.927, mean=0.927, max=0.927, sum=1.855 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": "Efficiency", - "score": 0.3060942073153634 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=399.85, mean=399.85, max=399.85, sum=799.701 (2)", - "tab": "General information", - "score": 399.85042735042737 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.31078683137893676 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=343.23, mean=343.23, max=343.23, sum=686.46 (2)", - "tab": "General information", - "score": 343.23 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.604 (2)", - "tab": "Efficiency", - "score": 0.3020631249989282 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=296.479, mean=296.479, max=296.479, sum=592.958 (2)", - "tab": "General information", - "score": 296.47892720306515 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.485, - "details": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.97 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.631 (2)", - "tab": "Efficiency", - "score": 0.31556026577260454 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.637 (2)", - "tab": "Efficiency", - "score": 0.3183864769322912 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=474.835, mean=474.835, max=474.835, sum=949.671 (2)", - "tab": "General information", - "score": 474.83526011560696 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=655.068, mean=655.068, 
max=655.068, sum=1310.136 (2)", - "tab": "General information", - "score": 655.068156424581 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827, - "details": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.654 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.3104910164876701 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=581.997, mean=581.997, max=581.997, sum=1163.993 (2)", - "tab": "General information", - "score": 581.9967320261438 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.3106661284411395 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=513.944, mean=513.944, max=513.944, sum=1027.889 (2)", - "tab": "General information", - "score": 513.9444444444445 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - 
}, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.606 (2)", - "tab": "Efficiency", - "score": 0.30300807519392536 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=402.918, mean=402.918, max=402.918, sum=805.836 (2)", - "tab": "General information", - "score": 402.91818181818184 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.576 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", - "tab": "Efficiency", - "score": 0.733092721627683 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1166.686, mean=1166.686, max=1166.686, sum=2333.371 (2)", - "tab": "General information", - "score": 1166.6857142857143 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.361, mean=0.361, max=0.361, sum=0.722 (2)", - "tab": "Efficiency", - "score": 0.3608738794848694 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=444.269, mean=444.269, max=444.269, sum=888.537 (2)", - "tab": "General information", - "score": 444.2686567164179 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536, - "details": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.978 (2)", - "tab": "Efficiency", - "score": 0.48897463298705685 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=334.434, mean=334.434, max=334.434, sum=668.867 (2)", - "tab": "General information", - "score": 334.43373493975906 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.719 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.623, mean=0.623, max=0.623, sum=1.247 (2)", - "tab": "Efficiency", - "score": 0.6232896199700428 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=267.936, mean=267.936, max=267.936, sum=535.871 (2)", - "tab": "General information", - "score": 267.9356725146199 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.774, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json b/data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json deleted file mode 100644 index 4b924f5af..000000000 --- a/data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-110b-chat/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 Chat 110B", - "id": "qwen/qwen1.5-110b-chat", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - 
"description": "min=0.478, mean=0.768, max=0.984, sum=87.534 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.229, mean=0.287, max=0.751, sum=32.77 (114)", - "tab": "Efficiency", - "score": 0.2874531237731517 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=276.07, mean=625.598, max=2814.903, sum=71318.198 (114)", - "tab": "General information", - "score": 625.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - 
"mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.23, mean=0.23, max=0.23, sum=0.459 (2)", - "tab": "Efficiency", - "score": 0.22966567754745484 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=378.19, mean=378.19, max=378.19, sum=756.38 (2)", - "tab": "General information", - "score": 378.19 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.393 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Efficiency", - "score": 0.2600334096837927 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.978, mean=353.978, max=353.978, sum=707.956 (2)", - 
"tab": "General information", - "score": 353.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.513 (2)", - "tab": "Efficiency", - "score": 0.2566096520423889 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.592 (2)", - "tab": "Efficiency", - "score": 0.2957576380835639 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.652 (2)", - "tab": "Efficiency", - "score": 0.3260823440551758 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.2992465353012085 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.538 (2)", - "tab": "Efficiency", - "score": 0.2690960313543419 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.562 (2)", - "tab": "Efficiency", - "score": 0.28119626699709427 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=568.25, mean=568.25, max=568.25, sum=1136.5 (2)", - "tab": "General information", - "score": 568.25 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=486.979, mean=486.979, max=486.979, sum=973.958 (2)", - "tab": "General information", - "score": 486.9791666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College 
Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.58, mean=838.58, max=838.58, sum=1677.16 (2)", - "tab": "General information", - "score": 838.58 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=607.7, mean=607.7, max=607.7, sum=1215.4 (2)", - "tab": "General information", - "score": 607.7 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=506.098, mean=506.098, max=506.098, sum=1012.197 (2)", - "tab": "General information", - "score": 506.0982658959538 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=516.265, mean=516.265, max=516.265, sum=1032.529 (2)", - "tab": "General information", - "score": 516.2647058823529 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer 
Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.555 (2)", - "tab": "Efficiency", - "score": 0.2773160576820374 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=386.64, mean=386.64, max=386.64, sum=773.28 (2)", - "tab": "General information", - "score": 386.64 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.281 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.496 (2)", - "tab": "Efficiency", - "score": 0.24817464017031485 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=627.939, mean=627.939, max=627.939, sum=1255.877 (2)", - "tab": "General information", - "score": 627.938596491228 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": 
"min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.514 (2)", - "tab": "Efficiency", - "score": 0.25695453643798827 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=429.06, mean=429.06, max=429.06, sum=858.12 (2)", - "tab": "General information", - "score": 429.06 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.512 (2)", - "tab": "Efficiency", - "score": 0.25610714267801354 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.713, mean=394.713, max=394.713, sum=789.426 (2)", - "tab": "General information", - "score": 394.712962962963 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823, - "details": { - "description": "min=0.823, mean=0.823, max=0.823, sum=1.646 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.465 (2)", - "tab": "Efficiency", - 
"score": 0.2326939565959084 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.09, mean=329.09, max=329.09, sum=658.18 (2)", - "tab": "General information", - "score": 329.09003215434086 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.641 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.792 (2)", - "tab": "Efficiency", - "score": 0.39590225675526786 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.243, mean=0.243, max=0.243, sum=0.486 (2)", - "tab": "Efficiency", - "score": 0.24316950554543354 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Efficiency", - "score": 0.31920133731200456 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.245, mean=0.245, max=0.245, sum=0.491 (2)", - "tab": "Efficiency", - "score": 0.2452772462290097 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1125.199, mean=1125.199, max=1125.199, sum=2250.397 (2)", - "tab": "General information", - "score": 1125.1985294117646 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": 
{ - "description": "min=739.34, mean=739.34, max=739.34, sum=1478.681 (2)", - "tab": "General information", - "score": 739.3404255319149 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1663.969, mean=1663.969, max=1663.969, sum=3327.939 (2)", - "tab": "General information", - "score": 1663.9693611473272 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=581.417, mean=581.417, max=581.417, sum=1162.833 (2)", - "tab": "General information", - "score": 581.4166666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.459 (2)", - "tab": "Efficiency", - "score": 0.22928016662597656 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=428.16, mean=428.16, max=428.16, sum=856.32 (2)", - "tab": "General information", - "score": 428.16 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.803 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": "Efficiency", - "score": 0.3059707331029992 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=589.849, mean=589.849, max=589.849, sum=1179.697 (2)", - "tab": "General information", - "score": 589.8486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.31108115911483764 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.87, mean=569.87, max=569.87, sum=1139.74 (2)", - "tab": "General information", - "score": 569.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - 
"evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.766, - "details": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.532 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.268, mean=0.268, max=0.268, sum=0.536 (2)", - "tab": "Efficiency", - "score": 0.26778328283777775 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=400.623, mean=400.623, max=400.623, sum=801.245 (2)", - "tab": "General information", - "score": 400.62264150943395 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=1.677 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.533 (2)", - "tab": "Efficiency", - "score": 0.26653050361795627 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=305.494, mean=305.494, max=305.494, sum=610.987 (2)", - "tab": "General information", - "score": 305.4936170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.503 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.24, mean=0.24, max=0.24, sum=0.481 (2)", - "tab": "Efficiency", - "score": 0.24032716751098632 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=463.8, mean=463.8, max=463.8, sum=927.6 (2)", - "tab": "General information", - "score": 463.8 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669, - "details": { - "description": "min=0.669, mean=0.669, max=0.669, sum=1.339 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.571 (2)", - "tab": "Efficiency", - "score": 0.28569977939444247 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=577.119, mean=577.119, max=577.119, sum=1154.238 (2)", - "tab": "General information", - "score": 577.1190476190476 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", 
- "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.567 (2)", - "tab": "Efficiency", - "score": 0.2836597722674173 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=604.667, mean=604.667, max=604.667, sum=1209.333 (2)", - "tab": "General information", - "score": 604.6666666666666 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.283, mean=0.283, max=0.283, sum=0.566 (2)", - "tab": "Efficiency", - "score": 0.2828109118246263 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.586 (2)", - "tab": "Efficiency", - "score": 0.29298263935032737 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.615 (2)", - "tab": "Efficiency", - "score": 0.30738641500473024 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.593, mean=0.593, max=0.593, sum=1.186 (2)", - "tab": "Efficiency", - "score": 0.5927927941987009 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.553 (2)", - "tab": "Efficiency", - "score": 0.2765737639533149 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.505 (2)", - "tab": "Efficiency", - "score": 0.2526841929539498 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.264, mean=0.264, max=0.264, sum=0.527 (2)", - "tab": "Efficiency", - "score": 0.2636140242601052 - }, - "High School Mathematics - Observed inference time (s)": { - 
"description": "min=0.289, mean=0.289, max=0.289, sum=0.578 (2)", - "tab": "Efficiency", - "score": 0.28875163837715434 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.508 (2)", - "tab": "Efficiency", - "score": 0.2539960216073429 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.562 (2)", - "tab": "Efficiency", - "score": 0.28084811943256305 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.244, mean=0.244, max=0.244, sum=0.489 (2)", - "tab": "Efficiency", - "score": 0.24437280532416947 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.3396394296928688 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.832 (2)", - "tab": "Efficiency", - "score": 0.4159782189948886 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.501 (2)", - "tab": "Efficiency", - "score": 0.7505324741959069 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.916, mean=513.916, max=513.916, sum=1027.832 (2)", - "tab": "General information", - "score": 513.916129032258 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=517.261, mean=517.261, max=517.261, sum=1034.522 (2)", - "tab": "General information", - "score": 517.2610837438424 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=878.46, mean=878.46, max=878.46, sum=1756.92 (2)", - "tab": "General information", - "score": 878.46 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2814.903, mean=2814.903, max=2814.903, sum=5629.806 (2)", - "tab": "General information", - "score": 2814.9030303030304 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.217, mean=372.217, max=372.217, sum=744.434 (2)", - "tab": "General information", - "score": 372.2171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=467.311, mean=467.311, max=467.311, sum=934.622 (2)", - "tab": "General information", - "score": 467.31088082901556 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=374.349, mean=374.349, max=374.349, sum=748.697 (2)", - "tab": "General information", - "score": 374.34871794871793 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=565.326, mean=565.326, max=565.326, sum=1130.652 (2)", - "tab": "General information", - "score": 565.325925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=402.277, mean=402.277, max=402.277, sum=804.555 (2)", - "tab": "General information", - "score": 402.2773109243698 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=580.536, mean=580.536, max=580.536, sum=1161.073 (2)", - "tab": "General information", - "score": 580.5364238410596 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.521, mean=495.521, max=495.521, sum=991.042 (2)", - "tab": "General information", - "score": 495.52110091743117 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=830.477, mean=830.477, max=830.477, sum=1660.954 (2)", - "tab": "General information", - "score": 830.4768518518518 - }, - 
"High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2237.176, mean=2237.176, max=2237.176, sum=4474.353 (2)", - "tab": "General information", - "score": 2237.176470588235 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1448.354, mean=1448.354, max=1448.354, sum=2896.709 (2)", - "tab": "General information", - "score": 1448.3544303797469 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.71 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.245, mean=0.245, max=0.245, sum=0.49 (2)", - "tab": "Efficiency", - "score": 0.24486422538757324 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.508 (2)", - "tab": "Efficiency", - "score": 0.25416288121056013 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=322.121, mean=322.121, max=322.121, sum=644.242 (2)", - "tab": "General information", - "score": 322.1210762331838 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.504, mean=341.504, max=341.504, sum=683.008 (2)", - "tab": "General information", - "score": 341.5038167938931 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.752 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.555 (2)", - "tab": "Efficiency", - "score": 0.2773902613269396 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=640.579, mean=640.579, max=640.579, sum=1281.157 (2)", - "tab": "General information", - "score": 640.5785123966942 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.656 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.496 (2)", - "tab": "Efficiency", - "score": 0.24794307661934134 - }, - "Logical Fallacies - # eval": { - 
"description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.632, mean=449.632, max=449.632, sum=899.264 (2)", - "tab": "General information", - "score": 449.6319018404908 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.634, mean=0.634, max=0.634, sum=1.268 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.567 (2)", - "tab": "Efficiency", - "score": 0.2835228868893215 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=681.848, mean=681.848, max=681.848, sum=1363.696 (2)", - "tab": "General information", - "score": 681.8482142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.56 (2)", - "tab": "Efficiency", - "score": 0.28018068804324253 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.854, mean=283.854, max=283.854, sum=567.709 (2)", - "tab": "General information", - "score": 283.8543689320388 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.919, - "details": { - "description": "min=0.919, mean=0.919, max=0.919, sum=1.838 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.509 (2)", - "tab": "Efficiency", - "score": 0.2544598365441347 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.415, mean=404.415, max=404.415, sum=808.829 (2)", - "tab": "General information", - "score": 404.4145299145299 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.541 (2)", - "tab": "Efficiency", - "score": 0.27034429311752317 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 
0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=342.35, mean=342.35, max=342.35, sum=684.7 (2)", - "tab": "General information", - "score": 342.35 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.867 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.246, mean=0.246, max=0.246, sum=0.492 (2)", - "tab": "Efficiency", - "score": 0.24603491085242493 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=303.7, mean=303.7, max=303.7, sum=607.4 (2)", - "tab": "General information", - "score": 303.6998722860792 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=1.566 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.513 (2)", - "tab": "Efficiency", - "score": 0.2563680651559995 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.514 (2)", - "tab": "Efficiency", - "score": 0.25722797329865354 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, 
- "Moral Disputes - # prompt tokens": { - "description": "min=476.182, mean=476.182, max=476.182, sum=952.364 (2)", - "tab": "General information", - "score": 476.1820809248555 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=668.494, mean=668.494, max=668.494, sum=1336.988 (2)", - "tab": "General information", - "score": 668.4938547486033 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.542 (2)", - "tab": "Efficiency", - "score": 0.27095749721028445 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=599.637, mean=599.637, max=599.637, sum=1199.275 (2)", - "tab": "General information", - "score": 599.6372549019608 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.867, - "details": { - "description": "min=0.867, mean=0.867, max=0.867, sum=1.735 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": 
"min=0.242, mean=0.242, max=0.242, sum=0.483 (2)", - "tab": "Efficiency", - "score": 0.2415844319779196 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=528.364, mean=528.364, max=528.364, sum=1056.728 (2)", - "tab": "General information", - "score": 528.3641975308642 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=1.545 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Efficiency", - "score": 0.2501691276376898 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=408.427, mean=408.427, max=408.427, sum=816.855 (2)", - "tab": "General information", - "score": 408.42727272727274 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=1.469 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.283, mean=0.283, max=0.283, sum=0.565 (2)", - "tab": "Efficiency", - "score": 0.28266452769843897 - }, - "Security Studies - # eval": { - "description": "min=245, 
mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1166.931, mean=1166.931, max=1166.931, sum=2333.861 (2)", - "tab": "General information", - "score": 1166.930612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.866, - "details": { - "description": "min=0.866, mean=0.866, max=0.866, sum=1.731 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.258, mean=0.258, max=0.258, sum=0.516 (2)", - "tab": "Efficiency", - "score": 0.258230237818476 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=450.1, mean=450.1, max=450.1, sum=900.199 (2)", - "tab": "General information", - "score": 450.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542, - "details": { - "description": "min=0.542, mean=0.542, max=0.542, sum=1.084 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.495 (2)", - "tab": "Efficiency", - "score": 0.24754508719386825 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology 
- truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.819, mean=343.819, max=343.819, sum=687.639 (2)", - "tab": "General information", - "score": 343.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.471 (2)", - "tab": "Efficiency", - "score": 0.23539779897321733 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=276.07, mean=276.07, max=276.07, sum=552.14 (2)", - "tab": "General information", - "score": 276.0701754385965 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json b/data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json deleted file mode 100644 index 9bfc87f91..000000000 --- a/data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-14b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - 
"source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 14B", - "id": "qwen/qwen1.5-14b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.368, mean=0.686, max=0.893, sum=78.254 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.282, mean=0.321, max=0.549, sum=36.618 (114)", - "tab": "Efficiency", - "score": 0.3212107113231387 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=269.07, mean=618.598, max=2807.903, sum=70520.198 (114)", - "tab": "General information", - "score": 618.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - 
"mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.569 (2)", - "tab": "Efficiency", - "score": 0.28459527969360354 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=371.19, mean=371.19, max=371.19, sum=742.38 (2)", - "tab": "General information", - "score": 371.19 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "description": "min=0.637, 
mean=0.637, max=0.637, sum=1.274 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.663 (2)", - "tab": "Efficiency", - "score": 0.33150761745594165 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)", - "tab": "General information", - "score": 346.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.961 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Efficiency", - "score": 0.33498176813125613 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.589 (2)", - "tab": "Efficiency", - "score": 0.2946729362010956 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.673 (2)", - "tab": "Efficiency", - "score": 0.3364031720161438 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.648 (2)", - "tab": "Efficiency", - "score": 0.3238637447357178 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.611 (2)", - "tab": "Efficiency", - "score": 0.3055199033263102 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.31105106250912534 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=561.25, mean=561.25, max=561.25, sum=1122.5 (2)", - "tab": "General information", - "score": 561.25 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology 
- # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=479.979, mean=479.979, max=479.979, sum=959.958 (2)", - "tab": "General information", - "score": 479.9791666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=831.58, mean=831.58, max=831.58, sum=1663.16 (2)", - "tab": "General information", - "score": 831.58 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=600.7, mean=600.7, max=600.7, sum=1201.4 (2)", - "tab": "General information", - "score": 600.7 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=499.098, mean=499.098, max=499.098, sum=998.197 (2)", - "tab": "General information", - "score": 499.0982658959538 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=509.265, mean=509.265, max=509.265, 
sum=1018.529 (2)", - "tab": "General information", - "score": 509.2647058823529 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.2989851474761963 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=379.64, mean=379.64, max=379.64, sum=759.28 (2)", - "tab": "General information", - "score": 379.64 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.123 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.624 (2)", - "tab": "Efficiency", - "score": 0.3118862185561866 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=620.939, mean=620.939, max=620.939, sum=1241.877 (2)", - "tab": "General information", - "score": 620.938596491228 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.611 (2)", - "tab": "Efficiency", - "score": 0.30553135871887205 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=422.06, mean=422.06, max=422.06, sum=844.12 (2)", - "tab": "General information", - "score": 422.06 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.537 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.3092155566921941 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.713, mean=387.713, max=387.713, sum=775.426 (2)", - "tab": "General information", - "score": 387.712962962963 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.434 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.3108927659283114 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.09, mean=322.09, max=322.09, sum=644.18 (2)", - "tab": "General information", - "score": 322.09003215434086 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.699, - "details": { - "description": "min=0.699, mean=0.699, max=0.699, sum=1.399 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.475, mean=0.475, max=0.475, sum=0.951 (2)", - "tab": "Efficiency", - "score": 0.47532147870344277 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Efficiency", - "score": 0.31895153404127624 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Efficiency", - "score": 0.4000247932941382 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.3012406826019287 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1118.199, mean=1118.199, 
max=1118.199, sum=2236.397 (2)", - "tab": "General information", - "score": 1118.1985294117646 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=732.34, mean=732.34, max=732.34, sum=1464.681 (2)", - "tab": "General information", - "score": 732.3404255319149 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1656.969, mean=1656.969, max=1656.969, sum=3313.939 (2)", - "tab": "General information", - "score": 1656.9693611473272 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=574.417, mean=574.417, max=574.417, sum=1148.833 (2)", - "tab": "General information", - "score": 574.4166666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Efficiency", - "score": 
0.31888857364654544 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=421.16, mean=421.16, max=421.16, sum=842.32 (2)", - "tab": "General information", - "score": 421.16 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=1.447 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.589 (2)", - "tab": "Efficiency", - "score": 0.29459338125429657 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=582.849, mean=582.849, max=582.849, sum=1165.697 (2)", - "tab": "General information", - "score": 582.8486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.32330512285232543 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.87, mean=562.87, max=562.87, sum=1125.74 (2)", - "tab": "General information", - "score": 562.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.472 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.2987864755234628 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=393.623, mean=393.623, max=393.623, sum=787.245 (2)", - "tab": "General information", - "score": 393.62264150943395 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.387 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.2873024098416592 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=298.494, mean=298.494, max=298.494, sum=596.987 (2)", - "tab": "General information", - "score": 298.4936170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.683, mean=0.683, max=0.683, sum=1.366 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.573 (2)", - "tab": "Efficiency", - "score": 0.2863943790567332 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=456.8, mean=456.8, max=456.8, sum=913.6 (2)", - "tab": "General information", - "score": 456.8 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603, - "details": { - "description": "min=0.603, mean=0.603, max=0.603, sum=1.206 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.635 (2)", - "tab": "Efficiency", - "score": 0.3172515391041993 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=570.119, mean=570.119, max=570.119, sum=1140.238 (2)", - "tab": "General information", - "score": 570.1190476190476 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.492, - "details": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.984 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.634 (2)", - "tab": "Efficiency", - "score": 0.31694961918724907 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=597.667, mean=597.667, max=597.667, sum=1195.333 (2)", - "tab": "General information", - "score": 597.6666666666666 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.3025627659213158 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.3108991178972968 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.30484641551971436 - }, - "High School European History - Observed inference time (s)": { - 
"description": "min=0.549, mean=0.549, max=0.549, sum=1.098 (2)", - "tab": "Efficiency", - "score": 0.548761223301743 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.624 (2)", - "tab": "Efficiency", - "score": 0.3120840137655085 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.29960165616761836 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.29392006519513253 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.625 (2)", - "tab": "Efficiency", - "score": 0.3124903016620212 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.565 (2)", - "tab": "Efficiency", - "score": 0.28235371273104887 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.615 (2)", - "tab": "Efficiency", - "score": 0.30758162681630113 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.634 (2)", - "tab": "Efficiency", - "score": 0.3172066456680998 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Efficiency", - "score": 0.33508766580511024 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.906 (2)", - "tab": "Efficiency", - "score": 0.4531192370489532 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.771 (2)", - "tab": "Efficiency", - "score": 0.3856232206529706 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.916, mean=506.916, max=506.916, sum=1013.832 (2)", - "tab": "General information", - "score": 506.9161290322581 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=510.261, mean=510.261, max=510.261, sum=1020.522 (2)", - "tab": "General information", - "score": 510.2610837438424 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - 
}, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=871.46, mean=871.46, max=871.46, sum=1742.92 (2)", - "tab": "General information", - "score": 871.46 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2807.903, mean=2807.903, max=2807.903, sum=5615.806 (2)", - "tab": "General information", - "score": 2807.9030303030304 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.217, mean=365.217, max=365.217, sum=730.434 (2)", - "tab": "General information", - "score": 365.2171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=460.311, mean=460.311, max=460.311, sum=920.622 (2)", - "tab": "General information", - "score": 460.31088082901556 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=367.349, mean=367.349, max=367.349, sum=734.697 (2)", - "tab": "General information", - "score": 367.34871794871793 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=558.326, mean=558.326, max=558.326, sum=1116.652 (2)", - "tab": "General information", - "score": 558.325925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=395.277, mean=395.277, max=395.277, sum=790.555 (2)", - "tab": "General information", - "score": 395.2773109243698 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=573.536, mean=573.536, max=573.536, sum=1147.073 (2)", - "tab": "General information", - "score": 573.5364238410596 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.521, mean=488.521, max=488.521, sum=977.042 (2)", - "tab": "General information", - "score": 488.52110091743117 - }, - "High School Psychology - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=823.477, mean=823.477, max=823.477, sum=1646.954 (2)", - "tab": "General information", - "score": 823.4768518518518 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2230.176, mean=2230.176, max=2230.176, sum=4460.353 (2)", - "tab": "General information", - "score": 2230.176470588235 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1441.354, mean=1441.354, max=1441.354, sum=2882.709 (2)", - "tab": "General information", - "score": 1441.3544303797469 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=1.511 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Efficiency", - "score": 0.29016303160799994 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.322, 
mean=0.322, max=0.322, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.3224487978083487 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=315.121, mean=315.121, max=315.121, sum=630.242 (2)", - "tab": "General information", - "score": 315.1210762331838 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.504, mean=334.504, max=334.504, sum=669.008 (2)", - "tab": "General information", - "score": 334.5038167938931 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.615 (2)", - "tab": "Efficiency", - "score": 0.307678321176324 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=633.579, mean=633.579, max=633.579, sum=1267.157 (2)", - "tab": "General information", - "score": 633.5785123966942 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.472 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.3051488355624895 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.632, mean=442.632, max=442.632, sum=885.264 (2)", - "tab": "General information", - "score": 442.6319018404908 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.018 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.3079095014504024 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=674.848, mean=674.848, max=674.848, sum=1349.696 (2)", - "tab": "General information", - "score": 674.8482142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.631 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.633 (2)", - "tab": "Efficiency", - "score": 0.316567536696647 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.854, mean=276.854, max=276.854, sum=553.709 (2)", - "tab": "General information", - "score": 276.8543689320388 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.3104041937070015 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.415, mean=397.415, max=397.415, sum=794.829 (2)", - "tab": "General information", - "score": 397.4145299145299 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.603 (2)", - "tab": "Efficiency", - "score": 0.30150007486343383 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=335.35, mean=335.35, max=335.35, sum=670.7 (2)", - "tab": "General information", - "score": 335.35 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.29396778352720376 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=296.7, mean=296.7, max=296.7, sum=593.4 (2)", - "tab": "General information", - "score": 296.6998722860792 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.368, - "details": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.735 (2)", - 
"tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.30380174465951204 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.601 (2)", - "tab": "Efficiency", - "score": 0.3006620183337334 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.182, mean=469.182, max=469.182, sum=938.364 (2)", - "tab": "General information", - "score": 469.1820809248555 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=661.494, mean=661.494, max=661.494, sum=1322.988 (2)", - "tab": "General information", - "score": 661.4938547486033 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=1.484 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.639 (2)", - "tab": "Efficiency", - "score": 0.31930122655980725 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=592.637, mean=592.637, max=592.637, sum=1185.275 (2)", - "tab": "General information", - "score": 592.6372549019608 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.625 (2)", - "tab": "Efficiency", - "score": 0.3125371013158633 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=521.364, mean=521.364, max=521.364, sum=1042.728 (2)", - "tab": "General information", - "score": 521.3641975308642 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655, - "details": { - "description": "min=0.655, mean=0.655, max=0.655, sum=1.309 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.592 (2)", - "tab": "Efficiency", - "score": 0.29603702588514846 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=401.427, mean=401.427, max=401.427, sum=802.855 (2)", - "tab": "General information", - "score": 401.42727272727274 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - 
"evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.704 (2)", - "tab": "Efficiency", - "score": 0.3521312304905483 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1159.931, mean=1159.931, max=1159.931, sum=2319.861 (2)", - "tab": "General information", - "score": 1159.930612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - "details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.682 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.609 (2)", - "tab": "Efficiency", - "score": 0.3044381426341498 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=443.1, mean=443.1, max=443.1, sum=886.199 (2)", - "tab": "General information", - "score": 443.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.458, - "details": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.916 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.297343333083463 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.819, mean=336.819, max=336.819, sum=673.639 (2)", - "tab": "General information", - "score": 336.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.3027164573557893 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=269.07, mean=269.07, max=269.07, sum=538.14 (2)", - "tab": "General information", - "score": 269.0701754385965 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on 
average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json b/data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json deleted file mode 100644 index d1a9f19e1..000000000 --- a/data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-32b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 32B", - "id": "qwen/qwen1.5-32b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.744, - "details": { - "description": "min=0.4, mean=0.744, max=0.974, sum=84.853 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.294, mean=0.413, max=0.973, sum=47.06 (114)", - "tab": "Efficiency", - "score": 0.41280544410672226 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=269.07, mean=618.598, max=2807.903, sum=70520.198 (114)", - "tab": "General information", - "score": 618.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - 
"high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.675 (2)", - "tab": "Efficiency", - "score": 0.33740817070007323 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=371.19, mean=371.19, max=371.19, sum=742.38 (2)", - "tab": "General 
information",
-          "score": 371.19
-        },
-        "Abstract Algebra - # output tokens": {
-          "description": "min=1, mean=1, max=1, sum=2 (2)",
-          "tab": "General information",
-          "score": 1.0
-        }
-      }
-    },
-    "generation_config": {
-      "additional_details": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
-      }
-    }
-  },
-  {
-    "evaluation_name": "Anatomy",
-    "source_data": {
-      "dataset_name": "helm_mmlu",
-      "source_type": "url",
-      "url": [
-        "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-      ]
-    },
-    "metric_config": {
-      "evaluation_description": "EM on Anatomy",
-      "lower_is_better": false,
-      "score_type": "continuous",
-      "min_score": 0.0,
-      "max_score": 1.0
-    },
-    "score_details": {
-      "score": 0.644,
-      "details": {
-        "description": "min=0.644, mean=0.644, max=0.644, sum=1.289 (2)",
-        "tab": "Accuracy",
-        "Anatomy - Observed inference time (s)": {
-          "description": "min=0.353, mean=0.353, max=0.353, sum=0.706 (2)",
-          "tab": "Efficiency",
-          "score": 0.35299032705801503
-        },
-        "Anatomy - # eval": {
-          "description": "min=135, mean=135, max=135, sum=270 (2)",
-          "tab": "General information",
-          "score": 135.0
-        },
-        "Anatomy - # train": {
-          "description": "min=5, mean=5, max=5, sum=10 (2)",
-          "tab": "General information",
-          "score": 5.0
-        },
-        "Anatomy - truncated": {
-          "description": "min=0, mean=0, max=0, sum=0 (2)",
-          "tab": "General information",
-          "score": 0.0
-        },
-        "Anatomy - # prompt tokens": {
-          "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)",
-          "tab": "General information",
-          "score": 346.97777777777776
-        },
-        "Anatomy - # output tokens": {
-          "description": "min=1, mean=1, max=1, sum=2 (2)",
-          "tab": "General information",
-          "score": 1.0
-        }
-      }
-    },
-    "generation_config": {
-      "additional_details": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
-      }
-    }
-  },

[The remaining deleted records in this span repeat the Anatomy schema above verbatim and differ only in the values tabulated below. Every record points at the same source_data URL (helm_mmlu, release v1.13.0, mmlu_subjects.json), uses metric_config "EM on <Subject>" (continuous, range 0.0-1.0, lower_is_better=false), and carries generation_config additional_details of the form {subject: <snake_case subject>, method: multiple_choice_joint, eval_split: test, groups: mmlu_<subject>}. For every subject, # train = 5, truncated = 0, and # output tokens = 1. The description strings are redundant aggregates over the same two runs, so min = mean = max = score and sum = 2 × score, e.g. Anatomy: "min=0.644, mean=0.644, max=0.644, sum=1.289 (2)". Detail blocks for related subjects are grouped under one host record: the College subjects under "College Physics", the Professional subjects under "Professional Psychology", the High School subjects under "High School World History", Human Aging under "Human Sexuality", and Moral Disputes under "Moral Scenarios"; an EM of "-" below means that subject's own EM record sits outside this span.]

Subject                               EM      Inference time (s)  # eval  # prompt tokens
College Physics                       0.51    0.616               102     509.265
College Chemistry                     -       0.338               100     561.25
College Biology                       -       0.570               144     479.979
College Computer Science              -       0.407               100     831.58
College Mathematics                   -       0.683               100     600.7
College Medicine                      -       0.340               173     499.098
Computer Security                     0.77    0.339               100     379.64
Econometrics                          0.561   0.367               114     620.939
Global Facts                          0.47    0.650               100     422.06
Jurisprudence                         0.843   0.301               108     387.713
Philosophy                            0.826   0.297               311     322.09
Professional Psychology               0.75    0.388               612     574.417
Professional Medicine                 -       0.465               272     1118.199
Professional Accounting               -       0.381               282     732.34
Professional Law                      -       0.649               1534    1656.969
Us Foreign Policy                     0.91    0.343               100     421.16
Astronomy                             0.855   0.337               152     582.849
Business Ethics                       0.77    0.356               100     562.87
Clinical Knowledge                    0.781   0.319               265     393.623
Conceptual Physics                    0.766   0.436               235     298.494
Electrical Engineering                0.731   0.321               145     456.8
Elementary Mathematics                0.685   0.352               378     570.119
Formal Logic                          0.524   0.370               126     597.667
High School World History             0.869   0.551               237     1441.354
High School Biology                   -       0.320               310     506.916
High School Chemistry                 -       0.369               203     510.261
High School Computer Science          -       0.724               100     871.46
High School European History          -       0.973               165     2807.903
High School Geography                 -       0.307               198     365.217
High School Government And Politics   -       0.338               193     460.311
High School Macroeconomics            -       0.410               390     367.349
High School Mathematics               -       0.363               270     558.326
High School Microeconomics            -       0.314               238     395.277
High School Physics                   -       0.368               151     573.536
High School Psychology                -       0.452               545     488.521
High School Statistics                -       0.378               216     823.477
High School US History                -       0.888               204     2230.176
Human Sexuality                       0.847   0.323               131     334.504
Human Aging                           -       0.314               223     315.121
International Law                     0.884   0.382               121     633.579
Logical Fallacies                     0.822   0.313               163     442.632
Machine Learning                      0.616   0.359               112     674.848
Management                            0.874   0.633               103     276.854
Marketing                             0.936   0.328               234     397.415
Medical Genetics                      0.85    0.294               100     335.35
Miscellaneous                         0.884   0.347               783     296.7
Moral Scenarios                       0.545   0.396               895     661.494
Moral Disputes                        -       0.334               346     469.182
Nutrition                             0.81    0.338               306     592.637
Prehistory                            0.83    0.327               324     521.364
Public Relations                      0.664   0.305               110     401.427
Security Studies                      0.829   0.478               245     1159.931

(Inference time and # prompt tokens are rounded as in the records' own description strings; full-precision values appear verbatim in the reconstructed Anatomy record above and the Sociology record below, which continues past this span.)

-  {
-    "evaluation_name": "Sociology",
-    "source_data": {
-      "dataset_name": "helm_mmlu",
-      "source_type": "url",
-      "url": [
-        "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-      ]
-    },
-    "metric_config": {
-      "evaluation_description": "EM on Sociology",
-      "lower_is_better": false,
-      "score_type": "continuous",
-      "min_score": 0.0,
-      "max_score": 1.0
-    },
-    "score_details": {
-      "score": 0.881,
-      "details": {
-        "description": "min=0.881, mean=0.881, max=0.881, sum=1.761 (2)",
-        "tab": "Accuracy",
-        "Sociology - Observed inference time (s)": {
-          "description": "min=0.341, mean=0.341, max=0.341, sum=0.681 (2)",
-          "tab": "Efficiency",
-          "score": 0.3407213664173487
-        },
-        "Sociology - # eval": {
-          "description": "min=201, mean=201,
max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=443.1, mean=443.1, max=443.1, sum=886.199 (2)", - "tab": "General information", - "score": 443.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.3289937297981906 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.819, mean=336.819, max=336.819, sum=673.639 (2)", - "tab": "General information", - "score": 336.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.708 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.64 (2)", - "tab": "Efficiency", - "score": 0.31992746933161864 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=269.07, mean=269.07, max=269.07, sum=538.14 (2)", - "tab": "General information", - "score": 269.0701754385965 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json b/data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json deleted file mode 100644 index 94c5e4e80..000000000 --- a/data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-72b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 72B", - "id": "qwen/qwen1.5-72b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.774, - "details": { - "description": "min=0.44, mean=0.774, max=0.99, sum=88.227 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.323, mean=0.375, max=0.713, sum=42.762 (114)", - "tab": "Efficiency", - "score": 0.37510459085651054 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=269.07, mean=618.598, max=2807.903, sum=70520.198 (114)", - "tab": "General information", - "score": 618.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44, - "details": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.696 (2)", - "tab": "Efficiency", - "score": 0.3480935263633728 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=371.19, mean=371.19, max=371.19, sum=742.38 (2)", - "tab": "General information", - "score": 371.19 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.467 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.685 (2)", - "tab": "Efficiency", - "score": 0.3424220985836453 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)", - "tab": "General information", - "score": 346.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, mean=0.559, max=0.559, sum=1.118 (2)", - "tab": "Accuracy", - "College Chemistry - 
Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.791 (2)", - "tab": "Efficiency", - "score": 0.39563153505325316 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.698 (2)", - "tab": "Efficiency", - "score": 0.3488144195742077 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.797 (2)", - "tab": "Efficiency", - "score": 0.39839950799942014 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.743 (2)", - "tab": "Efficiency", - "score": 0.3715039682388306 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.34641625977665014 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.38388992290870816 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=561.25, mean=561.25, max=561.25, sum=1122.5 (2)", - "tab": "General information", - "score": 561.25 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=479.979, mean=479.979, max=479.979, sum=959.958 (2)", - "tab": "General information", - "score": 479.9791666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=831.58, mean=831.58, max=831.58, sum=1663.16 (2)", - "tab": "General information", - "score": 831.58 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=600.7, mean=600.7, max=600.7, sum=1201.4 (2)", - "tab": "General information", - "score": 600.7 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=499.098, mean=499.098, max=499.098, sum=998.197 (2)", - "tab": "General information", - "score": 499.0982658959538 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=509.265, mean=509.265, max=509.265, sum=1018.529 (2)", - "tab": "General information", - "score": 509.2647058823529 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.676 (2)", - "tab": "Efficiency", - "score": 0.3379603147506714 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=379.64, mean=379.64, max=379.64, sum=759.28 (2)", 
- "tab": "General information", - "score": 379.64 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.544, - "details": { - "description": "min=0.544, mean=0.544, max=0.544, sum=1.088 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.3857871189452054 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=620.939, mean=620.939, max=620.939, sum=1241.877 (2)", - "tab": "General information", - "score": 620.938596491228 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.669 (2)", - "tab": "Efficiency", - "score": 0.3347077107429504 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=422.06, mean=422.06, max=422.06, sum=844.12 (2)", - "tab": "General information", - "score": 422.06 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.3512495689921909 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.713, mean=387.713, max=387.713, sum=775.426 (2)", - "tab": "General information", - "score": 387.712962962963 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.659 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.34987031455208634 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.09, mean=322.09, max=322.09, sum=644.18 (2)", - "tab": "General information", - "score": 322.09003215434086 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional 
Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.852 (2)", - "tab": "Efficiency", - "score": 0.4260168829384972 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.75 (2)", - "tab": "Efficiency", - "score": 0.3750799666059778 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.501, mean=0.501, max=0.501, sum=1.002 (2)", - "tab": "Efficiency", - "score": 0.501238130839272 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.719 (2)", - "tab": "Efficiency", - "score": 0.3593972987598843 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1118.199, mean=1118.199, max=1118.199, sum=2236.397 (2)", - "tab": "General information", - "score": 1118.1985294117646 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=732.34, mean=732.34, max=732.34, sum=1464.681 (2)", - "tab": "General information", - "score": 732.3404255319149 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1656.969, mean=1656.969, max=1656.969, sum=3313.939 (2)", - "tab": "General information", - "score": 1656.9693611473272 - }, - "Professional Law - # output tokens": { - "description": "min=1, 
mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=574.417, mean=574.417, max=574.417, sum=1148.833 (2)", - "tab": "General information", - "score": 574.4166666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.703 (2)", - "tab": "Efficiency", - "score": 0.3515354657173157 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=421.16, mean=421.16, max=421.16, sum=842.32 (2)", - "tab": "General information", - "score": 421.16 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.737 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.746 (2)", - "tab": "Efficiency", - "score": 0.3729873691734515 - }, - "Astronomy - # eval": { - 
"description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=582.849, mean=582.849, max=582.849, sum=1165.697 (2)", - "tab": "General information", - "score": 582.8486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.81 (2)", - "tab": "Efficiency", - "score": 0.40487982749938967 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.87, mean=562.87, max=562.87, sum=1125.74 (2)", - "tab": "General information", - "score": 562.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.668 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.698 (2)", - "tab": "Efficiency", - "score": 0.34907986892844145 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, 
sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=393.623, mean=393.623, max=393.623, sum=787.245 (2)", - "tab": "General information", - "score": 393.62264150943395 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.821, - "details": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.643 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.3290608903194996 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=298.494, mean=298.494, max=298.494, sum=596.987 (2)", - "tab": "General information", - "score": 298.4936170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.559 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.646 (2)", - "tab": "Efficiency", - "score": 0.32275488458830737 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, 
- "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=456.8, mean=456.8, max=456.8, sum=913.6 (2)", - "tab": "General information", - "score": 456.8 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.364848568325951 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=570.119, mean=570.119, max=570.119, sum=1140.238 (2)", - "tab": "General information", - "score": 570.1190476190476 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.556, - "details": { - "description": "min=0.556, mean=0.556, max=0.556, sum=1.111 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.718 (2)", - "tab": "Efficiency", - "score": 0.3588152726491292 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=597.667, mean=597.667, max=597.667, sum=1195.333 (2)", - "tab": "General information", - "score": 597.6666666666666 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.899, - "details": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.797 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.729 (2)", - "tab": "Efficiency", - "score": 0.3646186044139247 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.731 (2)", - "tab": "Efficiency", - "score": 0.36553433728335527 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.761 (2)", - "tab": "Efficiency", - "score": 0.38066073894500735 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.426 (2)", - "tab": "Efficiency", - "score": 0.7130387075019605 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Efficiency", - "score": 0.36007895975401905 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.3358402029837969 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.663 (2)", - "tab": "Efficiency", - "score": 0.3316040589259221 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.747 (2)", - "tab": "Efficiency", - "score": 0.3736002833754928 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.649 (2)", - "tab": "Efficiency", - "score": 0.32468783655086486 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.785 (2)", - "tab": "Efficiency", - "score": 0.3924832533526894 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.3602875184575352 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.399, mean=0.399, max=0.399, sum=0.798 (2)", - "tab": "Efficiency", - "score": 0.39876955968362315 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.107 (2)", - 
"tab": "Efficiency", - "score": 0.5536784272567898 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.475, mean=0.475, max=0.475, sum=0.949 (2)", - "tab": "Efficiency", - "score": 0.474577054695741 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.916, mean=506.916, max=506.916, sum=1013.832 (2)", - "tab": "General information", - "score": 506.9161290322581 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=510.261, mean=510.261, max=510.261, sum=1020.522 (2)", - "tab": "General information", - "score": 510.2610837438424 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=871.46, mean=871.46, max=871.46, sum=1742.92 (2)", - "tab": "General information", - "score": 871.46 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2807.903, mean=2807.903, max=2807.903, sum=5615.806 (2)", - "tab": "General information", - "score": 2807.9030303030304 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - 
"score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.217, mean=365.217, max=365.217, sum=730.434 (2)", - "tab": "General information", - "score": 365.2171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=460.311, mean=460.311, max=460.311, sum=920.622 (2)", - "tab": "General information", - "score": 460.31088082901556 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=367.349, mean=367.349, max=367.349, sum=734.697 (2)", - "tab": "General information", - "score": 367.34871794871793 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=558.326, mean=558.326, max=558.326, sum=1116.652 (2)", - "tab": "General information", - "score": 558.325925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, 
- "High School Microeconomics - # prompt tokens": { - "description": "min=395.277, mean=395.277, max=395.277, sum=790.555 (2)", - "tab": "General information", - "score": 395.2773109243698 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=573.536, mean=573.536, max=573.536, sum=1147.073 (2)", - "tab": "General information", - "score": 573.5364238410596 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.521, mean=488.521, max=488.521, sum=977.042 (2)", - "tab": "General information", - "score": 488.52110091743117 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=823.477, mean=823.477, max=823.477, sum=1646.954 (2)", - "tab": "General information", - "score": 823.4768518518518 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2230.176, mean=2230.176, max=2230.176, sum=4460.353 (2)", - "tab": "General information", - "score": 2230.176470588235 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": 
"min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1441.354, mean=1441.354, max=1441.354, sum=2882.709 (2)", - "tab": "General information", - "score": 1441.3544303797469 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.756 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.692 (2)", - "tab": "Efficiency", - "score": 0.34584820110167086 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.714 (2)", - "tab": "Efficiency", - "score": 0.35706568856275717 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=315.121, mean=315.121, max=315.121, sum=630.242 (2)", - "tab": "General information", - "score": 315.1210762331838 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.504, mean=334.504, max=334.504, sum=669.008 (2)", - "tab": "General information", - "score": 334.5038167938931 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.909, - "details": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.818 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.75 (2)", - "tab": "Efficiency", - "score": 0.37501588931753616 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=633.579, mean=633.579, max=633.579, sum=1267.157 (2)", - "tab": "General information", - "score": 633.5785123966942 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.694 (2)", - "tab": "Efficiency", - "score": 0.34693217131257786 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.632, mean=442.632, max=442.632, sum=885.264 (2)", - "tab": "General information", - "score": 442.6319018404908 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.339 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.719 (2)", - "tab": "Efficiency", - "score": 0.3595333376101085 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=674.848, mean=674.848, max=674.848, sum=1349.696 (2)", - "tab": "General information", - "score": 674.8482142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.709 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.692 (2)", - "tab": "Efficiency", - "score": 0.3462491313230644 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.854, mean=276.854, max=276.854, sum=553.709 (2)", - "tab": "General information", - "score": 276.8543689320388 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.897 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.3498607089376857 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.415, mean=397.415, max=397.415, sum=794.829 (2)", - "tab": "General information", - "score": 397.4145299145299 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.3427603816986084 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=335.35, mean=335.35, max=335.35, sum=670.7 (2)", - "tab": "General information", - "score": 335.35 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.921, - 
"details": { - "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.3433326785744074 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=296.7, mean=296.7, max=296.7, sum=593.4 (2)", - "tab": "General information", - "score": 296.6998722860792 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669, - "details": { - "description": "min=0.669, mean=0.669, max=0.669, sum=1.339 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.34657375729841994 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.749 (2)", - "tab": "Efficiency", - "score": 0.37438980161144747 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.182, mean=469.182, max=469.182, sum=938.364 (2)", - "tab": "General information", - "score": 469.1820809248555 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=661.494, mean=661.494, max=661.494, sum=1322.988 (2)", - "tab": "General information", - "score": 661.4938547486033 - }, - "Moral Scenarios - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.719 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.744 (2)", - "tab": "Efficiency", - "score": 0.3719378265680051 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=592.637, mean=592.637, max=592.637, sum=1185.275 (2)", - "tab": "General information", - "score": 592.6372549019608 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Efficiency", - "score": 0.35996099313100177 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=521.364, mean=521.364, max=521.364, sum=1042.728 (2)", - "tab": "General information", - "score": 521.3641975308642 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Efficiency", - "score": 0.340008375861428 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=401.427, mean=401.427, max=401.427, sum=802.855 (2)", - "tab": "General information", - "score": 401.42727272727274 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.649 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.864 (2)", - "tab": "Efficiency", - "score": 0.43211937923820654 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1159.931, mean=1159.931, max=1159.931, sum=2319.861 (2)", - "tab": "General information", - "score": 1159.930612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.707 (2)", - "tab": "Efficiency", - "score": 0.35334858491053034 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=443.1, mean=443.1, max=443.1, sum=886.199 (2)", - "tab": "General information", - "score": 443.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.676 (2)", - "tab": "Efficiency", - "score": 0.33793931696788376 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.819, mean=336.819, max=336.819, sum=673.639 (2)", - "tab": "General information", - "score": 336.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.766 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.716 (2)", - "tab": "Efficiency", - "score": 0.358185218788727 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=269.07, mean=269.07, max=269.07, sum=538.14 (2)", - "tab": "General information", - "score": 269.0701754385965 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json b/data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json deleted file mode 100644 index 166da7894..000000000 --- a/data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-7b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 7B", - "id": "qwen/qwen1.5-7b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.626, - "details": { - "description": "min=0.364, mean=0.626, max=0.863, sum=71.339 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.269, mean=0.302, max=0.42, sum=34.377 (114)", - "tab": "Efficiency", - "score": 0.3015485066726155 - }, - "MMLU All 
Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=269.07, mean=618.598, max=2807.903, sum=70520.198 (114)", - "tab": "General information", - "score": 618.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", 
- "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.562 (2)", - "tab": "Efficiency", - "score": 0.28086970567703246 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=371.19, mean=371.19, max=371.19, sum=742.38 (2)", - "tab": "General information", - "score": 371.19 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.526, - "details": { - "description": "min=0.526, mean=0.526, max=0.526, sum=1.052 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.572 (2)", - "tab": "Efficiency", - "score": 0.2861745004300718 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)", - "tab": "General information", - "score": 346.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - 
"method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.471, - "details": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.941 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.592 (2)", - "tab": "Efficiency", - "score": 0.2962386703491211 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.624 (2)", - "tab": "Efficiency", - "score": 0.3117961171600554 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.299501326084137 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.3033126187324524 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.577 (2)", - "tab": "Efficiency", - "score": 0.2886359746745556 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.32153993026882993 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=561.25, mean=561.25, max=561.25, sum=1122.5 (2)", - "tab": "General information", - "score": 561.25 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=479.979, mean=479.979, max=479.979, sum=959.958 (2)", - "tab": "General information", - "score": 479.9791666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - 
}, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=831.58, mean=831.58, max=831.58, sum=1663.16 (2)", - "tab": "General information", - "score": 831.58 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=600.7, mean=600.7, max=600.7, sum=1201.4 (2)", - "tab": "General information", - "score": 600.7 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=499.098, mean=499.098, max=499.098, sum=998.197 (2)", - "tab": "General information", - "score": 499.0982658959538 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=509.265, mean=509.265, max=509.265, sum=1018.529 (2)", - "tab": "General information", - "score": 509.2647058823529 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Computer Security - Observed 
inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.597 (2)", - "tab": "Efficiency", - "score": 0.2982983756065369 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=379.64, mean=379.64, max=379.64, sum=759.28 (2)", - "tab": "General information", - "score": 379.64 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.447, - "details": { - "description": "min=0.447, mean=0.447, max=0.447, sum=0.895 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.283, mean=0.283, max=0.283, sum=0.566 (2)", - "tab": "Efficiency", - "score": 0.282820323057342 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=620.939, mean=620.939, max=620.939, sum=1241.877 (2)", - "tab": "General information", - "score": 620.938596491228 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.2939557838439941 - }, - "Global Facts - # eval": { - "description": 
"min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=422.06, mean=422.06, max=422.06, sum=844.12 (2)", - "tab": "General information", - "score": 422.06 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.593 (2)", - "tab": "Efficiency", - "score": 0.2966193402255023 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.713, mean=387.713, max=387.713, sum=775.426 (2)", - "tab": "General information", - "score": 387.712962962963 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.383 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.28725898534155353 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.09, mean=322.09, max=322.09, sum=644.18 (2)", - "tab": "General information", - "score": 322.09003215434086 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603, - "details": { - "description": "min=0.603, mean=0.603, max=0.603, sum=1.206 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.617 (2)", - "tab": "Efficiency", - "score": 0.30863527515355277 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.585 (2)", - "tab": "Efficiency", - "score": 0.2926285613513162 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.32274515889925004 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.30344173058964846 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1118.199, mean=1118.199, max=1118.199, sum=2236.397 (2)", - "tab": "General information", - "score": 1118.1985294117646 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=732.34, mean=732.34, max=732.34, sum=1464.681 (2)", - "tab": "General information", - "score": 732.3404255319149 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 
1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1656.969, mean=1656.969, max=1656.969, sum=3313.939 (2)", - "tab": "General information", - "score": 1656.9693611473272 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=574.417, mean=574.417, max=574.417, sum=1148.833 (2)", - "tab": "General information", - "score": 574.4166666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.578 (2)", - "tab": "Efficiency", - "score": 0.28910151720046995 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=421.16, mean=421.16, max=421.16, sum=842.32 (2)", - "tab": "General information", - "score": 421.16 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671, - "details": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.342 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Efficiency", - "score": 0.30717346699614273 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=582.849, mean=582.849, max=582.849, sum=1165.697 (2)", - "tab": "General information", - "score": 582.8486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": "Efficiency", - "score": 0.3062057161331177 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.87, mean=562.87, max=562.87, sum=1125.74 (2)", - "tab": "General information", - "score": 562.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.381 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.589 (2)", - "tab": "Efficiency", - "score": 0.2947473319071644 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=393.623, mean=393.623, max=393.623, sum=787.245 (2)", - "tab": "General information", - "score": 393.62264150943395 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.157 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.561 (2)", - "tab": "Efficiency", - "score": 0.2803657531738281 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=298.494, mean=298.494, max=298.494, sum=596.987 (2)", - "tab": "General information", - "score": 298.4936170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - "details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.539 (2)", - "tab": "Efficiency", - "score": 0.2693853361853238 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=456.8, mean=456.8, max=456.8, sum=913.6 (2)", - "tab": "General information", - "score": 456.8 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.2938981220204994 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=570.119, mean=570.119, max=570.119, sum=1140.238 (2)", - "tab": "General information", - "score": 570.1190476190476 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.397, - "details": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.601 (2)", - "tab": "Efficiency", - "score": 0.300293557227604 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=597.667, mean=597.667, max=597.667, sum=1195.333 (2)", - "tab": "General information", - "score": 597.6666666666666 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.578 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.30256526470184325 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.585 (2)", - "tab": "Efficiency", - "score": 0.29262745321677824 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.3042095494270325 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.839 (2)", - "tab": "Efficiency", - "score": 0.4195035573207971 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.3027432386321251 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.589 (2)", - "tab": "Efficiency", - "score": 0.29444977903613156 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.582 (2)", - "tab": "Efficiency", - "score": 0.2909054010342329 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.585 (2)", - "tab": "Efficiency", - "score": 0.29262985565044264 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 
0.3041165916859603 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.603 (2)", - "tab": "Efficiency", - "score": 0.3013988425400083 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.3090610066685108 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.635 (2)", - "tab": "Efficiency", - "score": 0.31764531577074967 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.727 (2)", - "tab": "Efficiency", - "score": 0.3635554044854407 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.646 (2)", - "tab": "Efficiency", - "score": 0.32297819073190165 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.916, mean=506.916, max=506.916, sum=1013.832 (2)", - "tab": "General information", - "score": 506.9161290322581 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=510.261, mean=510.261, max=510.261, sum=1020.522 (2)", - "tab": "General information", - "score": 510.2610837438424 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=871.46, mean=871.46, max=871.46, sum=1742.92 (2)", - "tab": "General information", - "score": 871.46 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, 
max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2807.903, mean=2807.903, max=2807.903, sum=5615.806 (2)", - "tab": "General information", - "score": 2807.9030303030304 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.217, mean=365.217, max=365.217, sum=730.434 (2)", - "tab": "General information", - "score": 365.2171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=460.311, mean=460.311, max=460.311, sum=920.622 (2)", - "tab": "General information", - "score": 460.31088082901556 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=367.349, mean=367.349, max=367.349, sum=734.697 (2)", - "tab": "General information", - "score": 367.34871794871793 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=558.326, 
mean=558.326, max=558.326, sum=1116.652 (2)", - "tab": "General information", - "score": 558.325925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=395.277, mean=395.277, max=395.277, sum=790.555 (2)", - "tab": "General information", - "score": 395.2773109243698 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=573.536, mean=573.536, max=573.536, sum=1147.073 (2)", - "tab": "General information", - "score": 573.5364238410596 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.521, mean=488.521, max=488.521, sum=977.042 (2)", - "tab": "General information", - "score": 488.52110091743117 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=823.477, mean=823.477, max=823.477, sum=1646.954 (2)", - "tab": "General information", - "score": 823.4768518518518 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - 
"score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2230.176, mean=2230.176, max=2230.176, sum=4460.353 (2)", - "tab": "General information", - "score": 2230.176470588235 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1441.354, mean=1441.354, max=1441.354, sum=2882.709 (2)", - "tab": "General information", - "score": 1441.3544303797469 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.695, - "details": { - "description": "min=0.695, mean=0.695, max=0.695, sum=1.389 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.578 (2)", - "tab": "Efficiency", - "score": 0.28891397057092777 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.2980237170940137 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=315.121, mean=315.121, max=315.121, sum=630.242 (2)", - "tab": "General information", - "score": 315.1210762331838 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, 
max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.504, mean=334.504, max=334.504, sum=669.008 (2)", - "tab": "General information", - "score": 334.5038167938931 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.521 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.2993730572629566 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=633.579, mean=633.579, max=633.579, sum=1267.157 (2)", - "tab": "General information", - "score": 633.5785123966942 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.411 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.283, mean=0.283, max=0.283, sum=0.566 (2)", - "tab": "Efficiency", - "score": 0.28320794456575543 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.632, mean=442.632, max=442.632, sum=885.264 (2)", - "tab": "General information", - "score": 442.6319018404908 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411, - "details": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.821 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.583 (2)", - "tab": "Efficiency", - "score": 0.2917012700012752 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=674.848, mean=674.848, max=674.848, sum=1349.696 (2)", - "tab": "General information", - "score": 674.8482142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.631 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.3037459641984365 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": 
"min=276.854, mean=276.854, max=276.854, sum=553.709 (2)", - "tab": "General information", - "score": 276.8543689320388 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=1.726 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.30402050364730704 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.415, mean=397.415, max=397.415, sum=794.829 (2)", - "tab": "General information", - "score": 397.4145299145299 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.3079418969154358 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=335.35, mean=335.35, max=335.35, sum=670.7 (2)", - "tab": "General information", - "score": 335.35 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.765, - "details": { - "description": "min=0.765, mean=0.765, max=0.765, sum=1.53 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.2874623727372171 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=296.7, mean=296.7, max=296.7, sum=593.4 (2)", - "tab": "General information", - "score": 296.6998722860792 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372, - "details": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.744 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.587 (2)", - "tab": "Efficiency", - "score": 0.29359787530292664 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.2979323072806417 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.182, mean=469.182, max=469.182, sum=938.364 (2)", - "tab": "General information", - "score": 469.1820809248555 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=661.494, mean=661.494, max=661.494, sum=1322.988 (2)", - "tab": "General information", - "score": 661.4938547486033 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.586 (2)", - "tab": "Efficiency", - "score": 0.29277056572484034 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=592.637, mean=592.637, max=592.637, sum=1185.275 (2)", - "tab": "General information", - "score": 592.6372549019608 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688, - "details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.377 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.30120949097621585 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": 
{ - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=521.364, mean=521.364, max=521.364, sum=1042.728 (2)", - "tab": "General information", - "score": 521.3641975308642 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627, - "details": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.255 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.30815364880995316 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=401.427, mean=401.427, max=401.427, sum=802.855 (2)", - "tab": "General information", - "score": 401.42727272727274 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.453 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.592 (2)", - "tab": "Efficiency", - "score": 0.2958566675380785 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1159.931, mean=1159.931, max=1159.931, sum=2319.861 (2)", - "tab": "General information", - "score": 1159.930612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.672 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.29908941278410195 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=443.1, mean=443.1, max=443.1, sum=886.199 (2)", - "tab": "General information", - "score": 443.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488, - "details": { - "description": "min=0.488, mean=0.488, max=0.488, sum=0.976 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.572 (2)", - "tab": "Efficiency", - "score": 0.2861345144639532 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.819, mean=336.819, max=336.819, sum=673.639 (2)", - "tab": "General information", - 
"score": 336.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.3150970712739822 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=269.07, mean=269.07, max=269.07, sum=538.14 (2)", - "tab": "General information", - "score": 269.0701754385965 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json b/data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json deleted file mode 100644 index 6f8b955e0..000000000 --- a/data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen2-72b-instruct/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2 Instruct 72B", - "id": "qwen/qwen2-72b-instruct", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ 
- { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.52, mean=0.824, max=0.979, sum=93.879 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.195, mean=0.359, max=2.502, sum=40.898 (114)", - "tab": "Efficiency", - "score": 0.3587521754503106 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=276.07, mean=625.598, max=2814.903, sum=71318.198 (114)", - "tab": "General information", - "score": 625.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - 
"mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.34 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.79 (2)", - "tab": "Efficiency", - "score": 0.3948828268051148 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=378.19, mean=378.19, max=378.19, sum=756.38 (2)", - "tab": "General information", - "score": 378.19 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.585 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.531 (2)", - "tab": "Efficiency", - "score": 0.2657013893127441 - }, - "Anatomy - # eval": { 
- "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.978, mean=353.978, max=353.978, sum=707.956 (2)", - "tab": "General information", - "score": 353.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598, - "details": { - "description": "min=0.598, mean=0.598, max=0.598, sum=1.196 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.498 (2)", - "tab": "Efficiency", - "score": 0.24894725322723388 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.2977961285246743 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 (2)", - "tab": "Efficiency", - "score": 0.3207618069648743 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.667 (2)", - "tab": "Efficiency", - "score": 0.3337481117248535 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.468 (2)", - "tab": "Efficiency", - "score": 0.2340707227673834 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Efficiency", - "score": 0.25010308097390566 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=568.25, mean=568.25, max=568.25, sum=1136.5 (2)", - "tab": "General information", - "score": 568.25 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 
5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=486.979, mean=486.979, max=486.979, sum=973.958 (2)", - "tab": "General information", - "score": 486.9791666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.58, mean=838.58, max=838.58, sum=1677.16 (2)", - "tab": "General information", - "score": 838.58 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=607.7, mean=607.7, max=607.7, sum=1215.4 (2)", - "tab": "General information", - "score": 607.7 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=506.098, mean=506.098, max=506.098, sum=1012.197 (2)", - "tab": "General information", - "score": 506.0982658959538 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=516.265, mean=516.265, max=516.265, sum=1032.529 (2)", - "tab": "General information", - "score": 516.2647058823529 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.563 (2)", - "tab": "Efficiency", - "score": 0.2812828135490417 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=386.64, mean=386.64, max=386.64, sum=773.28 (2)", - "tab": "General information", - "score": 386.64 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.737, - "details": { - "description": "min=0.737, mean=0.737, max=0.737, sum=1.474 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.265, mean=0.265, max=0.265, sum=0.53 (2)", - "tab": "Efficiency", - "score": 0.26492034552390115 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=627.939, mean=627.939, max=627.939, sum=1255.877 (2)", - "tab": "General information", - "score": 627.938596491228 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - 
"evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.507 (2)", - "tab": "Efficiency", - "score": 0.25351563215255735 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=429.06, mean=429.06, max=429.06, sum=858.12 (2)", - "tab": "General information", - "score": 429.06 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.513 (2)", - "tab": "Efficiency", - "score": 0.256509714656406 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.713, mean=394.713, max=394.713, sum=789.426 (2)", - "tab": "General information", - "score": 394.712962962963 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.717 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.409 (2)", - "tab": "Efficiency", - "score": 0.20427469348600824 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.09, mean=329.09, max=329.09, sum=658.18 (2)", - "tab": "General information", - "score": 329.09003215434086 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.771 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.507, mean=0.507, max=0.507, sum=1.014 (2)", - "tab": "Efficiency", - "score": 0.5070785135030746 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.31040529579135545 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.814 (2)", - "tab": "Efficiency", - "score": 0.40680916352875074 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.32369842482548133 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1125.199, mean=1125.199, max=1125.199, sum=2250.397 (2)", - "tab": "General information", - "score": 1125.1985294117646 - }, - "Professional Medicine - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=739.34, mean=739.34, max=739.34, sum=1478.681 (2)", - "tab": "General information", - "score": 739.3404255319149 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1663.969, mean=1663.969, max=1663.969, sum=3327.939 (2)", - "tab": "General information", - "score": 1663.9693611473272 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=581.417, mean=581.417, max=581.417, sum=1162.833 (2)", - "tab": "General information", - "score": 581.4166666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.195, mean=0.195, max=0.195, sum=0.389 (2)", - "tab": "Efficiency", - "score": 0.19451653003692626 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, 
- "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=428.16, mean=428.16, max=428.16, sum=856.32 (2)", - "tab": "General information", - "score": 428.16 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.641 (2)", - "tab": "Efficiency", - "score": 0.32045089571099533 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=589.849, mean=589.849, max=589.849, sum=1179.697 (2)", - "tab": "General information", - "score": 589.8486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.701 (2)", - "tab": "Efficiency", - "score": 0.350736882686615 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.87, mean=569.87, max=569.87, sum=1139.74 (2)", - "tab": "General information", - "score": 569.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Efficiency", - "score": 0.2597639983555056 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=400.623, mean=400.623, max=400.623, sum=801.245 (2)", - "tab": "General information", - "score": 400.62264150943395 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.872, mean=0.872, max=0.872, sum=1.745 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.484 (2)", - "tab": "Efficiency", - "score": 0.2420806296328281 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - 
"description": "min=305.494, mean=305.494, max=305.494, sum=610.987 (2)", - "tab": "General information", - "score": 305.4936170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.586 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.47 (2)", - "tab": "Efficiency", - "score": 0.23504354542699354 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=463.8, mean=463.8, max=463.8, sum=927.6 (2)", - "tab": "General information", - "score": 463.8 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.651 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.261, mean=0.261, max=0.261, sum=0.523 (2)", - "tab": "Efficiency", - "score": 0.2613614286695208 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=577.119, 
mean=577.119, max=577.119, sum=1154.238 (2)", - "tab": "General information", - "score": 577.1190476190476 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.666 (2)", - "tab": "Efficiency", - "score": 0.3330562947288392 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=604.667, mean=604.667, max=604.667, sum=1209.333 (2)", - "tab": "General information", - "score": 604.6666666666666 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.865 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.247, mean=0.247, max=0.247, sum=0.495 (2)", - "tab": "Efficiency", - "score": 0.24744614170443627 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.3010592906933113 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.581 (2)", - "tab": "Efficiency", - "score": 0.2903395962715149 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.629, mean=0.629, max=0.629, sum=1.258 (2)", - "tab": "Efficiency", - "score": 0.6291334065524015 - }, - "High School Geography - Observed inference time (s)": 
{ - "description": "min=0.457, mean=0.457, max=0.457, sum=0.913 (2)", - "tab": "Efficiency", - "score": 0.4567244630871397 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.498 (2)", - "tab": "Efficiency", - "score": 0.24882311524504824 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.245, mean=0.245, max=0.245, sum=0.489 (2)", - "tab": "Efficiency", - "score": 0.24466082010513696 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.514 (2)", - "tab": "Efficiency", - "score": 0.2570408988881994 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.539 (2)", - "tab": "Efficiency", - "score": 0.26973113893460826 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.57 (2)", - "tab": "Efficiency", - "score": 0.2847776444542487 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.641 (2)", - "tab": "Efficiency", - "score": 0.32032192956416977 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.714 (2)", - "tab": "Efficiency", - "score": 0.3567825931089896 - }, - "High School US History - Observed inference time (s)": { - "description": "min=2.502, mean=2.502, max=2.502, sum=5.003 (2)", - "tab": "Efficiency", - "score": 2.501642145362555 - }, - "High School World History - Observed inference time (s)": { - "description": "min=2.182, mean=2.182, max=2.182, sum=4.364 (2)", - "tab": "Efficiency", - "score": 2.18210094890514 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.916, mean=513.916, max=513.916, sum=1027.832 (2)", - "tab": "General information", - "score": 513.916129032258 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=517.261, mean=517.261, max=517.261, sum=1034.522 (2)", - "tab": "General information", - "score": 517.2610837438424 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer 
Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=878.46, mean=878.46, max=878.46, sum=1756.92 (2)", - "tab": "General information", - "score": 878.46 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2814.903, mean=2814.903, max=2814.903, sum=5629.806 (2)", - "tab": "General information", - "score": 2814.9030303030304 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.217, mean=372.217, max=372.217, sum=744.434 (2)", - "tab": "General information", - "score": 372.2171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=467.311, mean=467.311, max=467.311, sum=934.622 (2)", - "tab": "General information", - "score": 467.31088082901556 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School 
Macroeconomics - # prompt tokens": { - "description": "min=374.349, mean=374.349, max=374.349, sum=748.697 (2)", - "tab": "General information", - "score": 374.34871794871793 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=565.326, mean=565.326, max=565.326, sum=1130.652 (2)", - "tab": "General information", - "score": 565.325925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=402.277, mean=402.277, max=402.277, sum=804.555 (2)", - "tab": "General information", - "score": 402.2773109243698 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=580.536, mean=580.536, max=580.536, sum=1161.073 (2)", - "tab": "General information", - "score": 580.5364238410596 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.521, mean=495.521, max=495.521, sum=991.042 (2)", - "tab": "General information", - "score": 495.52110091743117 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": 
"min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=830.477, mean=830.477, max=830.477, sum=1660.954 (2)", - "tab": "General information", - "score": 830.4768518518518 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2237.176, mean=2237.176, max=2237.176, sum=4474.353 (2)", - "tab": "General information", - "score": 2237.176470588235 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1448.354, mean=1448.354, max=1448.354, sum=2896.709 (2)", - "tab": "General information", - "score": 1448.3544303797469 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.55 (2)", - "tab": "Efficiency", - "score": 0.2751739634526685 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.32726097470931426 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, 
sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=322.121, mean=322.121, max=322.121, sum=644.242 (2)", - "tab": "General information", - "score": 322.1210762331838 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.504, mean=341.504, max=341.504, sum=683.008 (2)", - "tab": "General information", - "score": 341.5038167938931 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.785 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.2972275757592572 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=640.579, mean=640.579, max=640.579, sum=1281.157 (2)", - "tab": "General information", - "score": 640.5785123966942 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.828 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.218, mean=0.218, max=0.218, sum=0.436 (2)", - "tab": "Efficiency", - "score": 0.21798631311194297 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.632, mean=449.632, max=449.632, sum=899.264 (2)", - "tab": "General information", - "score": 449.6319018404908 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=1.536 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.223, mean=0.223, max=0.223, sum=0.446 (2)", - "tab": "Efficiency", - "score": 0.22287436042513167 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=681.848, mean=681.848, max=681.848, sum=1363.696 (2)", - "tab": "General information", - "score": 681.8482142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.239, mean=0.239, max=0.239, sum=0.478 (2)", - "tab": "Efficiency", - "score": 0.23922002662732764 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.854, mean=283.854, max=283.854, sum=567.709 (2)", - "tab": "General information", - "score": 283.8543689320388 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.953, - "details": { - "description": "min=0.953, mean=0.953, max=0.953, sum=1.906 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.514 (2)", - "tab": "Efficiency", - "score": 0.2568996777901283 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.415, mean=404.415, max=404.415, sum=808.829 (2)", - "tab": "General information", - "score": 404.4145299145299 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": 
"min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.534 (2)", - "tab": "Efficiency", - "score": 0.26675461292266844 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=342.35, mean=342.35, max=342.35, sum=684.7 (2)", - "tab": "General information", - "score": 342.35 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.943, - "details": { - "description": "min=0.943, mean=0.943, max=0.943, sum=1.885 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.453 (2)", - "tab": "Efficiency", - "score": 0.22672867470469663 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=303.7, mean=303.7, max=303.7, sum=607.4 (2)", - "tab": "General information", - "score": 303.6998722860792 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=1.629 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.237, mean=0.237, max=0.237, sum=0.473 
(2)", - "tab": "Efficiency", - "score": 0.23662481900584492 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.483 (2)", - "tab": "Efficiency", - "score": 0.241705964264257 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.182, mean=476.182, max=476.182, sum=952.364 (2)", - "tab": "General information", - "score": 476.1820809248555 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=668.494, mean=668.494, max=668.494, sum=1336.988 (2)", - "tab": "General information", - "score": 668.4938547486033 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.804 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Efficiency", - "score": 0.2500531182569616 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=599.637, mean=599.637, max=599.637, sum=1199.275 (2)", - "tab": "General information", - "score": 599.6372549019608 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.827 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.515 (2)", - "tab": "Efficiency", - "score": 0.25728267504845137 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=528.364, mean=528.364, max=528.364, sum=1056.728 (2)", - "tab": "General information", - "score": 528.3641975308642 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.219, mean=0.219, max=0.219, sum=0.437 (2)", - "tab": "Efficiency", - "score": 0.2186152393167669 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=408.427, mean=408.427, max=408.427, sum=816.855 (2)", - "tab": "General information", - "score": 408.42727272727274 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.837, - "details": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.673 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.29758678261114624 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1166.931, mean=1166.931, max=1166.931, sum=2333.861 (2)", - "tab": "General information", - "score": 1166.930612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.935, - "details": { - "description": "min=0.935, mean=0.935, max=0.935, sum=1.871 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.228, mean=0.228, max=0.228, sum=0.457 (2)", - "tab": "Efficiency", - "score": 0.22830370172339293 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=450.1, mean=450.1, max=450.1, sum=900.199 (2)", - "tab": "General information", - "score": 450.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.499 (2)", - "tab": "Efficiency", - "score": 0.24956520206956978 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.819, mean=343.819, max=343.819, sum=687.639 (2)", - "tab": "General information", - "score": 343.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.197, mean=0.197, max=0.197, sum=0.394 (2)", - "tab": "Efficiency", - "score": 0.19691006342569986 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=276.07, mean=276.07, max=276.07, sum=552.14 (2)", - "tab": "General information", - "score": 276.0701754385965 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "tab": 
"Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json b/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json deleted file mode 100644 index a61d620fd..000000000 --- a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen2.5-72b-instruct-turbo/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5 Instruct Turbo 72B", - "id": "qwen/qwen2.5-72b-instruct-turbo", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.584, mean=0.834, max=0.99, sum=95.044 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.277, mean=0.504, max=1.68, sum=57.492 (114)", - "tab": "Efficiency", - "score": 0.5043123259817794 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=276.07, mean=625.598, max=2814.903, sum=71318.198 (114)", - "tab": "General information", - "score": 625.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - 
"logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "details": { - "description": "min=0.68, mean=0.68, max=0.68, sum=1.36 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.877 (2)", - "tab": "Efficiency", - "score": 0.438259596824646 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=378.19, mean=378.19, max=378.19, sum=756.38 (2)", - "tab": "General information", - "score": 378.19 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.729 (2)", - "tab": "Efficiency", - "score": 0.3645249543366609 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.978, mean=353.978, max=353.978, sum=707.956 (2)", - "tab": "General information", - "score": 353.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.588, - "details": { - "description": "min=0.588, mean=0.588, max=0.588, sum=1.176 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.038 (2)", - "tab": "Efficiency", - "score": 0.5187593793869019 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.811 (2)", - "tab": "Efficiency", - "score": 0.40557659500175053 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.97 (2)", - "tab": "Efficiency", - "score": 0.48524248123168945 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.132 (2)", - "tab": "Efficiency", - "score": 0.5662378907203675 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.528, mean=0.528, max=0.528, sum=1.055 (2)", - "tab": "Efficiency", - "score": 0.5277049872227487 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.9 (2)", - "tab": "Efficiency", - "score": 0.4500672326368444 - 
}, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=568.25, mean=568.25, max=568.25, sum=1136.5 (2)", - "tab": "General information", - "score": 568.25 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=486.979, mean=486.979, max=486.979, sum=973.958 (2)", - "tab": "General information", - "score": 486.9791666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.58, mean=838.58, max=838.58, sum=1677.16 (2)", - "tab": "General information", - "score": 838.58 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=607.7, mean=607.7, max=607.7, sum=1215.4 (2)", - "tab": "General information", - "score": 607.7 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=506.098, mean=506.098, 
max=506.098, sum=1012.197 (2)", - "tab": "General information", - "score": 506.0982658959538 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=516.265, mean=516.265, max=516.265, sum=1032.529 (2)", - "tab": "General information", - "score": 516.2647058823529 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=1.011 (2)", - "tab": "Efficiency", - "score": 0.5056298255920411 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=386.64, mean=386.64, max=386.64, sum=773.28 (2)", - "tab": "General information", - "score": 386.64 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.728, mean=0.728, max=0.728, sum=1.456 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.646, 
mean=0.646, max=0.646, sum=1.293 (2)", - "tab": "Efficiency", - "score": 0.6464532927462929 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=627.939, mean=627.939, max=627.939, sum=1255.877 (2)", - "tab": "General information", - "score": 627.938596491228 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "details": { - "description": "min=0.61, mean=0.61, max=0.61, sum=1.22 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.035 (2)", - "tab": "Efficiency", - "score": 0.5174938654899597 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=429.06, mean=429.06, max=429.06, sum=858.12 (2)", - "tab": "General information", - "score": 429.06 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.552, mean=0.552, max=0.552, sum=1.105 (2)", - "tab": "Efficiency", - "score": 0.55242551918383 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 
108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.713, mean=394.713, max=394.713, sum=789.426 (2)", - "tab": "General information", - "score": 394.712962962963 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.839, - "details": { - "description": "min=0.839, mean=0.839, max=0.839, sum=1.678 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=1.352, mean=1.352, max=1.352, sum=2.704 (2)", - "tab": "Efficiency", - "score": 1.3517981679493207 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.09, mean=329.09, max=329.09, sum=658.18 (2)", - "tab": "General information", - "score": 329.09003215434086 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.729 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.02, mean=1.02, max=1.02, sum=2.039 (2)", - "tab": "Efficiency", - "score": 1.019735706203124 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.565 (2)", - "tab": "Efficiency", - "score": 0.2822888328673992 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.68, mean=1.68, max=1.68, sum=3.36 (2)", - "tab": "Efficiency", - 
"score": 1.6800112862630494 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.573, mean=0.573, max=0.573, sum=1.145 (2)", - "tab": "Efficiency", - "score": 0.5726091144910825 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1125.199, mean=1125.199, max=1125.199, sum=2250.397 (2)", - "tab": "General information", - "score": 1125.1985294117646 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=739.34, mean=739.34, max=739.34, sum=1478.681 (2)", - "tab": "General information", - "score": 739.3404255319149 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1663.969, mean=1663.969, max=1663.969, sum=3327.939 (2)", - "tab": "General information", - "score": 1663.9693611473272 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=581.417, mean=581.417, max=581.417, sum=1162.833 (2)", - "tab": "General information", - "score": 581.4166666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.578 (2)", - "tab": "Efficiency", - "score": 0.7888539290428161 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=428.16, mean=428.16, max=428.16, sum=856.32 (2)", - "tab": "General information", - "score": 428.16 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.991, mean=0.991, max=0.991, sum=1.983 (2)", - "tab": "Efficiency", - "score": 0.9913477442766491 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=589.849, mean=589.849, max=589.849, sum=1179.697 (2)", - "tab": "General information", - "score": 589.8486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.581, mean=0.581, max=0.581, sum=1.163 (2)", - "tab": "Efficiency", - "score": 0.5813773083686828 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.87, mean=569.87, max=569.87, sum=1139.74 (2)", - "tab": "General information", - "score": 569.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.872, mean=0.872, max=0.872, sum=1.743 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.48 (2)", - "tab": "Efficiency", - "score": 0.7399316436839554 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=400.623, mean=400.623, max=400.623, sum=801.245 (2)", - "tab": "General information", - "score": 400.62264150943395 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.77 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.32127690010882437 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=305.494, mean=305.494, max=305.494, sum=610.987 (2)", - "tab": "General information", - "score": 305.4936170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.513, mean=0.513, max=0.513, sum=1.026 (2)", - "tab": "Efficiency", - "score": 0.5130313610208446 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=463.8, mean=463.8, max=463.8, sum=927.6 (2)", - "tab": "General information", - "score": 463.8 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary 
Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=1.022, mean=1.022, max=1.022, sum=2.044 (2)", - "tab": "Efficiency", - "score": 1.0221643580330744 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=577.119, mean=577.119, max=577.119, sum=1154.238 (2)", - "tab": "General information", - "score": 577.1190476190476 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.978 (2)", - "tab": "Efficiency", - "score": 0.48887844501979766 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=604.667, mean=604.667, max=604.667, sum=1209.333 (2)", - "tab": "General information", - "score": 604.6666666666666 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.685 (2)", - "tab": "Efficiency", - "score": 0.34227523111527963 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.673 (2)", - "tab": "Efficiency", - "score": 0.3364456193200473 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.38405280351638793 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.582, mean=0.582, max=0.582, sum=1.165 (2)", - "tab": "Efficiency", - "score": 0.5822634451317065 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.731 (2)", - "tab": "Efficiency", - "score": 0.3657490508724945 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.776 (2)", - "tab": "Efficiency", - "score": 0.3882344139672314 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.623 (2)", - "tab": "Efficiency", - "score": 0.31144848542335707 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.727 (2)", - "tab": "Efficiency", - "score": 0.3636930130146168 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Efficiency", - "score": 0.5723558383829453 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.782 (2)", - "tab": "Efficiency", - "score": 0.8909238490047834 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.623 (2)", - "tab": "Efficiency", - "score": 0.31171117397623327 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.751 (2)", - "tab": "Efficiency", - "score": 0.3756344163859332 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.907 (2)", - "tab": "Efficiency", - "score": 0.45333802466299017 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.526, mean=0.526, max=0.526, sum=1.051 (2)", - "tab": "Efficiency", - "score": 0.5255286924949678 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.916, mean=513.916, max=513.916, sum=1027.832 (2)", - "tab": "General information", - "score": 513.916129032258 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - 
}, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=517.261, mean=517.261, max=517.261, sum=1034.522 (2)", - "tab": "General information", - "score": 517.2610837438424 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=878.46, mean=878.46, max=878.46, sum=1756.92 (2)", - "tab": "General information", - "score": 878.46 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2814.903, mean=2814.903, max=2814.903, sum=5629.806 (2)", - "tab": "General information", - "score": 2814.9030303030304 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.217, mean=372.217, max=372.217, sum=744.434 (2)", - "tab": "General information", - "score": 372.2171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=467.311, mean=467.311, max=467.311, sum=934.622 (2)", - "tab": "General information", - "score": 467.31088082901556 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=374.349, mean=374.349, max=374.349, sum=748.697 (2)", - "tab": "General information", - "score": 374.34871794871793 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=565.326, mean=565.326, max=565.326, sum=1130.652 (2)", - "tab": "General information", - "score": 565.325925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=402.277, mean=402.277, max=402.277, sum=804.555 (2)", - "tab": "General information", - "score": 402.2773109243698 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=580.536, mean=580.536, max=580.536, sum=1161.073 (2)", - "tab": "General information", - "score": 580.5364238410596 - }, - "High School Physics - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.521, mean=495.521, max=495.521, sum=991.042 (2)", - "tab": "General information", - "score": 495.52110091743117 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=830.477, mean=830.477, max=830.477, sum=1660.954 (2)", - "tab": "General information", - "score": 830.4768518518518 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2237.176, mean=2237.176, max=2237.176, sum=4474.353 (2)", - "tab": "General information", - "score": 2237.176470588235 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1448.354, mean=1448.354, max=1448.354, sum=2896.709 (2)", - "tab": "General information", - "score": 1448.3544303797469 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.756 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.856 (2)", - "tab": "Efficiency", - "score": 0.42812311168208783 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.635 (2)", - "tab": "Efficiency", - "score": 0.3175856612110866 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=322.121, mean=322.121, max=322.121, sum=644.242 (2)", - "tab": "General information", - "score": 322.1210762331838 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.504, mean=341.504, max=341.504, sum=683.008 (2)", - "tab": "General information", - "score": 341.5038167938931 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.785 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.85 (2)", - "tab": "Efficiency", - "score": 0.4248029200498723 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=640.579, mean=640.579, max=640.579, sum=1281.157 (2)", - "tab": "General information", - "score": 640.5785123966942 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.779 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.692 (2)", - "tab": "Efficiency", - "score": 0.3458571419394089 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.632, mean=449.632, max=449.632, sum=899.264 (2)", - "tab": "General information", - "score": 449.6319018404908 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=1.554 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.3483003888811384 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=681.848, mean=681.848, max=681.848, sum=1363.696 (2)", - "tab": "General information", - "score": 681.8482142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.587 (2)", - "tab": "Efficiency", - "score": 0.2933675108604061 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.854, mean=283.854, max=283.854, sum=567.709 (2)", - "tab": "General information", - "score": 283.8543689320388 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.953, - "details": { - "description": "min=0.953, mean=0.953, max=0.953, sum=1.906 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.475, mean=0.475, max=0.475, sum=0.949 (2)", - "tab": "Efficiency", - "score": 0.4746182779980521 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.415, mean=404.415, max=404.415, sum=808.829 (2)", - "tab": "General 
information", - "score": 404.4145299145299 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.3110049200057983 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=342.35, mean=342.35, max=342.35, sum=684.7 (2)", - "tab": "General information", - "score": 342.35 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.865 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.689 (2)", - "tab": "Efficiency", - "score": 0.3445042967035091 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=303.7, mean=303.7, max=303.7, sum=607.4 (2)", - "tab": "General information", - "score": 303.6998722860792 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - 
}, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.787, - "details": { - "description": "min=0.787, mean=0.787, max=0.787, sum=1.573 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.583 (2)", - "tab": "Efficiency", - "score": 0.2913500532249495 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.641 (2)", - "tab": "Efficiency", - "score": 0.32045427327715487 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.182, mean=476.182, max=476.182, sum=952.364 (2)", - "tab": "General information", - "score": 476.1820809248555 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=668.494, mean=668.494, max=668.494, sum=1336.988 (2)", - "tab": "General information", - "score": 668.4938547486033 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.771 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.585 (2)", - "tab": "Efficiency", - "score": 0.29262306565552754 - }, 
- "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=599.637, mean=599.637, max=599.637, sum=1199.275 (2)", - "tab": "General information", - "score": 599.6372549019608 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.681 (2)", - "tab": "Efficiency", - "score": 0.340311410986347 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=528.364, mean=528.364, max=528.364, sum=1056.728 (2)", - "tab": "General information", - "score": 528.3641975308642 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=1.564 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.554 (2)", - "tab": "Efficiency", - "score": 0.2769838809967041 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=408.427, mean=408.427, max=408.427, sum=816.855 (2)", - "tab": "General information", - "score": 408.42727272727274 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.377, mean=0.377, max=0.377, sum=0.754 (2)", - "tab": "Efficiency", - "score": 0.3771621781952527 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1166.931, mean=1166.931, max=1166.931, sum=2333.861 (2)", - "tab": "General information", - "score": 1166.930612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "details": { - "description": "min=0.925, mean=0.925, max=0.925, sum=1.851 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.582 (2)", - "tab": "Efficiency", - "score": 0.2910151019025205 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=450.1, mean=450.1, max=450.1, sum=900.199 (2)", - "tab": "General information", - "score": 450.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.35115946631833733 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.819, mean=343.819, max=343.819, sum=687.639 (2)", - "tab": "General information", - "score": 343.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.801 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.762 (2)", - "tab": "Efficiency", - "score": 0.3812444461019416 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=276.07, mean=276.07, max=276.07, sum=552.14 (2)", - "tab": "General information", - "score": 276.0701754385965 - }, - "World Religions - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json b/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json deleted file mode 100644 index c045e519d..000000000 --- a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen2.5-7b-instruct-turbo/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5 Instruct Turbo 7B", - "id": "qwen/qwen2.5-7b-instruct-turbo", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.729, - "details": { - "description": "min=0.42, mean=0.729, max=0.919, sum=83.073 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.162, mean=0.242, max=0.44, sum=27.616 (114)", - "tab": "Efficiency", - "score": 0.24224721190343979 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=276.07, mean=625.598, max=2814.903, sum=71318.198 (114)", - "tab": "General information", - "score": 625.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - 
"business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Accuracy", - 
"Abstract Algebra - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.863 (2)", - "tab": "Efficiency", - "score": 0.43148461580276487 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=378.19, mean=378.19, max=378.19, sum=756.38 (2)", - "tab": "General information", - "score": 378.19 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689, - "details": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.378 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.667 (2)", - "tab": "Efficiency", - "score": 0.3332981339207402 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.978, mean=353.978, max=353.978, sum=707.956 (2)", - "tab": "General information", - "score": 353.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.571 (2)", - "tab": "Efficiency", - "score": 0.28538883924484254 - }, - "College Biology - Observed inference time (s)": { - 
"description": "min=0.31, mean=0.31, max=0.31, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.309537861082289 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.604 (2)", - "tab": "Efficiency", - "score": 0.30183048248291017 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.558 (2)", - "tab": "Efficiency", - "score": 0.2791933488845825 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.3032711007002461 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.2996697425842285 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=568.25, mean=568.25, max=568.25, sum=1136.5 (2)", - "tab": "General information", - "score": 568.25 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=486.979, mean=486.979, max=486.979, sum=973.958 (2)", - "tab": "General information", - "score": 486.9791666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.58, mean=838.58, max=838.58, sum=1677.16 (2)", - "tab": "General information", - "score": 838.58 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 
- }, - "College Mathematics - # prompt tokens": { - "description": "min=607.7, mean=607.7, max=607.7, sum=1215.4 (2)", - "tab": "General information", - "score": 607.7 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=506.098, mean=506.098, max=506.098, sum=1012.197 (2)", - "tab": "General information", - "score": 506.0982658959538 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=516.265, mean=516.265, max=516.265, sum=1032.529 (2)", - "tab": "General information", - "score": 516.2647058823529 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.705 (2)", - "tab": "Efficiency", - "score": 0.3522661328315735 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=386.64, mean=386.64, max=386.64, sum=773.28 (2)", - "tab": "General information", - "score": 386.64 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.281 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.691 (2)", - "tab": "Efficiency", - "score": 0.34558368356604324 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=627.939, mean=627.939, max=627.939, sum=1255.877 (2)", - "tab": "General information", - "score": 627.938596491228 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42, - "details": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.314766480922699 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=429.06, mean=429.06, max=429.06, sum=858.12 (2)", - "tab": "General information", - "score": 429.06 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": 
"Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.593 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 (2)", - "tab": "Efficiency", - "score": 0.32116924391852486 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.713, mean=394.713, max=394.713, sum=789.426 (2)", - "tab": "General information", - "score": 394.712962962963 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=1.492 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Efficiency", - "score": 0.4401504610129108 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.09, mean=329.09, max=329.09, sum=658.18 (2)", - "tab": "General information", - "score": 329.09003215434086 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757, - "details": { - "description": "min=0.757, mean=0.757, max=0.757, sum=1.513 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.788 (2)", - "tab": "Efficiency", - "score": 0.393971232806935 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.185, mean=0.185, max=0.185, sum=0.371 (2)", - "tab": "Efficiency", - "score": 0.18525678553479782 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.409 (2)", - "tab": "Efficiency", - "score": 0.20459390463698485 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.166, mean=0.166, max=0.166, sum=0.332 (2)", - "tab": "Efficiency", - "score": 0.16597708611706502 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1125.199, mean=1125.199, max=1125.199, sum=2250.397 (2)", - "tab": "General information", - "score": 1125.1985294117646 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=739.34, mean=739.34, max=739.34, sum=1478.681 (2)", - "tab": "General information", - "score": 739.3404255319149 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1663.969, mean=1663.969, max=1663.969, sum=3327.939 (2)", - "tab": "General information", - "score": 1663.9693611473272 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional 
Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=581.417, mean=581.417, max=581.417, sum=1162.833 (2)", - "tab": "General information", - "score": 581.4166666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.33019849777221677 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=428.16, mean=428.16, max=428.16, sum=856.32 (2)", - "tab": "General information", - "score": 428.16 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.671 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.629 (2)", - "tab": "Efficiency", - "score": 0.3143457660549565 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - 
"score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=589.849, mean=589.849, max=589.849, sum=1179.697 (2)", - "tab": "General information", - "score": 589.8486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.615 (2)", - "tab": "Efficiency", - "score": 0.3076848840713501 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.87, mean=569.87, max=569.87, sum=1139.74 (2)", - "tab": "General information", - "score": 569.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.57 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Efficiency", - "score": 0.33518469288664043 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=400.623, mean=400.623, max=400.623, sum=801.245 (2)", - "tab": "General information", - "score": 400.62264150943395 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.472 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.506 (2)", - "tab": "Efficiency", - "score": 0.2531234142628122 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=305.494, mean=305.494, max=305.494, sum=610.987 (2)", - "tab": "General information", - "score": 305.4936170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.434 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.198, mean=0.198, max=0.198, sum=0.396 (2)", - "tab": "Efficiency", - "score": 0.19794883070320918 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=463.8, mean=463.8, max=463.8, sum=927.6 (2)", - "tab": "General information", - "score": 463.8 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.202, mean=0.202, max=0.202, sum=0.404 (2)", - "tab": "Efficiency", - "score": 0.2021035529949047 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=577.119, mean=577.119, max=577.119, sum=1154.238 (2)", - "tab": "General information", - "score": 577.1190476190476 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.587, - "details": { - "description": "min=0.587, mean=0.587, max=0.587, sum=1.175 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.197, mean=0.197, max=0.197, sum=0.393 (2)", - "tab": "Efficiency", - "score": 0.196545644411965 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt 
tokens": { - "description": "min=604.667, mean=604.667, max=604.667, sum=1209.333 (2)", - "tab": "General information", - "score": 604.6666666666666 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.755 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.192, mean=0.192, max=0.192, sum=0.384 (2)", - "tab": "Efficiency", - "score": 0.19177444058079873 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.236, mean=0.236, max=0.236, sum=0.472 (2)", - "tab": "Efficiency", - "score": 0.23597407693346145 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.202, mean=0.202, max=0.202, sum=0.404 (2)", - "tab": "Efficiency", - "score": 0.20180433988571167 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.626 (2)", - "tab": "Efficiency", - "score": 0.3130656791455818 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.215, mean=0.215, max=0.215, sum=0.43 (2)", - "tab": "Efficiency", - "score": 0.21512896725625702 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.192, mean=0.192, max=0.192, sum=0.384 (2)", - "tab": "Efficiency", - "score": 0.19191643611137113 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.409 (2)", - "tab": "Efficiency", - "score": 0.20429076965038592 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.468 (2)", - "tab": "Efficiency", - "score": 0.2337868098859434 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.184, mean=0.184, max=0.184, sum=0.367 (2)", - "tab": "Efficiency", - "score": 0.18365505863638484 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.194, mean=0.194, max=0.194, sum=0.388 (2)", - "tab": "Efficiency", - "score": 0.19382640068104726 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.203, mean=0.203, max=0.203, sum=0.405 (2)", - "tab": "Efficiency", - "score": 0.20258700432033713 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.226, mean=0.226, max=0.226, sum=0.451 (2)", - "tab": "Efficiency", - "score": 0.22551235446223505 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.498 (2)", - "tab": "Efficiency", - "score": 0.2492340417469249 - }, - "High School 
World History - Observed inference time (s)": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.462 (2)", - "tab": "Efficiency", - "score": 0.23088843812419393 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.916, mean=513.916, max=513.916, sum=1027.832 (2)", - "tab": "General information", - "score": 513.916129032258 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=517.261, mean=517.261, max=517.261, sum=1034.522 (2)", - "tab": "General information", - "score": 517.2610837438424 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=878.46, mean=878.46, max=878.46, sum=1756.92 (2)", - "tab": "General information", - "score": 878.46 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2814.903, mean=2814.903, max=2814.903, sum=5629.806 (2)", - "tab": "General information", - "score": 2814.9030303030304 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.217, mean=372.217, max=372.217, sum=744.434 (2)", - "tab": "General information", - "score": 372.2171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=467.311, mean=467.311, max=467.311, sum=934.622 (2)", - "tab": "General information", - "score": 467.31088082901556 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=374.349, mean=374.349, max=374.349, sum=748.697 (2)", - "tab": "General information", - "score": 374.34871794871793 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=565.326, mean=565.326, max=565.326, sum=1130.652 (2)", - "tab": "General information", - "score": 565.325925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - 
"description": "min=402.277, mean=402.277, max=402.277, sum=804.555 (2)", - "tab": "General information", - "score": 402.2773109243698 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=580.536, mean=580.536, max=580.536, sum=1161.073 (2)", - "tab": "General information", - "score": 580.5364238410596 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.521, mean=495.521, max=495.521, sum=991.042 (2)", - "tab": "General information", - "score": 495.52110091743117 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=830.477, mean=830.477, max=830.477, sum=1660.954 (2)", - "tab": "General information", - "score": 830.4768518518518 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2237.176, mean=2237.176, max=2237.176, sum=4474.353 (2)", - "tab": "General information", - "score": 2237.176470588235 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General 
information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1448.354, mean=1448.354, max=1448.354, sum=2896.709 (2)", - "tab": "General information", - "score": 1448.3544303797469 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.794, - "details": { - "description": "min=0.794, mean=0.794, max=0.794, sum=1.588 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.206, mean=0.206, max=0.206, sum=0.411 (2)", - "tab": "Efficiency", - "score": 0.20559344591046663 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.191, mean=0.191, max=0.191, sum=0.381 (2)", - "tab": "Efficiency", - "score": 0.19073554941716084 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=322.121, mean=322.121, max=322.121, sum=644.242 (2)", - "tab": "General information", - "score": 322.1210762331838 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.504, mean=341.504, max=341.504, sum=683.008 (2)", - "tab": "General information", - "score": 341.5038167938931 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": 
"International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.719 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.23, mean=0.23, max=0.23, sum=0.46 (2)", - "tab": "Efficiency", - "score": 0.22999596792804308 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=640.579, mean=640.579, max=640.579, sum=1281.157 (2)", - "tab": "General information", - "score": 640.5785123966942 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=1.546 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.201, mean=0.201, max=0.201, sum=0.401 (2)", - "tab": "Efficiency", - "score": 0.2005681289485627 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.632, mean=449.632, max=449.632, sum=899.264 (2)", - "tab": "General information", - "score": 449.6319018404908 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.107 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.232, mean=0.232, max=0.232, sum=0.463 (2)", - "tab": "Efficiency", - "score": 0.23156332118170603 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=681.848, mean=681.848, max=681.848, sum=1363.696 (2)", - "tab": "General information", - "score": 681.8482142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.689 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.197, mean=0.197, max=0.197, sum=0.394 (2)", - "tab": "Efficiency", - "score": 0.19694008410555644 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.854, mean=283.854, max=283.854, sum=567.709 (2)", - "tab": "General information", - "score": 283.8543689320388 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.919, - "details": { - "description": "min=0.919, mean=0.919, max=0.919, sum=1.838 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.184, mean=0.184, max=0.184, sum=0.368 (2)", - "tab": "Efficiency", - "score": 0.18401269525544256 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.415, mean=404.415, max=404.415, sum=808.829 (2)", - "tab": "General information", - "score": 404.4145299145299 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.176, mean=0.176, max=0.176, sum=0.351 (2)", - "tab": "Efficiency", - "score": 0.17553309679031373 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=342.35, mean=342.35, max=342.35, sum=684.7 (2)", - "tab": "General information", - "score": 342.35 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, 
max=0.852, sum=1.704 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.347 (2)", - "tab": "Efficiency", - "score": 0.17373346399377892 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=303.7, mean=303.7, max=303.7, sum=607.4 (2)", - "tab": "General information", - "score": 303.6998722860792 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511, - "details": { - "description": "min=0.511, mean=0.511, max=0.511, sum=1.021 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.168, mean=0.168, max=0.168, sum=0.337 (2)", - "tab": "Efficiency", - "score": 0.16836080041234894 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.342 (2)", - "tab": "Efficiency", - "score": 0.1708347949235799 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.182, mean=476.182, max=476.182, sum=952.364 (2)", - "tab": "General information", - "score": 476.1820809248555 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=668.494, mean=668.494, max=668.494, sum=1336.988 (2)", - "tab": "General information", - "score": 668.4938547486033 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.168, mean=0.168, max=0.168, sum=0.337 (2)", - "tab": "Efficiency", - "score": 0.16839487724054872 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=599.637, mean=599.637, max=599.637, sum=1199.275 (2)", - "tab": "General information", - "score": 599.6372549019608 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.673 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.168, mean=0.168, max=0.168, sum=0.337 (2)", - "tab": "Efficiency", - "score": 0.16826030795956837 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=528.364, mean=528.364, max=528.364, sum=1056.728 (2)", - "tab": "General information", - "score": 528.3641975308642 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - 
"evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.328 (2)", - "tab": "Efficiency", - "score": 0.1641989447853782 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=408.427, mean=408.427, max=408.427, sum=816.855 (2)", - "tab": "General information", - "score": 408.42727272727274 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.363 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.349 (2)", - "tab": "Efficiency", - "score": 0.1744946577111069 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1166.931, mean=1166.931, max=1166.931, sum=2333.861 (2)", - "tab": "General information", - "score": 1166.930612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.721 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.19, mean=0.19, max=0.19, sum=0.381 (2)", - "tab": "Efficiency", - "score": 0.1903395510431546 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=450.1, mean=450.1, max=450.1, sum=900.199 (2)", - "tab": "General information", - "score": 450.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.348 (2)", - "tab": "Efficiency", - "score": 0.1741443513387657 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.819, mean=343.819, max=343.819, sum=687.639 (2)", - "tab": "General information", - "score": 343.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.661 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.162, mean=0.162, max=0.162, sum=0.325 (2)", - "tab": "Efficiency", - "score": 0.16239780292176365 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=276.07, mean=276.07, max=276.07, sum=552.14 (2)", - "tab": "General information", - "score": 276.0701754385965 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.887, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json b/data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json deleted file mode 100644 index 0afa77758..000000000 --- a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arctic Instruct", - "id": "snowflake/snowflake-arctic-instruct", - "developer": "snowflake", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "description": "min=0.28, mean=0.677, max=0.912, sum=77.129 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.35, mean=0.42, max=0.544, sum=47.89 (114)", - 
"tab": "Efficiency", - "score": 0.4200856614493726 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=304.474, mean=706.682, max=3159.636, sum=80561.749 (114)", - "tab": "General information", - "score": 706.6820126388612 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - 
"mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35, - "details": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.377, mean=0.377, max=0.377, sum=0.753 (2)", - "tab": "Efficiency", - "score": 0.37665764808654784 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.65, mean=397.65, max=397.65, sum=795.3 (2)", - "tab": "General information", - "score": 397.65 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.652, mean=0.652, max=0.652, sum=1.304 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.731 (2)", - "tab": "Efficiency", - "score": 0.3654881194785789 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=409.133, mean=409.133, max=409.133, sum=818.267 (2)", - "tab": "General information", - "score": 409.1333333333333 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461, - "details": { - "description": "min=0.461, mean=0.461, max=0.461, sum=0.922 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.701 (2)", - "tab": "Efficiency", - "score": 0.3502761268615723 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Efficiency", - "score": 0.421069688267178 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.853 (2)", - "tab": "Efficiency", - "score": 0.4266632032394409 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.858 (2)", - "tab": "Efficiency", - "score": 0.42887043952941895 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.4343285574389331 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Efficiency", - "score": 0.4209739086674709 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=622.43, mean=622.43, max=622.43, sum=1244.86 (2)", - "tab": "General information", - "score": 622.43 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=553.632, mean=553.632, max=553.632, sum=1107.264 (2)", - "tab": "General information", - "score": 553.6319444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=901.14, mean=901.14, max=901.14, sum=1802.28 (2)", - "tab": "General information", - "score": 901.14 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=646.96, mean=646.96, max=646.96, sum=1293.92 (2)", - "tab": "General information", - "score": 646.96 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=608.671, mean=608.671, max=608.671, sum=1217.341 (2)", - "tab": "General information", - "score": 608.6705202312139 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=551.873, mean=551.873, max=551.873, sum=1103.745 (2)", - "tab": "General information", - "score": 551.8725490196078 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, 
mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.825 (2)", - "tab": "Efficiency", - "score": 0.41247488737106325 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=428.17, mean=428.17, max=428.17, sum=856.34 (2)", - "tab": "General information", - "score": 428.17 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.873 (2)", - "tab": "Efficiency", - "score": 0.436487873395284 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=684.675, mean=684.675, max=684.675, sum=1369.351 (2)", - "tab": "General information", - "score": 684.6754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.839 (2)", - "tab": "Efficiency", - 
"score": 0.41951879262924197 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=484.54, mean=484.54, max=484.54, sum=969.08 (2)", - "tab": "General information", - "score": 484.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.481 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.843 (2)", - "tab": "Efficiency", - "score": 0.421647725281892 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=449.898, mean=449.898, max=449.898, sum=899.796 (2)", - "tab": "General information", - "score": 449.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.505 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.837 (2)", - "tab": "Efficiency", - "score": 0.418486426497579 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=372.122, mean=372.122, max=372.122, sum=744.244 (2)", - "tab": "General information", - "score": 372.12218649517683 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=1.448 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.89 (2)", - "tab": "Efficiency", - "score": 0.4448305149288738 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.44340477683019974 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.531, mean=0.531, max=0.531, sum=1.062 (2)", - "tab": "Efficiency", - "score": 0.531202322345669 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.847 (2)", - "tab": "Efficiency", - "score": 0.42342418120577446 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1330.647, mean=1330.647, max=1330.647, sum=2661.294 (2)", - "tab": "General information", - "score": 1330.6470588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=823.277, mean=823.277, max=823.277, sum=1646.553 (2)", - "tab": "General information", - "score": 823.2765957446809 - }, - "Professional Accounting - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1915.007, mean=1915.007, max=1915.007, sum=3830.014 (2)", - "tab": "General information", - "score": 1915.0071707953064 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=650.078, mean=650.078, max=650.078, sum=1300.157 (2)", - "tab": "General information", - "score": 650.0784313725491 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.848 (2)", - "tab": "Efficiency", - "score": 0.42398189067840575 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=479.81, mean=479.81, max=479.81, sum=959.62 (2)", - "tab": "General information", - "score": 479.81 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763, - "details": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.526 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.848 (2)", - "tab": "Efficiency", - "score": 0.42381788398090164 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=681.079, mean=681.079, max=681.079, sum=1362.158 (2)", - "tab": "General information", - "score": 681.078947368421 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.863 (2)", - "tab": "Efficiency", - "score": 0.4315712761878967 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=674.44, mean=674.44, max=674.44, sum=1348.88 (2)", - "tab": "General information", - "score": 674.44 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.781, - "details": { - "description": "min=0.781, mean=0.781, max=0.781, sum=1.562 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.4204666920428006 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.374, mean=487.374, max=487.374, sum=974.747 (2)", - "tab": "General information", - "score": 487.3735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.634, mean=0.634, max=0.634, sum=1.268 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.824 (2)", - "tab": "Efficiency", - "score": 0.4118805824442113 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=333.153, mean=333.153, max=333.153, sum=666.306 (2)", - "tab": "General information", - "score": 333.1531914893617 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - "description": "min=0.662, mean=0.662, max=0.662, sum=1.324 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.856 (2)", - "tab": "Efficiency", - "score": 0.42821227435407966 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=497.779, mean=497.779, max=497.779, sum=995.559 (2)", - "tab": "General information", - "score": 497.7793103448276 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.481, - "details": { - "description": "min=0.481, mean=0.481, max=0.481, sum=0.963 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.853 (2)", - "tab": "Efficiency", - "score": 0.4265344634888664 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=609.156, mean=609.156, max=609.156, sum=1218.312 (2)", - "tab": "General information", - "score": 609.1560846560847 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.821 (2)", - "tab": "Efficiency", - "score": 0.4107102117841206 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=691.81, mean=691.81, max=691.81, sum=1383.619 (2)", - "tab": "General information", - "score": 691.8095238095239 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827, - "details": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.654 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.847 (2)", - "tab": "Efficiency", - "score": 0.42357982127897204 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.825 (2)", - "tab": "Efficiency", - "score": 0.41242665375394777 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.89 (2)", - "tab": "Efficiency", - "score": 0.44495458364486695 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.544, mean=0.544, max=0.544, sum=1.088 (2)", - "tab": "Efficiency", - "score": 0.5441486705433238 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.4149725003675981 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.766 (2)", - "tab": "Efficiency", - "score": 0.38312110629106433 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.807 (2)", - "tab": "Efficiency", - "score": 0.4034240123553154 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.39, 
mean=0.39, max=0.39, sum=0.779 (2)", - "tab": "Efficiency", - "score": 0.38954139285617406 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.399, mean=0.399, max=0.399, sum=0.798 (2)", - "tab": "Efficiency", - "score": 0.3992174813727371 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.819 (2)", - "tab": "Efficiency", - "score": 0.40926165138648835 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.4081065694126514 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.833 (2)", - "tab": "Efficiency", - "score": 0.4166152830477114 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.4504043985815609 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.833 (2)", - "tab": "Efficiency", - "score": 0.4162542166086189 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=596.894, mean=596.894, max=596.894, sum=1193.787 (2)", - "tab": "General information", - "score": 596.8935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=568.665, mean=568.665, max=568.665, sum=1137.33 (2)", - "tab": "General information", - "score": 568.6650246305419 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.57, mean=988.57, max=988.57, sum=1977.14 (2)", - "tab": "General information", - "score": 988.57 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3159.636, mean=3159.636, max=3159.636, sum=6319.273 (2)", - "tab": "General information", - "score": 3159.6363636363635 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=436.657, mean=436.657, max=436.657, sum=873.313 (2)", - "tab": "General information", - "score": 436.65656565656565 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=527.927, mean=527.927, max=527.927, sum=1055.855 (2)", - "tab": "General information", - "score": 527.9274611398964 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=445.662, mean=445.662, max=445.662, sum=891.323 (2)", - "tab": "General information", - "score": 445.66153846153844 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=579.181, mean=579.181, max=579.181, sum=1158.363 (2)", - "tab": "General information", - "score": 579.1814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=449.492, mean=449.492, max=449.492, sum=898.983 (2)", - "tab": "General information", - "score": 449.49159663865544 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=621.788, mean=621.788, max=621.788, sum=1243.576 (2)", - "tab": "General information", - "score": 621.7880794701987 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=585.919, mean=585.919, max=585.919, sum=1171.839 (2)", - "tab": "General information", - "score": 585.9192660550459 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=908.208, mean=908.208, max=908.208, sum=1816.417 (2)", - "tab": "General information", - "score": 908.2083333333334 - }, - "High School Statistics 
- # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2535.324, mean=2535.324, max=2535.324, sum=5070.647 (2)", - "tab": "General information", - "score": 2535.323529411765 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1638.219, mean=1638.219, max=1638.219, sum=3276.439 (2)", - "tab": "General information", - "score": 1638.2194092827003 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.802 (2)", - "tab": "Efficiency", - "score": 0.4010318255745242 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", - "tab": "Efficiency", - "score": 0.39331119843111695 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=361.26, mean=361.26, max=361.26, sum=722.52 (2)", - "tab": "General information", - "score": 361.26008968609864 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=403.382, mean=403.382, max=403.382, sum=806.763 (2)", - "tab": "General information", - "score": 403.381679389313 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.42040472779392213 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=729.463, mean=729.463, max=729.463, sum=1458.926 (2)", - "tab": "General information", - "score": 729.4628099173553 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.404, mean=0.404, max=0.404, sum=0.809 (2)", - "tab": "Efficiency", - "score": 0.4043445353127696 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, 
max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=502.755, mean=502.755, max=502.755, sum=1005.509 (2)", - "tab": "General information", - "score": 502.7546012269939 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473, - "details": { - "description": "min=0.473, mean=0.473, max=0.473, sum=0.946 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Efficiency", - "score": 0.42122456644262585 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=730.402, mean=730.402, max=730.402, sum=1460.804 (2)", - "tab": "General information", - "score": 730.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.785 (2)", - "tab": "Efficiency", - "score": 0.392485206566968 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, 
max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.777, mean=315.777, max=315.777, sum=631.553 (2)", - "tab": "General information", - "score": 315.77669902912623 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.813 (2)", - "tab": "Efficiency", - "score": 0.406507401384859 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=472.628, mean=472.628, max=472.628, sum=945.256 (2)", - "tab": "General information", - "score": 472.62820512820514 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.835 (2)", - "tab": "Efficiency", - "score": 0.41734427213668823 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical 
Genetics - # prompt tokens": { - "description": "min=408.14, mean=408.14, max=408.14, sum=816.28 (2)", - "tab": "General information", - "score": 408.14 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "details": { - "description": "min=0.875, mean=0.875, max=0.875, sum=1.75 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.814 (2)", - "tab": "Efficiency", - "score": 0.40693108880200146 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=345.913, mean=345.913, max=345.913, sum=691.826 (2)", - "tab": "General information", - "score": 345.9131545338442 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.28, - "details": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.561 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.848 (2)", - "tab": "Efficiency", - "score": 0.4239204674097844 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.866 (2)", - "tab": "Efficiency", - "score": 0.43297034721800737 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral 
Disputes - # prompt tokens": { - "description": "min=542.506, mean=542.506, max=542.506, sum=1085.012 (2)", - "tab": "General information", - "score": 542.5057803468208 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=756.479, mean=756.479, max=756.479, sum=1512.959 (2)", - "tab": "General information", - "score": 756.4793296089385 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "details": { - "description": "min=0.725, mean=0.725, max=0.725, sum=1.451 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.835 (2)", - "tab": "Efficiency", - "score": 0.41727598430284485 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=695.922, mean=695.922, max=695.922, sum=1391.843 (2)", - "tab": "General information", - "score": 695.9215686274509 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.43, 
mean=0.43, max=0.43, sum=0.861 (2)", - "tab": "Efficiency", - "score": 0.4303552037403907 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=619.185, mean=619.185, max=619.185, sum=1238.37 (2)", - "tab": "General information", - "score": 619.1851851851852 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664, - "details": { - "description": "min=0.664, mean=0.664, max=0.664, sum=1.327 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.855 (2)", - "tab": "Efficiency", - "score": 0.42750670259649104 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=474.827, mean=474.827, max=474.827, sum=949.655 (2)", - "tab": "General information", - "score": 474.8272727272727 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.559 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.933 (2)", - "tab": "Efficiency", - "score": 0.4662662194699657 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, 
max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1377.531, mean=1377.531, max=1377.531, sum=2755.061 (2)", - "tab": "General information", - "score": 1377.530612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.891, - "details": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.832 (2)", - "tab": "Efficiency", - "score": 0.4159522590352528 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=508.478, mean=508.478, max=508.478, sum=1016.955 (2)", - "tab": "General information", - "score": 508.4776119402985 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536, - "details": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.809 (2)", - "tab": "Efficiency", - "score": 0.40467354332108096 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=405.108, mean=405.108, max=405.108, sum=810.217 (2)", - "tab": "General information", - "score": 405.10843373493975 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.708 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", - "tab": "Efficiency", - "score": 0.39336834455791275 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=304.474, mean=304.474, max=304.474, sum=608.947 (2)", - "tab": "General information", - "score": 304.4736842105263 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json b/data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json deleted file mode 100644 index 2c0cfc48a..000000000 --- a/data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/upstage_solar-pro-241126/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": 
"documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Solar Pro", - "id": "upstage/solar-pro-241126", - "developer": "upstage", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.776, - "details": { - "description": "min=0.44, mean=0.776, max=0.97, sum=88.521 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.415, mean=0.5, max=1.447, sum=56.972 (114)", - "tab": "Efficiency", - "score": 0.4997569605932576 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=313.474, mean=715.682, max=3168.636, sum=81587.749 (114)", - "tab": "General information", - "score": 715.6820126388612 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - 
"mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.941 (2)", - "tab": "Efficiency", - "score": 0.47064422845840453 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=406.65, mean=406.65, max=406.65, sum=813.3 (2)", - "tab": "General information", - "score": 406.65 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719, - "details": { - 
"description": "min=0.719, mean=0.719, max=0.719, sum=1.437 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.526, mean=0.526, max=0.526, sum=1.052 (2)", - "tab": "Efficiency", - "score": 0.5261570206394902 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=418.133, mean=418.133, max=418.133, sum=836.267 (2)", - "tab": "General information", - "score": 418.1333333333333 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, mean=0.559, max=0.559, sum=1.118 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.538, mean=0.538, max=0.538, sum=1.077 (2)", - "tab": "Efficiency", - "score": 0.5384537291526794 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.886 (2)", - "tab": "Efficiency", - "score": 0.44289560781584847 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.4359678840637207 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.865 (2)", - "tab": "Efficiency", - "score": 0.4324680757522583 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.4347288250234086 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.863 (2)", - "tab": "Efficiency", - "score": 0.43169068121442605 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=631.43, mean=631.43, max=631.43, sum=1262.86 (2)", - "tab": "General information", - "score": 631.43 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=562.632, mean=562.632, max=562.632, sum=1125.264 (2)", - "tab": "General information", - "score": 562.6319444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=910.14, mean=910.14, max=910.14, sum=1820.28 (2)", - "tab": "General information", - "score": 910.14 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=655.96, mean=655.96, max=655.96, sum=1311.92 (2)", - "tab": "General information", - "score": 655.96 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=617.671, mean=617.671, max=617.671, sum=1235.341 (2)", - "tab": "General information", - "score": 617.6705202312139 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": 
"min=560.873, mean=560.873, max=560.873, sum=1121.745 (2)", - "tab": "General information", - "score": 560.8725490196078 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.765, mean=0.765, max=0.765, sum=1.53 (2)", - "tab": "Efficiency", - "score": 0.7652230095863343 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=437.17, mean=437.17, max=437.17, sum=874.34 (2)", - "tab": "General information", - "score": 437.17 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605, - "details": { - "description": "min=0.605, mean=0.605, max=0.605, sum=1.211 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.858 (2)", - "tab": "Efficiency", - "score": 0.4288227077116046 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=693.675, mean=693.675, max=693.675, sum=1387.351 (2)", - "tab": "General information", - "score": 693.6754385964912 - }, - "Econometrics - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.859 (2)", - "tab": "Efficiency", - "score": 0.4296323895454407 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=493.54, mean=493.54, max=493.54, sum=987.08 (2)", - "tab": "General information", - "score": 493.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.4206738162923742 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=458.898, mean=458.898, max=458.898, sum=917.796 (2)", - "tab": "General information", - "score": 458.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", 
- "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "description": "min=0.817, mean=0.817, max=0.817, sum=1.633 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.871 (2)", - "tab": "Efficiency", - "score": 0.43559602372516004 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=381.122, mean=381.122, max=381.122, sum=762.244 (2)", - "tab": "General information", - "score": 381.12218649517683 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.699 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.994 (2)", - "tab": "Efficiency", - "score": 0.4968351388678831 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.917 (2)", - "tab": "Efficiency", - "score": 0.4586718564337872 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.016, mean=1.016, max=1.016, sum=2.033 (2)", - "tab": "Efficiency", - "score": 1.016288014092377 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.885 (2)", - "tab": "Efficiency", - "score": 0.4426119109384375 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": 
"min=1339.647, mean=1339.647, max=1339.647, sum=2679.294 (2)", - "tab": "General information", - "score": 1339.6470588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=832.277, mean=832.277, max=832.277, sum=1664.553 (2)", - "tab": "General information", - "score": 832.2765957446809 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1924.007, mean=1924.007, max=1924.007, sum=3848.014 (2)", - "tab": "General information", - "score": 1924.0071707953064 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=659.078, mean=659.078, max=659.078, sum=1318.157 (2)", - "tab": "General information", - "score": 659.0784313725491 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.97, - "details": { - "description": "min=0.97, mean=0.97, max=0.97, sum=1.94 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.882 (2)", - "tab": 
"Efficiency", - "score": 0.44084484577178956 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=488.81, mean=488.81, max=488.81, sum=977.62 (2)", - "tab": "General information", - "score": 488.81 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.737 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.892 (2)", - "tab": "Efficiency", - "score": 0.4461362079570168 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=690.079, mean=690.079, max=690.079, sum=1380.158 (2)", - "tab": "General information", - "score": 690.078947368421 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.4362391257286072 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - 
# train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=683.44, mean=683.44, max=683.44, sum=1366.88 (2)", - "tab": "General information", - "score": 683.44 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.615 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.855 (2)", - "tab": "Efficiency", - "score": 0.42739290561316146 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=496.374, mean=496.374, max=496.374, sum=992.747 (2)", - "tab": "General information", - "score": 496.3735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.651 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.904 (2)", - "tab": "Efficiency", - "score": 0.4520118307560048 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - 
}, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=342.153, mean=342.153, max=342.153, sum=684.306 (2)", - "tab": "General information", - "score": 342.1531914893617 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.697, - "details": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.393 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.974 (2)", - "tab": "Efficiency", - "score": 0.4870024582435345 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=506.779, mean=506.779, max=506.779, sum=1013.559 (2)", - "tab": "General information", - "score": 506.7793103448276 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "description": "min=0.611, mean=0.611, max=0.611, sum=1.222 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.915 (2)", - "tab": "Efficiency", - "score": 0.4574742739793485 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary 
Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=618.156, mean=618.156, max=618.156, sum=1236.312 (2)", - "tab": "General information", - "score": 618.1560846560847 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.889 (2)", - "tab": "Efficiency", - "score": 0.44462628780849395 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=700.81, mean=700.81, max=700.81, sum=1401.619 (2)", - "tab": "General information", - "score": 700.8095238095239 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=1.814 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.879 (2)", - "tab": "Efficiency", - "score": 0.4396143251849759 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.865 (2)", - "tab": "Efficiency", - "score": 0.4325766810055437 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.4435269355773926 - }, - "High School European History - 
Observed inference time (s)": { - "description": "min=1.447, mean=1.447, max=1.447, sum=2.894 (2)", - "tab": "Efficiency", - "score": 1.44696401682767 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.854 (2)", - "tab": "Efficiency", - "score": 0.4269573845044531 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.885 (2)", - "tab": "Efficiency", - "score": 0.4422582035855308 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.4230540263347137 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.877 (2)", - "tab": "Efficiency", - "score": 0.4383223215738932 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.4434382264353648 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.896 (2)", - "tab": "Efficiency", - "score": 0.4479467000392889 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.876 (2)", - "tab": "Efficiency", - "score": 0.43786543006197026 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.454, mean=0.454, max=0.454, sum=0.907 (2)", - "tab": "Efficiency", - "score": 0.45358082431334035 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.14, mean=1.14, max=1.14, sum=2.28 (2)", - "tab": "Efficiency", - "score": 1.13988286373662 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.666 (2)", - "tab": "Efficiency", - "score": 0.8329467803617067 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=605.894, mean=605.894, max=605.894, sum=1211.787 (2)", - "tab": "General information", - "score": 605.8935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=577.665, mean=577.665, max=577.665, sum=1155.33 (2)", - "tab": "General information", - "score": 577.6650246305419 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=997.57, mean=997.57, max=997.57, sum=1995.14 (2)", - "tab": "General information", - "score": 997.57 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3168.636, mean=3168.636, max=3168.636, sum=6337.273 (2)", - "tab": "General information", - "score": 3168.6363636363635 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=445.657, mean=445.657, max=445.657, sum=891.313 (2)", - "tab": "General information", - "score": 445.65656565656565 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=536.927, mean=536.927, max=536.927, sum=1073.855 (2)", - "tab": "General information", - "score": 536.9274611398964 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=454.662, mean=454.662, max=454.662, sum=909.323 (2)", - "tab": "General information", - "score": 454.66153846153844 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=588.181, mean=588.181, max=588.181, sum=1176.363 (2)", - "tab": "General information", - "score": 588.1814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=458.492, mean=458.492, max=458.492, sum=916.983 (2)", - "tab": "General information", - "score": 458.49159663865544 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=630.788, mean=630.788, max=630.788, sum=1261.576 (2)", - "tab": "General information", - "score": 630.7880794701987 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=594.919, mean=594.919, max=594.919, sum=1189.839 (2)", - "tab": "General information", - "score": 594.9192660550459 - }, - 
"High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=917.208, mean=917.208, max=917.208, sum=1834.417 (2)", - "tab": "General information", - "score": 917.2083333333334 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2544.324, mean=2544.324, max=2544.324, sum=5088.647 (2)", - "tab": "General information", - "score": 2544.323529411765 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1647.219, mean=1647.219, max=1647.219, sum=3294.439 (2)", - "tab": "General information", - "score": 1647.2194092827003 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.873 (2)", - "tab": "Efficiency", - "score": 0.43635595539760164 - }, - "Human Sexuality - Observed inference time (s)": 
{ - "description": "min=0.434, mean=0.434, max=0.434, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.4343654235810724 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=370.26, mean=370.26, max=370.26, sum=740.52 (2)", - "tab": "General information", - "score": 370.26008968609864 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=412.382, mean=412.382, max=412.382, sum=824.763 (2)", - "tab": "General information", - "score": 412.381679389313 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.461, mean=0.461, max=0.461, sum=0.922 (2)", - "tab": "Efficiency", - "score": 0.46112686346385107 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=738.463, mean=738.463, max=738.463, sum=1476.926 (2)", - "tab": "General information", - "score": 738.4628099173553 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - 
"evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.9 (2)", - "tab": "Efficiency", - "score": 0.44979269080366824 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=511.755, mean=511.755, max=511.755, sum=1023.509 (2)", - "tab": "General information", - "score": 511.7546012269939 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - "details": { - "description": "min=0.616, mean=0.616, max=0.616, sum=1.232 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.932 (2)", - "tab": "Efficiency", - "score": 0.46596066866602215 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=739.402, mean=739.402, max=739.402, sum=1478.804 (2)", - "tab": "General information", - "score": 739.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ 
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.43890966720951413 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=324.777, mean=324.777, max=324.777, sum=649.553 (2)", - "tab": "General information", - "score": 324.77669902912623 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.953, - "details": { - "description": "min=0.953, mean=0.953, max=0.953, sum=1.906 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.885 (2)", - "tab": "Efficiency", - "score": 0.4425381727707692 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=481.628, mean=481.628, max=481.628, sum=963.256 (2)", - "tab": "General information", - "score": 481.62820512820514 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.43624018907546996 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=417.14, mean=417.14, max=417.14, sum=834.28 (2)", - "tab": "General information", - "score": 417.14 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.888, - "details": { - "description": "min=0.888, mean=0.888, max=0.888, sum=1.775 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Efficiency", - "score": 0.4337884417591119 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=354.913, mean=354.913, max=354.913, sum=709.826 (2)", - "tab": "General information", - "score": 354.9131545338442 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=1.622 
(2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.454, mean=0.454, max=0.454, sum=0.908 (2)", - "tab": "Efficiency", - "score": 0.4541343209371401 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.905 (2)", - "tab": "Efficiency", - "score": 0.4522555020934377 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=551.506, mean=551.506, max=551.506, sum=1103.012 (2)", - "tab": "General information", - "score": 551.5057803468208 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=765.479, mean=765.479, max=765.479, sum=1530.959 (2)", - "tab": "General information", - "score": 765.4793296089385 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.719 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.469, mean=0.469, max=0.469, sum=0.937 (2)", - "tab": "Efficiency", - "score": 0.46850453872306674 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=704.922, mean=704.922, max=704.922, sum=1409.843 (2)", - "tab": "General information", - "score": 704.9215686274509 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.867, - "details": { - "description": "min=0.867, mean=0.867, max=0.867, sum=1.735 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.919 (2)", - "tab": "Efficiency", - "score": 0.45942840973536175 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=628.185, mean=628.185, max=628.185, sum=1256.37 (2)", - "tab": "General information", - "score": 628.1851851851852 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.764, - "details": { - "description": "min=0.764, mean=0.764, max=0.764, sum=1.527 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.848 (2)", - "tab": "Efficiency", - "score": 0.4240685766393488 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=483.827, mean=483.827, max=483.827, sum=967.655 (2)", - "tab": "General information", - "score": 483.8272727272727 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - 
"evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.641 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.001 (2)", - "tab": "Efficiency", - "score": 0.500300864784085 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1386.531, mean=1386.531, max=1386.531, sum=2773.061 (2)", - "tab": "General information", - "score": 1386.530612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.771 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.879 (2)", - "tab": "Efficiency", - "score": 0.4395348717324176 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=517.478, mean=517.478, max=517.478, sum=1034.955 (2)", - "tab": "General information", - "score": 517.4776119402985 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - "details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.852 (2)", - "tab": "Efficiency", - "score": 0.4260225296020508 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=414.108, mean=414.108, max=414.108, sum=828.217 (2)", - "tab": "General information", - "score": 414.10843373493975 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.766 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.41479549212762484 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=313.474, mean=313.474, max=313.474, sum=626.947 (2)", - "tab": "General information", - "score": 313.4736842105263 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model 
outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json b/data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json deleted file mode 100644 index c204b253d..000000000 --- a/data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/writer_palmyra-x-004/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra-X-004", - "id": "writer/palmyra-x-004", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "description": "min=0.52, mean=0.813, max=0.959, sum=92.659 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.298, mean=0.535, max=2.946, sum=60.962 (114)", - "tab": "Efficiency", - "score": 0.5347547453538 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)", - "tab": "General information", - "score": 614.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0.968, mean=0.991, max=1, sum=112.995 (114)", - "tab": "General information", - "score": 0.9911842955118555 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - 
"high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.722, mean=0.722, max=0.722, sum=1.444 (2)", - "tab": "Efficiency", - "score": 0.7220739269256592 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.43, 
mean=373.43, max=373.43, sum=746.86 (2)", - "tab": "General information", - "score": 373.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.646 (2)", - "tab": "Efficiency", - "score": 0.3229873922136095 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.874, mean=353.874, max=353.874, sum=707.748 (2)", - "tab": "General information", - "score": 353.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.985 (2)", - "tab": "General information", - "score": 0.9925925925925926 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.647, - "details": { - "description": "min=0.647, mean=0.647, max=0.647, sum=1.294 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.632 (2)", - "tab": "Efficiency", - "score": 0.316190505027771 - }, - "College Biology - Observed inference time (s)": { - "description": "min=2.087, mean=2.087, max=2.087, sum=4.175 (2)", - "tab": "Efficiency", - "score": 2.0873730795250998 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.575, mean=1.575, max=1.575, sum=3.15 (2)", - "tab": "Efficiency", - "score": 1.574983057975769 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=1.58, mean=1.58, max=1.58, sum=3.16 (2)", - "tab": "Efficiency", - "score": 1.5799101972579956 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=1.786, mean=1.786, max=1.786, sum=3.572 (2)", - "tab": "Efficiency", - "score": 
1.786004883705536 - }, - "College Physics - Observed inference time (s)": { - "description": "min=1.112, mean=1.112, max=1.112, sum=2.225 (2)", - "tab": "Efficiency", - "score": 1.1123062372207642 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)", - "tab": "General information", - "score": 549.28 - }, - "College Chemistry - # output tokens": { - "description": "min=0.97, mean=0.97, max=0.97, sum=1.94 (2)", - "tab": "General information", - "score": 0.97 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.875, mean=473.875, max=473.875, sum=947.75 (2)", - "tab": "General information", - "score": 473.875 - }, - "College Biology - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.986 (2)", - "tab": "General information", - "score": 0.9930555555555556 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)", - "tab": "General information", - "score": 828.29 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)", - "tab": "General information", - "score": 594.51 - }, - "College Mathematics - # output tokens": { - "description": "min=0.98, mean=0.98, max=0.98, sum=1.96 (2)", - "tab": "General information", - "score": 0.98 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)", - "tab": "General information", - "score": 502.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.988 (2)", - "tab": "General information", - "score": 0.9942196531791907 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)", - "tab": "General information", - "score": 503.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.3091639161109924 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.51, mean=378.51, max=378.51, sum=757.02 (2)", - "tab": "General information", - "score": 378.51 - }, - "Computer Security - # output tokens": { - "description": "min=0.99, mean=0.99, max=0.99, sum=1.98 (2)", - "tab": "General information", - "score": 0.99 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.644 (2)", - "tab": "Efficiency", - "score": 0.32210456070147064 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)", - "tab": "General information", - "score": 614.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=0.991, mean=0.991, max=0.991, sum=1.982 (2)", - "tab": "General information", - "score": 0.9912280701754386 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62, - "details": { - "description": "min=0.62, mean=0.62, max=0.62, sum=1.24 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.31063568592071533 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=0.98, mean=0.98, max=0.98, sum=1.96 (2)", - "tab": "General information", - "score": 0.98 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, 
sum=1.685 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.597 (2)", - "tab": "Efficiency", - "score": 0.29833372433980304 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.63, mean=394.63, max=394.63, sum=789.259 (2)", - "tab": "General information", - "score": 394.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=0.991, mean=0.991, max=0.991, sum=1.981 (2)", - "tab": "General information", - "score": 0.9907407407407407 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.659 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": "Efficiency", - "score": 0.30590631187537093 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.987 (2)", - "tab": "General information", - "score": 0.9935691318327974 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.69 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, 
max=0.42, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.42044701295740466 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.704 (2)", - "tab": "Efficiency", - "score": 0.35206349944391996 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=2.946, mean=2.946, max=2.946, sum=5.892 (2)", - "tab": "Efficiency", - "score": 2.9459040923410784 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.34150391076904496 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)", - "tab": "General information", - "score": 1094.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=0.989, mean=0.989, max=0.989, sum=1.978 (2)", - "tab": "General information", - "score": 0.9889705882352942 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)", - "tab": "General information", - "score": 658.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=0.968, mean=0.968, max=0.968, sum=1.936 (2)", - "tab": "General information", - "score": 0.9680851063829787 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)", - "tab": "General information", - "score": 1637.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=0.995, mean=0.995, max=0.995, sum=1.99 (2)", - "tab": "General information", - "score": 0.9947848761408083 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # 
prompt tokens": { - "description": "min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)", - "tab": "General information", - "score": 575.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.987 (2)", - "tab": "General information", - "score": 0.9934640522875817 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.624 (2)", - "tab": "Efficiency", - "score": 0.31222330808639526 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.855 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.653 (2)", - "tab": "Efficiency", - "score": 0.3264871161235006 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)", - "tab": "General information", - 
"score": 579.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.987 (2)", - "tab": "General information", - "score": 0.993421052631579 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.3212712168693542 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=0.98, mean=0.98, max=0.98, sum=1.96 (2)", - "tab": "General information", - "score": 0.98 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.879, - "details": { - "description": "min=0.879, mean=0.879, max=0.879, sum=1.758 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.953 (2)", - "tab": "Efficiency", - "score": 0.4765495894090185 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.928, mean=397.928, max=397.928, sum=795.857 (2)", - "tab": "General information", - "score": 397.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0.992, mean=0.992, 
max=0.992, sum=1.985 (2)", - "tab": "General information", - "score": 0.9924528301886792 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.77 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.3465714748869551 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.834, mean=304.834, max=304.834, sum=609.668 (2)", - "tab": "General information", - "score": 304.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0.996, mean=0.996, max=0.996, sum=1.991 (2)", - "tab": "General information", - "score": 0.9957446808510638 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.586 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.611 (2)", - "tab": "Efficiency", - "score": 0.3054168865598481 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=435.607, mean=435.607, max=435.607, sum=871.214 (2)", - "tab": "General information", - "score": 435.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0.993, 
mean=0.993, max=0.993, sum=1.986 (2)", - "tab": "General information", - "score": 0.993103448275862 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - "details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.683 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.627 (2)", - "tab": "Efficiency", - "score": 0.31325215069705215 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)", - "tab": "General information", - "score": 531.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0.995, mean=0.995, max=0.995, sum=1.989 (2)", - "tab": "General information", - "score": 0.9947089947089947 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.035, mean=1.035, max=1.035, sum=2.07 (2)", - "tab": "Efficiency", - "score": 1.034958042795696 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)", - "tab": "General information", - "score": 601.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.823 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.123 (2)", - "tab": "Efficiency", - "score": 0.561508382520368 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.698 (2)", - "tab": "Efficiency", - "score": 0.34899539900530735 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.845 (2)", - "tab": "Efficiency", - "score": 0.4227438974380493 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.799 (2)", - "tab": "Efficiency", - "score": 0.8994465018763687 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.3236422189558395 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.627 (2)", - "tab": "Efficiency", - "score": 0.31354672550537427 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.628 (2)", - "tab": "Efficiency", - "score": 0.31394460568061244 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.3151667806837294 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.3151869453301951 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.639 (2)", - "tab": "Efficiency", - "score": 0.31971652302520953 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.3149662079067405 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.3859624167283376 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.303 (2)", - "tab": "Efficiency", - "score": 0.6513510615217919 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.945 (2)", - "tab": "Efficiency", - "score": 0.4723552480528626 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, 
sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)", - "tab": "General information", - "score": 513.6709677419354 - }, - "High School Biology - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.987 (2)", - "tab": "General information", - "score": 0.9935483870967742 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.704, mean=496.704, max=496.704, sum=993.409 (2)", - "tab": "General information", - "score": 496.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=0.985, mean=0.985, max=0.985, sum=1.97 (2)", - "tab": "General information", - "score": 0.9852216748768473 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)", - "tab": "General information", - "score": 2797.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.035, mean=372.035, max=372.035, sum=744.071 (2)", - "tab": "General information", - "score": 372.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=0.99, mean=0.99, max=0.99, sum=1.98 (2)", - "tab": "General information", - "score": 0.98989898989899 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0.979, mean=0.979, max=0.979, sum=1.959 (2)", - "tab": "General information", - "score": 0.9792746113989638 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.908, mean=370.908, max=370.908, sum=741.815 (2)", - "tab": "General information", - "score": 370.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0.992, mean=0.992, max=0.992, sum=1.985 (2)", - "tab": "General information", - "score": 0.9923076923076923 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)", - "tab": "General information", - "score": 532.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.985 (2)", - "tab": "General information", - "score": 0.9925925925925926 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.013, mean=399.013, max=399.013, sum=798.025 (2)", - "tab": "General 
information", - "score": 399.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0.987, mean=0.987, max=0.987, sum=1.975 (2)", - "tab": "General information", - "score": 0.9873949579831933 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)", - "tab": "General information", - "score": 560.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=0.974, mean=0.974, max=0.974, sum=1.947 (2)", - "tab": "General information", - "score": 0.9735099337748344 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.242, mean=495.242, max=495.242, sum=990.484 (2)", - "tab": "General information", - "score": 495.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=0.996, mean=0.996, max=0.996, sum=1.993 (2)", - "tab": "General information", - "score": 0.9963302752293578 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)", - "tab": "General information", - "score": 795.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=0.977, mean=0.977, max=0.977, sum=1.954 (2)", - "tab": "General information", - "score": 0.9768518518518519 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=0.99, mean=0.99, max=0.99, sum=1.98 (2)", - "tab": "General information", - "score": 0.9901960784313726 - }, - "High School World History - # eval": { - "description": 
"min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)", - "tab": "General information", - "score": 1428.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=0.996, mean=0.996, max=0.996, sum=1.992 (2)", - "tab": "General information", - "score": 0.9957805907172996 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.847 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.644 (2)", - "tab": "Efficiency", - "score": 0.3221198432648663 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Efficiency", - "score": 0.31875184474100593 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.888, mean=319.888, max=319.888, sum=639.776 (2)", - "tab": "General information", - "score": 319.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=0.996, mean=0.996, max=0.996, sum=1.991 (2)", - "tab": "General information", - "score": 0.9955156950672646 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.168, mean=341.168, max=341.168, sum=682.336 (2)", - "tab": "General information", - "score": 341.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=0.992, mean=0.992, max=0.992, sum=1.985 (2)", - "tab": "General information", - "score": 0.9923664122137404 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.33550412989844963 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)", - "tab": "General information", - "score": 639.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=0.983, mean=0.983, max=0.983, sum=1.967 (2)", - "tab": "General information", - "score": 0.9834710743801653 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.755 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.624 (2)", - "tab": "Efficiency", - "score": 0.3120760069302986 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.564, mean=449.564, max=449.564, sum=899.129 (2)", - "tab": "General information", - "score": 449.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "description": "min=0.679, mean=0.679, max=0.679, sum=1.357 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.3368471988609859 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.3103753525076561 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.786, mean=283.786, max=283.786, sum=567.573 (2)", - "tab": "General information", - "score": 283.7864077669903 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.863 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.628 (2)", - "tab": "Efficiency", - "score": 0.3138112644863944 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=0.991, mean=0.991, max=0.991, sum=1.983 (2)", - "tab": "General information", - "score": 0.9914529914529915 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.3096977710723877 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.99, mean=340.99, max=340.99, sum=681.98 (2)", - "tab": "General information", - "score": 340.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=0.97, mean=0.97, max=0.97, sum=1.94 (2)", - "tab": "General information", - "score": 0.97 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.867 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.3106613128730316 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.911, mean=299.911, max=299.911, sum=599.821 (2)", - "tab": "General information", - "score": 299.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=0.99, mean=0.99, max=0.99, sum=1.98 (2)", - "tab": "General information", - "score": 0.9897828863346104 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.649 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.626 (2)", - "tab": "Efficiency", - "score": 0.31282479501184013 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Efficiency", - "score": 0.3348748574709759 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.113, mean=476.113, max=476.113, sum=952.225 (2)", - "tab": "General information", - "score": 476.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.988 (2)", - "tab": "General information", - "score": 0.9942196531791907 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.987 (2)", - "tab": "General information", - "score": 0.9932960893854749 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=1.739 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.664 (2)", - "tab": "Efficiency", - "score": 0.33182784311132496 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)", - "tab": "General information", - "score": 586.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=0.997, mean=0.997, max=0.997, sum=1.993 (2)", - "tab": "General information", - "score": 0.9967320261437909 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.833 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.632 (2)", - "tab": "Efficiency", - "score": 0.3158548356574259 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)", - "tab": "General information", - 
"score": 514.5277777777778 - }, - "Prehistory - # output tokens": { - "description": "min=0.988, mean=0.988, max=0.988, sum=1.975 (2)", - "tab": "General information", - "score": 0.9876543209876543 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.657 (2)", - "tab": "Efficiency", - "score": 0.32829454161904076 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.886 (2)", - "tab": "Efficiency", - "score": 0.44323594618816764 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=0.992, 
mean=0.992, max=0.992, sum=1.984 (2)", - "tab": "General information", - "score": 0.9918367346938776 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "details": { - "description": "min=0.915, mean=0.915, max=0.915, sum=1.831 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.336861949654954 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.517, mean=445.517, max=445.517, sum=891.035 (2)", - "tab": "General information", - "score": 445.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.656 (2)", - "tab": "Efficiency", - "score": 0.32804813155208723 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.018, mean=343.018, max=343.018, sum=686.036 (2)", - "tab": "General information", - "score": 343.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.988 (2)", - "tab": "General information", - "score": 0.9939759036144579 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", 
- "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.752 (2)", - "tab": "Efficiency", - "score": 0.3761981662951018 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=274.52, mean=274.52, max=274.52, sum=549.041 (2)", - "tab": "General information", - "score": 274.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.988 (2)", - "tab": "General information", - "score": 0.9941520467836257 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.629, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json b/data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json deleted file mode 100644 index 2eef769c8..000000000 --- a/data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/writer_palmyra-x-v3/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra X V3 72B", - "id": "writer/palmyra-x-v3", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { 
- "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.493, mean=0.786, max=0.979, sum=89.625 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.555, mean=0.663, max=1.566, sum=75.544 (114)", - "tab": "Efficiency", - "score": 0.6626657480593275 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=277.386, mean=627.489, max=2844.03, sum=71533.746 (114)", - "tab": "General information", - "score": 627.4890026560713 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - 
"mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.62, mean=0.62, max=0.62, sum=1.239 (2)", - "tab": "Efficiency", - "score": 0.6195793676376343 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=371.38, mean=371.38, max=371.38, sum=742.76 (2)", - "tab": "General information", - "score": 371.38 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.467 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.586, mean=0.586, max=0.586, sum=1.172 (2)", - "tab": "Efficiency", - "score": 0.5858598179287381 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=372.081, mean=372.081, max=372.081, sum=744.163 (2)", - "tab": "General information", - "score": 372.0814814814815 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549, - "details": { - "description": "min=0.549, mean=0.549, max=0.549, sum=1.098 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.664, mean=0.664, max=0.664, sum=1.327 (2)", - "tab": "Efficiency", - "score": 0.6636523914337158 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.575, mean=0.575, max=0.575, sum=1.15 (2)", - "tab": "Efficiency", - "score": 0.5751992679304547 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.867, mean=0.867, max=0.867, sum=1.734 (2)", - "tab": "Efficiency", - "score": 0.8668097257614136 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.591, mean=0.591, max=0.591, sum=1.182 (2)", - "tab": "Efficiency", - "score": 0.5912106204032898 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.593, mean=0.593, max=0.593, sum=1.186 (2)", - "tab": "Efficiency", - "score": 0.5927534434147653 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.159 (2)", - "tab": "Efficiency", - "score": 0.5796795171849868 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=545.4, mean=545.4, max=545.4, sum=1090.8 (2)", - "tab": "General information", - "score": 545.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.278, mean=482.278, max=482.278, sum=964.556 
(2)", - "tab": "General information", - "score": 482.27777777777777 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=852.15, mean=852.15, max=852.15, sum=1704.3 (2)", - "tab": "General information", - "score": 852.15 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=611.53, mean=611.53, max=611.53, sum=1223.06 (2)", - "tab": "General information", - "score": 611.53 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=530.301, mean=530.301, max=530.301, sum=1060.601 (2)", - "tab": "General information", - "score": 530.3005780346821 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=489.324, mean=489.324, max=489.324, sum=978.647 (2)", - "tab": "General information", - "score": 489.3235294117647 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.613, mean=0.613, max=0.613, sum=1.227 (2)", - "tab": "Efficiency", - "score": 0.613369300365448 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=387.4, mean=387.4, max=387.4, sum=774.8 (2)", - "tab": "General information", - "score": 387.4 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.298 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.783, mean=0.783, max=0.783, sum=1.566 (2)", - "tab": "Efficiency", - "score": 0.7830351319229394 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=624.07, mean=624.07, max=624.07, sum=1248.14 (2)", - "tab": "General information", - "score": 624.0701754385965 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.586, mean=0.586, max=0.586, sum=1.172 (2)", - "tab": "Efficiency", - "score": 0.5858692646026611 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=398.42, mean=398.42, max=398.42, sum=796.84 (2)", - "tab": "General information", - "score": 398.42 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.581, mean=0.581, max=0.581, sum=1.162 (2)", - "tab": "Efficiency", - "score": 0.5810460448265076 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=418.722, mean=418.722, max=418.722, sum=837.444 (2)", - "tab": "General information", - "score": 418.72222222222223 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": 
{ - "description": "min=0.836, mean=0.836, max=0.836, sum=1.672 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.576, mean=0.576, max=0.576, sum=1.152 (2)", - "tab": "Efficiency", - "score": 0.5761417744627336 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=353.704, mean=353.704, max=353.704, sum=707.408 (2)", - "tab": "General information", - "score": 353.7041800643087 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.858, - "details": { - "description": "min=0.858, mean=0.858, max=0.858, sum=1.716 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.768 (2)", - "tab": "Efficiency", - "score": 0.8839500090655159 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.423 (2)", - "tab": "Efficiency", - "score": 0.7114707704976941 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.981, mean=0.981, max=0.981, sum=1.962 (2)", - "tab": "Efficiency", - "score": 0.9809994663377785 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.598, mean=0.598, max=0.598, sum=1.196 (2)", - "tab": "Efficiency", - "score": 0.5978598594665527 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1118.287, mean=1118.287, max=1118.287, sum=2236.574 (2)", - "tab": "General information", - "score": 1118.2867647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=660.72, mean=660.72, max=660.72, sum=1321.44 (2)", - "tab": "General information", - "score": 660.7198581560284 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1658.73, mean=1658.73, max=1658.73, sum=3317.46 (2)", - "tab": "General information", - "score": 1658.7301173402868 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=597.574, mean=597.574, max=597.574, sum=1195.147 (2)", - "tab": "General information", - "score": 597.5735294117648 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.604, mean=0.604, max=0.604, sum=1.207 (2)", - "tab": "Efficiency", - "score": 0.6037013912200928 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": 
"min=433.12, mean=433.12, max=433.12, sum=866.24 (2)", - "tab": "General information", - "score": 433.12 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=1.724 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.593, mean=0.593, max=0.593, sum=1.186 (2)", - "tab": "Efficiency", - "score": 0.5929083667303386 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=600.112, mean=600.112, max=600.112, sum=1200.224 (2)", - "tab": "General information", - "score": 600.1118421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.598, mean=0.598, max=0.598, sum=1.196 (2)", - "tab": "Efficiency", - "score": 0.5981829071044922 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=589.46, mean=589.46, max=589.46, sum=1178.92 (2)", - "tab": "General information", - "score": 589.46 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.575, mean=0.575, max=0.575, sum=1.15 (2)", - "tab": "Efficiency", - "score": 0.5750116924069962 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=423.925, mean=423.925, max=423.925, sum=847.849 (2)", - "tab": "General information", - "score": 423.92452830188677 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.617 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.161 (2)", - "tab": "Efficiency", - "score": 0.5802780881841132 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=313.723, mean=313.723, max=313.723, sum=627.447 (2)", - "tab": "General information", - "score": 313.72340425531917 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.545 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.583, mean=0.583, max=0.583, sum=1.165 (2)", - "tab": "Efficiency", - "score": 0.5827381166918525 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=430.345, mean=430.345, max=430.345, sum=860.69 (2)", - "tab": "General information", - "score": 430.3448275862069 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.661, mean=0.661, max=0.661, sum=1.323 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.167 (2)", - "tab": "Efficiency", - "score": 0.5836543033993433 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=506.09, mean=506.09, max=506.09, sum=1012.18 (2)", - "tab": "General information", - "score": 506.0899470899471 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.317 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.597, mean=0.597, max=0.597, sum=1.194 (2)", - "tab": "Efficiency", - "score": 0.5971027309932406 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=641, mean=641, max=641, sum=1282 (2)", - "tab": "General information", - "score": 641.0 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.823 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.168 (2)", - "tab": "Efficiency", - "score": 0.5838540715555991 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Efficiency", - "score": 0.5794280843781721 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.873, mean=0.873, max=0.873, sum=1.745 (2)", - "tab": "Efficiency", - "score": 0.8726636576652527 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.532, mean=1.532, max=1.532, sum=3.063 (2)", - "tab": "Efficiency", - "score": 1.5316768602891402 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.568, mean=0.568, max=0.568, sum=1.135 (2)", - "tab": "Efficiency", - "score": 0.5675288703706529 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.574, mean=0.574, max=0.574, sum=1.147 (2)", - "tab": "Efficiency", - "score": 0.573576919773082 - }, 
- "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.608, mean=0.608, max=0.608, sum=1.215 (2)", - "tab": "Efficiency", - "score": 0.607545349536798 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.594, mean=0.594, max=0.594, sum=1.187 (2)", - "tab": "Efficiency", - "score": 0.5936917472768712 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.123 (2)", - "tab": "Efficiency", - "score": 0.5614581979623362 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.594, mean=0.594, max=0.594, sum=1.189 (2)", - "tab": "Efficiency", - "score": 0.5943679051683438 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.595, mean=0.595, max=0.595, sum=1.189 (2)", - "tab": "Efficiency", - "score": 0.5945224263252469 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Efficiency", - "score": 0.8891873856385549 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.566, mean=1.566, max=1.566, sum=3.131 (2)", - "tab": "Efficiency", - "score": 1.5656375043532427 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.751 (2)", - "tab": "Efficiency", - "score": 0.8755375081476783 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=540.748, mean=540.748, max=540.748, sum=1081.497 (2)", - "tab": "General information", - "score": 540.7483870967742 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=495.65, mean=495.65, max=495.65, sum=991.3 (2)", - "tab": "General information", - "score": 495.6502463054187 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt 
tokens": { - "description": "min=904.15, mean=904.15, max=904.15, sum=1808.3 (2)", - "tab": "General information", - "score": 904.15 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2844.03, mean=2844.03, max=2844.03, sum=5688.061 (2)", - "tab": "General information", - "score": 2844.030303030303 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=397.646, mean=397.646, max=397.646, sum=795.293 (2)", - "tab": "General information", - "score": 397.64646464646466 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=478.073, mean=478.073, max=478.073, sum=956.145 (2)", - "tab": "General information", - "score": 478.07253886010363 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=391.987, mean=391.987, max=391.987, sum=783.974 (2)", - "tab": "General information", - "score": 391.9871794871795 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - 
"High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=526.352, mean=526.352, max=526.352, sum=1052.704 (2)", - "tab": "General information", - "score": 526.3518518518518 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=411.055, mean=411.055, max=411.055, sum=822.109 (2)", - "tab": "General information", - "score": 411.0546218487395 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.669, mean=553.669, max=553.669, sum=1107.338 (2)", - "tab": "General information", - "score": 553.6688741721854 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=516.842, mean=516.842, max=516.842, sum=1033.684 (2)", - "tab": "General information", - "score": 516.8422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=805, mean=805, max=805, sum=1610 (2)", - "tab": "General information", - "score": 805.0 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2242.25, mean=2242.25, max=2242.25, sum=4484.5 (2)", - "tab": "General information", - "score": 2242.25 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1438.561, mean=1438.561, max=1438.561, sum=2877.122 (2)", - "tab": "General information", - "score": 1438.5611814345991 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.847 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.577, mean=0.577, max=0.577, sum=1.154 (2)", - "tab": "Efficiency", - "score": 0.5767963167797824 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.564, mean=0.564, max=0.564, sum=1.127 (2)", - "tab": "Efficiency", - "score": 0.5637276700434793 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - 
"description": "min=324.48, mean=324.48, max=324.48, sum=648.96 (2)", - "tab": "General information", - "score": 324.47982062780267 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=357.626, mean=357.626, max=357.626, sum=715.252 (2)", - "tab": "General information", - "score": 357.62595419847327 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.909, - "details": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.818 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.603, mean=0.603, max=0.603, sum=1.205 (2)", - "tab": "Efficiency", - "score": 0.6025364970372729 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.843, mean=639.843, max=639.843, sum=1279.686 (2)", - "tab": "General information", - "score": 639.8429752066115 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.755 (2)", - "tab": "Accuracy", - "Logical Fallacies 
- Observed inference time (s)": { - "description": "min=0.577, mean=0.577, max=0.577, sum=1.154 (2)", - "tab": "Efficiency", - "score": 0.5770467907373159 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=454.227, mean=454.227, max=454.227, sum=908.454 (2)", - "tab": "General information", - "score": 454.2269938650307 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.625, mean=0.625, max=0.625, sum=1.25 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.612, mean=0.612, max=0.612, sum=1.223 (2)", - "tab": "Efficiency", - "score": 0.6116326642887933 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=671.598, mean=671.598, max=671.598, sum=1343.196 (2)", - "tab": "General information", - "score": 671.5982142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.555, mean=0.555, max=0.555, sum=1.111 (2)", - "tab": "Efficiency", - "score": 
0.5553541276061419 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=292.34, mean=292.34, max=292.34, sum=584.68 (2)", - "tab": "General information", - "score": 292.3398058252427 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=1.133 (2)", - "tab": "Efficiency", - "score": 0.56665647131765 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=438.697, mean=438.697, max=438.697, sum=877.393 (2)", - "tab": "General information", - "score": 438.6965811965812 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.131 (2)", - "tab": "Efficiency", - "score": 0.5655512261390686 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=352.71, mean=352.71, max=352.71, sum=705.42 (2)", - "tab": "General information", - "score": 352.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.788 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.571, mean=0.571, max=0.571, sum=1.142 (2)", - "tab": "Efficiency", - "score": 0.5712210739252668 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=314.847, mean=314.847, max=314.847, sum=629.693 (2)", - "tab": "General information", - "score": 314.84674329501917 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.124 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Efficiency", - "score": 0.5724084032753299 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.583, mean=0.583, max=0.583, sum=1.166 (2)", - "tab": "Efficiency", - "score": 0.5827599754546607 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, 
sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=497.329, mean=497.329, max=497.329, sum=994.659 (2)", - "tab": "General information", - "score": 497.32947976878614 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=664.482, mean=664.482, max=664.482, sum=1328.963 (2)", - "tab": "General information", - "score": 664.4815642458101 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.856, - "details": { - "description": "min=0.856, mean=0.856, max=0.856, sum=1.712 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.18 (2)", - "tab": "Efficiency", - "score": 0.5898437850615558 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=584.69, mean=584.69, max=584.69, sum=1169.379 (2)", - "tab": "General information", - "score": 584.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=1.17 (2)", - "tab": "Efficiency", - "score": 0.5852300509994413 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=524.454, mean=524.454, max=524.454, sum=1048.907 (2)", - "tab": "General information", - "score": 524.4537037037037 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=1.545 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=1.134 (2)", - "tab": "Efficiency", - "score": 0.5669147144664418 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=420.609, mean=420.609, max=420.609, sum=841.218 (2)", - "tab": "General information", - "score": 420.6090909090909 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.665 (2)", - "tab": "Accuracy", - "Security Studies - Observed 
inference time (s)": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Efficiency", - "score": 0.8641960144042968 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1196.433, mean=1196.433, max=1196.433, sum=2392.865 (2)", - "tab": "General information", - "score": 1196.4326530612245 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.158 (2)", - "tab": "Efficiency", - "score": 0.5788582047419761 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=446.512, mean=446.512, max=446.512, sum=893.025 (2)", - "tab": "General information", - "score": 446.5124378109453 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - "details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.569, mean=0.569, max=0.569, sum=1.138 (2)", - "tab": "Efficiency", - "score": 0.5690187689769699 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, 
sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=352.753, mean=352.753, max=352.753, sum=705.506 (2)", - "tab": "General information", - "score": 352.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Efficiency", - "score": 0.5794550257119518 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=277.386, mean=277.386, max=277.386, sum=554.772 (2)", - "tab": "General information", - "score": 277.3859649122807 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/0-hero/Matter-0.2-7B-DPO/0d7928c3-c769-474e-8249-7a5c70c4c559.json b/data/hfopenllm_v2/0-hero/Matter-0.2-7B-DPO/0d7928c3-c769-474e-8249-7a5c70c4c559.json deleted file mode 100644 index f776710f3..000000000 --- a/data/hfopenllm_v2/0-hero/Matter-0.2-7B-DPO/0d7928c3-c769-474e-8249-7a5c70c4c559.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/0-hero_Matter-0.2-7B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Matter-0.2-7B-DPO", - "id": "0-hero/Matter-0.2-7B-DPO", - "developer": "0-hero", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3303 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-34B-32K/f63536ed-752b-4538-9b92-2514a617a4bf.json b/data/hfopenllm_v2/01-ai/Yi-1.5-34B-32K/f63536ed-752b-4538-9b92-2514a617a4bf.json deleted file mode 100644 index 7d0d73c85..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-34B-32K/f63536ed-752b-4538-9b92-2514a617a4bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B-32K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-34B-32K", - "id": "01-ai/Yi-1.5-34B-32K", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6016 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4709 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat-16K/8ff13de2-ea43-4392-992f-ba70b6023e96.json b/data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat-16K/8ff13de2-ea43-4392-992f-ba70b6023e96.json deleted file mode 100644 index 8682b3811..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat-16K/8ff13de2-ea43-4392-992f-ba70b6023e96.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B-Chat-16K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-34B-Chat-16K", - "id": "01-ai/Yi-1.5-34B-Chat-16K", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2137 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat/02bac8a7-bd09-4e73-979a-7dbaa7a8ed75.json b/data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat/02bac8a7-bd09-4e73-979a-7dbaa7a8ed75.json deleted file mode 100644 index 1a02c9bdc..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat/02bac8a7-bd09-4e73-979a-7dbaa7a8ed75.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-34B-Chat", - "id": "01-ai/Yi-1.5-34B-Chat", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6067 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2772 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4282 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-34B/74e4406d-b2b6-4c3f-b059-f52cccf1fff4.json b/data/hfopenllm_v2/01-ai/Yi-1.5-34B/74e4406d-b2b6-4c3f-b059-f52cccf1fff4.json deleted file mode 100644 index 948057bc5..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-34B/74e4406d-b2b6-4c3f-b059-f52cccf1fff4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-34B", - "id": "01-ai/Yi-1.5-34B", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2841 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5976 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4666 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-6B-Chat/ec8a6d6c-b8ea-48a3-9af6-d357e0057ec1.json b/data/hfopenllm_v2/01-ai/Yi-1.5-6B-Chat/ec8a6d6c-b8ea-48a3-9af6-d357e0057ec1.json deleted file mode 100644 index 3a37bdc49..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-6B-Chat/ec8a6d6c-b8ea-48a3-9af6-d357e0057ec1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-6B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-6B-Chat", - "id": "01-ai/Yi-1.5-6B-Chat", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5145 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4571 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1624 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3193 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-6B/05307b41-d832-4533-99bd-c8608bf8e64c.json b/data/hfopenllm_v2/01-ai/Yi-1.5-6B/05307b41-d832-4533-99bd-c8608bf8e64c.json deleted file mode 100644 index 8abcdb009..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-6B/05307b41-d832-4533-99bd-c8608bf8e64c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-6B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-6B", - "id": "01-ai/Yi-1.5-6B", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4374 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-9B-32K/c09bd9b0-6f85-4120-94a9-b628c68bccb7.json b/data/hfopenllm_v2/01-ai/Yi-1.5-9B-32K/c09bd9b0-6f85-4120-94a9-b628c68bccb7.json deleted file mode 100644 index 510b97e1d..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-9B-32K/c09bd9b0-6f85-4120-94a9-b628c68bccb7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B-32K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-9B-32K", - "id": "01-ai/Yi-1.5-9B-32K", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2303 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4963 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3765 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat-16K/9f971385-1146-4436-91a6-0e52d4db1f07.json b/data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat-16K/9f971385-1146-4436-91a6-0e52d4db1f07.json deleted file mode 100644 index 67dbc9f74..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat-16K/9f971385-1146-4436-91a6-0e52d4db1f07.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B-Chat-16K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-9B-Chat-16K", - "id": "01-ai/Yi-1.5-9B-Chat-16K", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5153 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4099 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat/80ed14ca-b4cd-4ceb-8fdb-24705e47bd0e.json b/data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat/80ed14ca-b4cd-4ceb-8fdb-24705e47bd0e.json deleted file mode 100644 index 9ac18fdab..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat/80ed14ca-b4cd-4ceb-8fdb-24705e47bd0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-9B-Chat", - "id": "01-ai/Yi-1.5-9B-Chat", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6046 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-9B/db88e3f5-58a9-4783-9093-a6df96483342.json b/data/hfopenllm_v2/01-ai/Yi-1.5-9B/db88e3f5-58a9-4783-9093-a6df96483342.json deleted file mode 100644 index 465841ce2..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-9B/db88e3f5-58a9-4783-9093-a6df96483342.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-9B", - "id": "01-ai/Yi-1.5-9B", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3916 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/01-ai/Yi-34B-200K/8cd90f8a-d8dc-469b-95b9-260fcef804d2.json b/data/hfopenllm_v2/01-ai/Yi-34B-200K/8cd90f8a-d8dc-469b-95b9-260fcef804d2.json deleted file mode 100644 index ba5c90e1f..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-34B-200K/8cd90f8a-d8dc-469b-95b9-260fcef804d2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-34B-200K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-34B-200K", - "id": "01-ai/Yi-34B-200K", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5442 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4535 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-34B-Chat/b2c82703-2b5c-407d-b84f-a8f8261ac894.json b/data/hfopenllm_v2/01-ai/Yi-34B-Chat/b2c82703-2b5c-407d-b84f-a8f8261ac894.json deleted file mode 100644 index a62cda89d..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-34B-Chat/b2c82703-2b5c-407d-b84f-a8f8261ac894.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-34B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-34B-Chat", - "id": "01-ai/Yi-34B-Chat", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4699 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5561 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-34B/55462e67-5eca-4e9d-9095-51fcf12de5fa.json b/data/hfopenllm_v2/01-ai/Yi-34B/55462e67-5eca-4e9d-9095-51fcf12de5fa.json deleted file mode 100644 index 1781d005a..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-34B/55462e67-5eca-4e9d-9095-51fcf12de5fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Yi-34B", - "id": "01-ai/Yi-34B", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3046 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5457 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-6B-200K/25a119f0-5eaa-4fa9-8cd4-e0f437ada456.json b/data/hfopenllm_v2/01-ai/Yi-6B-200K/25a119f0-5eaa-4fa9-8cd4-e0f437ada456.json deleted file mode 100644 index 8ffbe3e70..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-6B-200K/25a119f0-5eaa-4fa9-8cd4-e0f437ada456.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-6B-200K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-6B-200K", - "id": "01-ai/Yi-6B-200K", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0843 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4587 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-6B-Chat/efc036b6-d8de-4393-87a1-d4f86fb44d91.json b/data/hfopenllm_v2/01-ai/Yi-6B-Chat/efc036b6-d8de-4393-87a1-d4f86fb44d91.json deleted file mode 100644 index a454f4866..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-6B-Chat/efc036b6-d8de-4393-87a1-d4f86fb44d91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-6B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-6B-Chat", - "id": "01-ai/Yi-6B-Chat", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4133 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3061 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-6B/a5144406-eb85-43b2-a49d-be6b06d6b04a.json b/data/hfopenllm_v2/01-ai/Yi-6B/a5144406-eb85-43b2-a49d-be6b06d6b04a.json deleted file mode 100644 index b7fc1616d..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-6B/a5144406-eb85-43b2-a49d-be6b06d6b04a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-6B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-6B", - "id": "01-ai/Yi-6B", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4309 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2991 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-9B-200K/900184ad-656d-416b-956f-5f6e3a991d1b.json b/data/hfopenllm_v2/01-ai/Yi-9B-200K/900184ad-656d-416b-956f-5f6e3a991d1b.json deleted file mode 100644 index de7d6ef80..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-9B-200K/900184ad-656d-416b-956f-5f6e3a991d1b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-9B-200K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-9B-200K", - "id": "01-ai/Yi-9B-200K", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2327 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4793 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4294 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-9B/7a58954a-5d7d-4640-99fd-773249640237.json b/data/hfopenllm_v2/01-ai/Yi-9B/7a58954a-5d7d-4640-99fd-773249640237.json deleted file mode 100644 index f8eee73fd..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-9B/7a58954a-5d7d-4640-99fd-773249640237.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-9B", - "id": "01-ai/Yi-9B", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2709 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4054 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-Coder-9B-Chat/4ea3146c-b912-424a-b0a9-7c37348348c8.json b/data/hfopenllm_v2/01-ai/Yi-Coder-9B-Chat/4ea3146c-b912-424a-b0a9-7c37348348c8.json deleted file mode 100644 index aa071d68a..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-Coder-9B-Chat/4ea3146c-b912-424a-b0a9-7c37348348c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-Coder-9B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-Coder-9B-Chat", - "id": "01-ai/Yi-Coder-9B-Chat", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4817 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4814 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3992 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2425 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct/b0276278-6d86-49c0-a246-cd9110ac1deb.json b/data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct/b0276278-6d86-49c0-a246-cd9110ac1deb.json deleted file mode 100644 index 14f396bdb..000000000 --- a/data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct/b0276278-6d86-49c0-a246-cd9110ac1deb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/1-800-LLMs_Qwen-2.5-14B-Hindi-Custom-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-14B-Hindi-Custom-Instruct", - "id": "1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct", - "developer": "1-800-LLMs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4491 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi/04216f67-1385-43bf-b7de-5bae7a60f379.json b/data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi/04216f67-1385-43bf-b7de-5bae7a60f379.json deleted file mode 100644 index 96c0dfd9a..000000000 --- a/data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi/04216f67-1385-43bf-b7de-5bae7a60f379.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/1-800-LLMs_Qwen-2.5-14B-Hindi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-14B-Hindi", - "id": "1-800-LLMs/Qwen-2.5-14B-Hindi", - "developer": "1-800-LLMs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5826 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/1024m/PHI-4-Hindi/fbf7b76b-7ced-4217-8e14-1d02184e271c.json b/data/hfopenllm_v2/1024m/PHI-4-Hindi/fbf7b76b-7ced-4217-8e14-1d02184e271c.json deleted file mode 100644 index 0b0fe7e6b..000000000 --- a/data/hfopenllm_v2/1024m/PHI-4-Hindi/fbf7b76b-7ced-4217-8e14-1d02184e271c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/1024m_PHI-4-Hindi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PHI-4-Hindi", - "id": "1024m/PHI-4-Hindi", - "developer": "1024m", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2334 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5239 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/1024m/QWEN-14B-B100/74ac8aba-6dfb-464c-81b5-d02a9192b9cc.json b/data/hfopenllm_v2/1024m/QWEN-14B-B100/74ac8aba-6dfb-464c-81b5-d02a9192b9cc.json deleted file mode 100644 index 21260063a..000000000 --- a/data/hfopenllm_v2/1024m/QWEN-14B-B100/74ac8aba-6dfb-464c-81b5-d02a9192b9cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/1024m_QWEN-14B-B100/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QWEN-14B-B100", - "id": "1024m/QWEN-14B-B100", - "developer": "1024m", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6533 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.41 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/152334H/miqu-1-70b-sf/295938e1-ade2-4d36-beca-3cbe506b5b90.json b/data/hfopenllm_v2/152334H/miqu-1-70b-sf/295938e1-ade2-4d36-beca-3cbe506b5b90.json deleted file mode 100644 index 133dae7a4..000000000 --- a/data/hfopenllm_v2/152334H/miqu-1-70b-sf/295938e1-ade2-4d36-beca-3cbe506b5b90.json 
+++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/152334H_miqu-1-70b-sf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miqu-1-70b-sf", - "id": "152334H/miqu-1-70b-sf", - "developer": "152334H", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 68.977 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5182 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6102 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4582 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/1TuanPham/T-VisStar-7B-v0.1/f331782f-ea09-41bd-8c6a-e964c88d7e09.json b/data/hfopenllm_v2/1TuanPham/T-VisStar-7B-v0.1/f331782f-ea09-41bd-8c6a-e964c88d7e09.json deleted file mode 100644 index 95ed6976e..000000000 --- a/data/hfopenllm_v2/1TuanPham/T-VisStar-7B-v0.1/f331782f-ea09-41bd-8c6a-e964c88d7e09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/1TuanPham_T-VisStar-7B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T-VisStar-7B-v0.1", - "id": "1TuanPham/T-VisStar-7B-v0.1", - "developer": "1TuanPham", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.294 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5052 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3211 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/1TuanPham/T-VisStar-v0.1/e4e3d79a-1de9-43be-a029-0be4f60e472b.json b/data/hfopenllm_v2/1TuanPham/T-VisStar-v0.1/e4e3d79a-1de9-43be-a029-0be4f60e472b.json deleted file mode 100644 index 32ac26ea2..000000000 --- a/data/hfopenllm_v2/1TuanPham/T-VisStar-v0.1/e4e3d79a-1de9-43be-a029-0be4f60e472b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/1TuanPham_T-VisStar-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T-VisStar-v0.1", - "id": "1TuanPham/T-VisStar-v0.1", - "developer": "1TuanPham", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.294 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5052 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3211 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/3rd-Degree-Burn/L-3.1-Science-Writer-8B/6914ac28-b543-4f36-81f1-f7491c018e3b.json b/data/hfopenllm_v2/3rd-Degree-Burn/L-3.1-Science-Writer-8B/6914ac28-b543-4f36-81f1-f7491c018e3b.json deleted file mode 100644 index c548f8a55..000000000 --- a/data/hfopenllm_v2/3rd-Degree-Burn/L-3.1-Science-Writer-8B/6914ac28-b543-4f36-81f1-f7491c018e3b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/3rd-Degree-Burn_L-3.1-Science-Writer-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L-3.1-Science-Writer-8B", - "id": "3rd-Degree-Burn/L-3.1-Science-Writer-8B", - "developer": "3rd-Degree-Burn", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4263 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5041 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot-v1/b7378f41-46ab-41af-94cc-e7fb10738658.json b/data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot-v1/b7378f41-46ab-41af-94cc-e7fb10738658.json deleted file mode 100644 index 44e2bc2aa..000000000 --- a/data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot-v1/b7378f41-46ab-41af-94cc-e7fb10738658.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/3rd-Degree-Burn_Llama-3.1-8B-Squareroot-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Squareroot-v1", - "id": "3rd-Degree-Burn/Llama-3.1-8B-Squareroot-v1", - "developer": "3rd-Degree-Burn", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2892 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0884 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot/acedae59-6192-4ac4-a354-d520ecd6ba36.json b/data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot/acedae59-6192-4ac4-a354-d520ecd6ba36.json deleted file mode 100644 index 56cd9d2ea..000000000 --- a/data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot/acedae59-6192-4ac4-a354-d520ecd6ba36.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/3rd-Degree-Burn_Llama-3.1-8B-Squareroot/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Squareroot", - "id": "3rd-Degree-Burn/Llama-3.1-8B-Squareroot", - "developer": "3rd-Degree-Burn", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.175 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/3rd-Degree-Burn/Llama-Squared-8B/ff105961-761d-4261-8a44-20acf2e7f440.json b/data/hfopenllm_v2/3rd-Degree-Burn/Llama-Squared-8B/ff105961-761d-4261-8a44-20acf2e7f440.json deleted file mode 100644 index 4375cd5c9..000000000 --- a/data/hfopenllm_v2/3rd-Degree-Burn/Llama-Squared-8B/ff105961-761d-4261-8a44-20acf2e7f440.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/3rd-Degree-Burn_Llama-Squared-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Squared-8B", - "id": "3rd-Degree-Burn/Llama-Squared-8B", - "developer": "3rd-Degree-Burn", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2755 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/4season/final_model_test_v2/fa0901f6-514e-44ae-84dc-0b793f26169e.json b/data/hfopenllm_v2/4season/final_model_test_v2/fa0901f6-514e-44ae-84dc-0b793f26169e.json deleted file mode 100644 index 2c3cdad71..000000000 --- a/data/hfopenllm_v2/4season/final_model_test_v2/fa0901f6-514e-44ae-84dc-0b793f26169e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/4season_final_model_test_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "final_model_test_v2", - "id": "4season/final_model_test_v2", - "developer": "4season", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.421 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3191 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6342 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3528 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-Instruct-preview/d2dff5df-343b-40f3-85de-14eb72dab050.json b/data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-Instruct-preview/d2dff5df-343b-40f3-85de-14eb72dab050.json deleted file mode 100644 index 6aa520b9d..000000000 --- a/data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-Instruct-preview/d2dff5df-343b-40f3-85de-14eb72dab050.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AALF_FuseChat-Llama-3.1-8B-Instruct-preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseChat-Llama-3.1-8B-Instruct-preview", - "id": "AALF/FuseChat-Llama-3.1-8B-Instruct-preview", - "developer": "AALF", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-SFT-preview/8fa3010f-b7a1-4fc1-9156-ba70453add86.json b/data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-SFT-preview/8fa3010f-b7a1-4fc1-9156-ba70453add86.json deleted file mode 100644 index 5b5e4c7c1..000000000 --- a/data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-SFT-preview/8fa3010f-b7a1-4fc1-9156-ba70453add86.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AALF_FuseChat-Llama-3.1-8B-SFT-preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseChat-Llama-3.1-8B-SFT-preview", - "id": "AALF/FuseChat-Llama-3.1-8B-SFT-preview", - "developer": "AALF", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7281 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2251 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3743 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K-100steps/58034f99-3b01-46d6-aea9-90c75d073bb0.json b/data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K-100steps/58034f99-3b01-46d6-aea9-90c75d073bb0.json deleted file mode 100644 index 410931148..000000000 --- a/data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K-100steps/58034f99-3b01-46d6-aea9-90c75d073bb0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AALF_gemma-2-27b-it-SimPO-37K-100steps/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-27b-it-SimPO-37K-100steps", - "id": "AALF/gemma-2-27b-it-SimPO-37K-100steps", - "developer": "AALF", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2568 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3931 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3329 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K/e6c08c9c-6d01-45c7-8a24-219b756b8632.json b/data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K/e6c08c9c-6d01-45c7-8a24-219b756b8632.json deleted file mode 100644 index 3cc0a4181..000000000 --- a/data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K/e6c08c9c-6d01-45c7-8a24-219b756b8632.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AALF_gemma-2-27b-it-SimPO-37K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-27b-it-SimPO-37K", - "id": "AALF/gemma-2-27b-it-SimPO-37K", - "developer": "AALF", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1971 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AELLM/gemma-2-aeria-infinity-9b/cd97ad01-1d20-4cbd-a9bb-2acf3d9fdcc7.json b/data/hfopenllm_v2/AELLM/gemma-2-aeria-infinity-9b/cd97ad01-1d20-4cbd-a9bb-2acf3d9fdcc7.json deleted file mode 100644 index 4e8d23811..000000000 --- a/data/hfopenllm_v2/AELLM/gemma-2-aeria-infinity-9b/cd97ad01-1d20-4cbd-a9bb-2acf3d9fdcc7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AELLM_gemma-2-aeria-infinity-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-aeria-infinity-9b", - "id": "AELLM/gemma-2-aeria-infinity-9b", - "developer": "AELLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7594 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3862 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AELLM/gemma-2-lyco-infinity-9b/95f44ef8-e5ba-4bdc-97a7-2c5a678b07be.json b/data/hfopenllm_v2/AELLM/gemma-2-lyco-infinity-9b/95f44ef8-e5ba-4bdc-97a7-2c5a678b07be.json deleted file mode 100644 index 8c6280d2f..000000000 --- a/data/hfopenllm_v2/AELLM/gemma-2-lyco-infinity-9b/95f44ef8-e5ba-4bdc-97a7-2c5a678b07be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AELLM_gemma-2-lyco-infinity-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-lyco-infinity-9b", - "id": "AELLM/gemma-2-lyco-infinity-9b", - "developer": "AELLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7316 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4006 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AGI-0/Art-v0-3B/082f25f0-994c-438a-8086-b1e439aca466.json b/data/hfopenllm_v2/AGI-0/Art-v0-3B/082f25f0-994c-438a-8086-b1e439aca466.json deleted file mode 100644 index 521160bae..000000000 --- a/data/hfopenllm_v2/AGI-0/Art-v0-3B/082f25f0-994c-438a-8086-b1e439aca466.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AGI-0_Art-v0-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Art-v0-3B", - "id": "AGI-0/Art-v0-3B", - "developer": "AGI-0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3192 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3401 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2462 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AGI-0/Artificium-llama3.1-8B-001/31423cbd-08cd-4079-b1c5-ba412acf1b51.json 
b/data/hfopenllm_v2/AGI-0/Artificium-llama3.1-8B-001/31423cbd-08cd-4079-b1c5-ba412acf1b51.json deleted file mode 100644 index 39747f5fd..000000000 --- a/data/hfopenllm_v2/AGI-0/Artificium-llama3.1-8B-001/31423cbd-08cd-4079-b1c5-ba412acf1b51.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AGI-0_Artificium-llama3.1-8B-001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Artificium-llama3.1-8B-001", - "id": "AGI-0/Artificium-llama3.1-8B-001", - "developer": "AGI-0", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5248 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AGI-0/smartllama3.1-8B-001/2669bd86-da65-4d87-8464-bfa8c741ce0b.json b/data/hfopenllm_v2/AGI-0/smartllama3.1-8B-001/2669bd86-da65-4d87-8464-bfa8c741ce0b.json deleted file mode 100644 index 2a80fe730..000000000 --- a/data/hfopenllm_v2/AGI-0/smartllama3.1-8B-001/2669bd86-da65-4d87-8464-bfa8c741ce0b.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AGI-0_smartllama3.1-8B-001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smartllama3.1-8B-001", - "id": "AGI-0/smartllama3.1-8B-001", - "developer": "AGI-0", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.467 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AI-MO/NuminaMath-7B-CoT/ab2c19ff-5671-446f-b09e-731e2ae515ca.json b/data/hfopenllm_v2/AI-MO/NuminaMath-7B-CoT/ab2c19ff-5671-446f-b09e-731e2ae515ca.json deleted file mode 100644 index 885230cbc..000000000 --- a/data/hfopenllm_v2/AI-MO/NuminaMath-7B-CoT/ab2c19ff-5671-446f-b09e-731e2ae515ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AI-MO_NuminaMath-7B-CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NuminaMath-7B-CoT", - "id": "AI-MO/NuminaMath-7B-CoT", - "developer": "AI-MO", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.91 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2689 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2696 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2868 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AI-MO/NuminaMath-7B-TIR/36250dc3-cb51-43be-8ab0-6788eb5bda7c.json b/data/hfopenllm_v2/AI-MO/NuminaMath-7B-TIR/36250dc3-cb51-43be-8ab0-6788eb5bda7c.json deleted file mode 100644 index d1e8152de..000000000 --- a/data/hfopenllm_v2/AI-MO/NuminaMath-7B-TIR/36250dc3-cb51-43be-8ab0-6788eb5bda7c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AI-MO_NuminaMath-7B-TIR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NuminaMath-7B-TIR", - "id": "AI-MO/NuminaMath-7B-TIR", - "developer": "AI-MO", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.91 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2756 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4144 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3509 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2733 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AI-Sweden-Models/Llama-3-8B-instruct/cd616d6a-151f-4aaa-93b5-9c4a758f95b5.json b/data/hfopenllm_v2/AI-Sweden-Models/Llama-3-8B-instruct/cd616d6a-151f-4aaa-93b5-9c4a758f95b5.json deleted file mode 100644 index 4c342ae03..000000000 --- a/data/hfopenllm_v2/AI-Sweden-Models/Llama-3-8B-instruct/cd616d6a-151f-4aaa-93b5-9c4a758f95b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AI-Sweden-Models_Llama-3-8B-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-instruct", - "id": "AI-Sweden-Models/Llama-3-8B-instruct", - "developer": "AI-Sweden-Models", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2401 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4771 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2597 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AI-Sweden-Models/gpt-sw3-40b/9cb09cae-9b1b-43b1-afbf-f44b0a44053c.json b/data/hfopenllm_v2/AI-Sweden-Models/gpt-sw3-40b/9cb09cae-9b1b-43b1-afbf-f44b0a44053c.json deleted file mode 100644 index d5848d50e..000000000 --- a/data/hfopenllm_v2/AI-Sweden-Models/gpt-sw3-40b/9cb09cae-9b1b-43b1-afbf-f44b0a44053c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AI-Sweden-Models_gpt-sw3-40b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-sw3-40b", - "id": "AI-Sweden-Models/gpt-sw3-40b", - "developer": "AI-Sweden-Models", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPT2LMHeadModel", - "params_billions": 39.927 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.147 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3268 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3632 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AI4free/Dhanishtha/038c32da-add5-4299-ac17-df6ef3fdea58.json b/data/hfopenllm_v2/AI4free/Dhanishtha/038c32da-add5-4299-ac17-df6ef3fdea58.json deleted file mode 100644 index ead44ad6b..000000000 --- a/data/hfopenllm_v2/AI4free/Dhanishtha/038c32da-add5-4299-ac17-df6ef3fdea58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AI4free_Dhanishtha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dhanishtha", - "id": "AI4free/Dhanishtha", - "developer": "AI4free", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2451 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3404 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.256 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3569 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1643 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AI4free/t2/25eb4bdf-beb4-4ad2-a5e9-3a2f31c46cb5.json b/data/hfopenllm_v2/AI4free/t2/25eb4bdf-beb4-4ad2-a5e9-3a2f31c46cb5.json deleted file mode 100644 index f06b40189..000000000 --- a/data/hfopenllm_v2/AI4free/t2/25eb4bdf-beb4-4ad2-a5e9-3a2f31c46cb5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AI4free_t2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "t2", - "id": "AI4free/t2", - "developer": "AI4free", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.291 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1896 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AIDC-AI/Marco-o1/77655d60-872f-468a-acc6-d584ef5bf46a.json b/data/hfopenllm_v2/AIDC-AI/Marco-o1/77655d60-872f-468a-acc6-d584ef5bf46a.json deleted file mode 100644 index a71a6c124..000000000 --- a/data/hfopenllm_v2/AIDC-AI/Marco-o1/77655d60-872f-468a-acc6-d584ef5bf46a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AIDC-AI_Marco-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Marco-o1", - "id": "AIDC-AI/Marco-o1", - "developer": "AIDC-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4771 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5364 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3746 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aashraf995/Creative-7B-nerd/4de378c8-ccf6-4f0b-8287-3d138a8645b9.json b/data/hfopenllm_v2/Aashraf995/Creative-7B-nerd/4de378c8-ccf6-4f0b-8287-3d138a8645b9.json deleted file mode 100644 index 90198bb55..000000000 --- a/data/hfopenllm_v2/Aashraf995/Creative-7B-nerd/4de378c8-ccf6-4f0b-8287-3d138a8645b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aashraf995_Creative-7B-nerd/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Creative-7B-nerd", - "id": "Aashraf995/Creative-7B-nerd", - "developer": "Aashraf995", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4722 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3165 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4492 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aashraf995/Gemma-Evo-10B/8039cadf-6644-44e7-8452-90e9c8069e28.json b/data/hfopenllm_v2/Aashraf995/Gemma-Evo-10B/8039cadf-6644-44e7-8452-90e9c8069e28.json deleted file mode 100644 index 915792a80..000000000 --- a/data/hfopenllm_v2/Aashraf995/Gemma-Evo-10B/8039cadf-6644-44e7-8452-90e9c8069e28.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aashraf995_Gemma-Evo-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-Evo-10B", - "id": "Aashraf995/Gemma-Evo-10B", - "developer": "Aashraf995", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7332 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6044 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } 
- }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aashraf995/Qwen-Evo-7B/8914d89d-c873-4704-998e-dc807e96030b.json b/data/hfopenllm_v2/Aashraf995/Qwen-Evo-7B/8914d89d-c873-4704-998e-dc807e96030b.json deleted file mode 100644 index b958767ce..000000000 --- a/data/hfopenllm_v2/Aashraf995/Qwen-Evo-7B/8914d89d-c873-4704-998e-dc807e96030b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aashraf995_Qwen-Evo-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-Evo-7B", - "id": "Aashraf995/Qwen-Evo-7B", - "developer": "Aashraf995", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5709 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aashraf995/QwenStock-14B/c2e9fc29-db07-4b49-a98a-084158831ac4.json b/data/hfopenllm_v2/Aashraf995/QwenStock-14B/c2e9fc29-db07-4b49-a98a-084158831ac4.json deleted file mode 100644 index 2181091a7..000000000 --- a/data/hfopenllm_v2/Aashraf995/QwenStock-14B/c2e9fc29-db07-4b49-a98a-084158831ac4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aashraf995_QwenStock-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenStock-14B", - "id": "Aashraf995/QwenStock-14B", - "developer": "Aashraf995", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5009 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3573 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AbacusResearch/Jallabi-34B/58724539-6fc5-40d9-ba43-87410959894d.json 
b/data/hfopenllm_v2/AbacusResearch/Jallabi-34B/58724539-6fc5-40d9-ba43-87410959894d.json deleted file mode 100644 index 51b7d1999..000000000 --- a/data/hfopenllm_v2/AbacusResearch/Jallabi-34B/58724539-6fc5-40d9-ba43-87410959894d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AbacusResearch_Jallabi-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jallabi-34B", - "id": "AbacusResearch/Jallabi-34B", - "developer": "AbacusResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6023 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4822 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4682 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ahdoot/StructuredThinker-v0.3-MoreStructure/b13324cf-f6f5-4bf1-9cf3-c196120c4bcf.json b/data/hfopenllm_v2/Ahdoot/StructuredThinker-v0.3-MoreStructure/b13324cf-f6f5-4bf1-9cf3-c196120c4bcf.json deleted file mode 100644 index a962961f2..000000000 --- 
a/data/hfopenllm_v2/Ahdoot/StructuredThinker-v0.3-MoreStructure/b13324cf-f6f5-4bf1-9cf3-c196120c4bcf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ahdoot_StructuredThinker-v0.3-MoreStructure/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StructuredThinker-v0.3-MoreStructure", - "id": "Ahdoot/StructuredThinker-v0.3-MoreStructure", - "developer": "Ahdoot", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4193 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4838 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2908 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ahdoot/Test_StealthThinker/782b2df0-d1b3-414c-a4bd-59052a4441a9.json b/data/hfopenllm_v2/Ahdoot/Test_StealthThinker/782b2df0-d1b3-414c-a4bd-59052a4441a9.json deleted file mode 100644 index 717c7bcb8..000000000 --- a/data/hfopenllm_v2/Ahdoot/Test_StealthThinker/782b2df0-d1b3-414c-a4bd-59052a4441a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Ahdoot_Test_StealthThinker/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Test_StealthThinker", - "id": "Ahdoot/Test_StealthThinker", - "developer": "Ahdoot", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.179 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3597 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder/b508e41e-0f1c-49ce-8b80-5e7ec82b8f15.json b/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder/b508e41e-0f1c-49ce-8b80-5e7ec82b8f15.json deleted file mode 100644 index 8c4cb6a4c..000000000 --- a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder/b508e41e-0f1c-49ce-8b80-5e7ec82b8f15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AicoresSecurity_Cybernet-Sec-3B-R1-V0-Coder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cybernet-Sec-3B-R1-V0-Coder", - "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder", - "developer": "AicoresSecurity", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7098 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1488 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3178 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0/2824e8d4-2749-4b18-a3a1-b987ed215ac6.json b/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0/2824e8d4-2749-4b18-a3a1-b987ed215ac6.json deleted file mode 100644 index 5acadd460..000000000 --- a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0/2824e8d4-2749-4b18-a3a1-b987ed215ac6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AicoresSecurity_Cybernet-Sec-3B-R1-V0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cybernet-Sec-3B-R1-V0", - "id": 
"AicoresSecurity/Cybernet-Sec-3B-R1-V0", - "developer": "AicoresSecurity", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6358 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4497 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.301 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1.1/53176984-ba93-4a64-b81e-21f6e0f65bcd.json b/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1.1/53176984-ba93-4a64-b81e-21f6e0f65bcd.json deleted file mode 100644 index 6634b986e..000000000 --- a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1.1/53176984-ba93-4a64-b81e-21f6e0f65bcd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AicoresSecurity_Cybernet-Sec-3B-R1-V1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cybernet-Sec-3B-R1-V1.1", - "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1.1", - "developer": "AicoresSecurity", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3088 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1/53252698-7d17-4f2a-9106-3b744ae7a985.json b/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1/53252698-7d17-4f2a-9106-3b744ae7a985.json deleted file mode 100644 index 0a69dcaf4..000000000 --- a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1/53252698-7d17-4f2a-9106-3b744ae7a985.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AicoresSecurity_Cybernet-Sec-3B-R1-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cybernet-Sec-3B-R1-V1", - "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1", - "developer": "AicoresSecurity", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6146 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2876 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Alepach/notHumpback-M0/6dd0f3a2-27ee-48f1-9d97-ef6954d298c8.json b/data/hfopenllm_v2/Alepach/notHumpback-M0/6dd0f3a2-27ee-48f1-9d97-ef6954d298c8.json deleted file mode 100644 index c2db43af7..000000000 --- a/data/hfopenllm_v2/Alepach/notHumpback-M0/6dd0f3a2-27ee-48f1-9d97-ef6954d298c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Alepach_notHumpback-M0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "notHumpback-M0", - "id": "Alepach/notHumpback-M0", - "developer": "Alepach", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.235 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3552 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1119 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Alepach/notHumpback-M1-v2/35f11d5e-88c4-4a95-8d06-a40bee648b00.json b/data/hfopenllm_v2/Alepach/notHumpback-M1-v2/35f11d5e-88c4-4a95-8d06-a40bee648b00.json deleted file mode 100644 index 63b777f1c..000000000 --- a/data/hfopenllm_v2/Alepach/notHumpback-M1-v2/35f11d5e-88c4-4a95-8d06-a40bee648b00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Alepach_notHumpback-M1-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "notHumpback-M1-v2", - "id": "Alepach/notHumpback-M1-v2", - "developer": "Alepach", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.2776 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1119 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Alepach/notHumpback-M1/ba1193c0-42b8-487d-b9fd-ddbc1fd15359.json b/data/hfopenllm_v2/Alepach/notHumpback-M1/ba1193c0-42b8-487d-b9fd-ddbc1fd15359.json deleted file mode 100644 index 80d910c4c..000000000 --- a/data/hfopenllm_v2/Alepach/notHumpback-M1/ba1193c0-42b8-487d-b9fd-ddbc1fd15359.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Alepach_notHumpback-M1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "notHumpback-M1", - "id": "Alepach/notHumpback-M1", - "developer": "Alepach", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2882 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2374 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1091 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Alibaba-NLP/gte-Qwen2-7B-instruct/95733620-e1e7-4442-b9c3-a699165df5e7.json b/data/hfopenllm_v2/Alibaba-NLP/gte-Qwen2-7B-instruct/95733620-e1e7-4442-b9c3-a699165df5e7.json deleted file mode 100644 index 2205af9c9..000000000 --- a/data/hfopenllm_v2/Alibaba-NLP/gte-Qwen2-7B-instruct/95733620-e1e7-4442-b9c3-a699165df5e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Alibaba-NLP_gte-Qwen2-7B-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gte-Qwen2-7B-instruct", - "id": "Alibaba-NLP/gte-Qwen2-7B-instruct", - "developer": "Alibaba-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2255 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3559 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3321 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Alsebay/Qwen2.5-7B-test-novelist/cacfce0d-f5f1-4101-8065-f5f02eaab1fb.json b/data/hfopenllm_v2/Alsebay/Qwen2.5-7B-test-novelist/cacfce0d-f5f1-4101-8065-f5f02eaab1fb.json deleted file mode 100644 index ea8074956..000000000 --- a/data/hfopenllm_v2/Alsebay/Qwen2.5-7B-test-novelist/cacfce0d-f5f1-4101-8065-f5f02eaab1fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Alsebay_Qwen2.5-7B-test-novelist/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-test-novelist", - "id": "Alsebay/Qwen2.5-7B-test-novelist", - "developer": "Alsebay", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5151 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4749 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Amaorynho/BBAI2006/72be5537-198a-43e9-9840-a803083158d3.json b/data/hfopenllm_v2/Amaorynho/BBAI2006/72be5537-198a-43e9-9840-a803083158d3.json deleted file mode 100644 index 0d3a4cdd2..000000000 --- a/data/hfopenllm_v2/Amaorynho/BBAI2006/72be5537-198a-43e9-9840-a803083158d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Amaorynho_BBAI2006/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI2006", - "id": "Amaorynho/BBAI2006", - "developer": "Amaorynho", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.09 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1467 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2704 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3605 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Amaorynho/BBAI270V4/2e9a3443-970d-4f37-a356-277a11c81754.json b/data/hfopenllm_v2/Amaorynho/BBAI270V4/2e9a3443-970d-4f37-a356-277a11c81754.json deleted file mode 100644 index 2399e7c15..000000000 --- a/data/hfopenllm_v2/Amaorynho/BBAI270V4/2e9a3443-970d-4f37-a356-277a11c81754.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Amaorynho_BBAI270V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI270V4", - "id": "Amaorynho/BBAI270V4", - "developer": "Amaorynho", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3071 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Amaorynho/BBAIIFEV1/1188402f-aa1c-4306-b031-c92ff0a5dd64.json b/data/hfopenllm_v2/Amaorynho/BBAIIFEV1/1188402f-aa1c-4306-b031-c92ff0a5dd64.json deleted file mode 100644 index 1e9f99354..000000000 --- a/data/hfopenllm_v2/Amaorynho/BBAIIFEV1/1188402f-aa1c-4306-b031-c92ff0a5dd64.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Amaorynho_BBAIIFEV1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAIIFEV1", - "id": "Amaorynho/BBAIIFEV1", - "developer": "Amaorynho", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8047 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Amaorynho/BBAI_375/ee2f567a-6403-46d5-9a6b-bd029f81d660.json b/data/hfopenllm_v2/Amaorynho/BBAI_375/ee2f567a-6403-46d5-9a6b-bd029f81d660.json deleted file mode 100644 index e92063a97..000000000 --- a/data/hfopenllm_v2/Amaorynho/BBAI_375/ee2f567a-6403-46d5-9a6b-bd029f81d660.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Amaorynho_BBAI_375/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_375", - "id": "Amaorynho/BBAI_375", - "developer": "Amaorynho", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.09 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1467 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2704 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3605 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Amu/t1-1.5B/d809fdff-f5ff-44f5-afc7-7e8af9ce2f93.json b/data/hfopenllm_v2/Amu/t1-1.5B/d809fdff-f5ff-44f5-afc7-7e8af9ce2f93.json deleted file mode 100644 index c7e1777b9..000000000 --- a/data/hfopenllm_v2/Amu/t1-1.5B/d809fdff-f5ff-44f5-afc7-7e8af9ce2f93.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Amu_t1-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "t1-1.5B", - "id": "Amu/t1-1.5B", - "developer": "Amu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4008 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3517 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2566 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Amu/t1-3B/87d66efc-173f-4c14-b76c-d8b7e00d575d.json b/data/hfopenllm_v2/Amu/t1-3B/87d66efc-173f-4c14-b76c-d8b7e00d575d.json deleted file mode 100644 index 8c8a648ec..000000000 --- a/data/hfopenllm_v2/Amu/t1-3B/87d66efc-173f-4c14-b76c-d8b7e00d575d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Amu_t1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "t1-3B", - 
"id": "Amu/t1-3B", - "developer": "Amu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3999 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ArliAI/ArliAI-RPMax-12B-v1.1/47f62378-c3cc-408f-a0d1-71eb3f522f57.json b/data/hfopenllm_v2/ArliAI/ArliAI-RPMax-12B-v1.1/47f62378-c3cc-408f-a0d1-71eb3f522f57.json deleted file mode 100644 index 6c27ae339..000000000 --- a/data/hfopenllm_v2/ArliAI/ArliAI-RPMax-12B-v1.1/47f62378-c3cc-408f-a0d1-71eb3f522f57.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ArliAI_ArliAI-RPMax-12B-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ArliAI-RPMax-12B-v1.1", - "id": "ArliAI/ArliAI-RPMax-12B-v1.1", - "developer": "ArliAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5349 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4752 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1/dba8c12c-388d-4f8b-8ce8-83acfc4920c7.json b/data/hfopenllm_v2/ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1/dba8c12c-388d-4f8b-8ce8-83acfc4920c7.json deleted file mode 100644 index c8b199daf..000000000 --- a/data/hfopenllm_v2/ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1/dba8c12c-388d-4f8b-8ce8-83acfc4920c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ArliAI_Llama-3.1-8B-ArliAI-RPMax-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-ArliAI-RPMax-v1.1", - "id": "ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1", - "developer": "ArliAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6359 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5016 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3577 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Arthur-LAGACHERIE/Precis-1B-Instruct/e4087285-1d1a-465e-ac88-91310e939710.json b/data/hfopenllm_v2/Arthur-LAGACHERIE/Precis-1B-Instruct/e4087285-1d1a-465e-ac88-91310e939710.json deleted file mode 100644 index d6fc53fc3..000000000 --- a/data/hfopenllm_v2/Arthur-LAGACHERIE/Precis-1B-Instruct/e4087285-1d1a-465e-ac88-91310e939710.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Arthur-LAGACHERIE_Precis-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Precis-1B-Instruct", - "id": "Arthur-LAGACHERIE/Precis-1B-Instruct", - "developer": "Arthur-LAGACHERIE", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1426 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Artples/L-MChat-7b/09f189d9-74fd-47bb-b5fb-7994cba56ae2.json b/data/hfopenllm_v2/Artples/L-MChat-7b/09f189d9-74fd-47bb-b5fb-7994cba56ae2.json deleted file mode 100644 index 68e074cae..000000000 --- a/data/hfopenllm_v2/Artples/L-MChat-7b/09f189d9-74fd-47bb-b5fb-7994cba56ae2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Artples_L-MChat-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L-MChat-7b", - "id": "Artples/L-MChat-7b", - "developer": "Artples", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5297 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.46 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Artples/L-MChat-Small/5754c262-6ddf-4f54-9722-22ff20a8d76f.json b/data/hfopenllm_v2/Artples/L-MChat-Small/5754c262-6ddf-4f54-9722-22ff20a8d76f.json deleted file mode 100644 index fd60828c3..000000000 --- a/data/hfopenllm_v2/Artples/L-MChat-Small/5754c262-6ddf-4f54-9722-22ff20a8d76f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Artples_L-MChat-Small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L-MChat-Small", - "id": "Artples/L-MChat-Small", - "developer": "Artples", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3287 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4823 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3696 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2464 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aryanne/QwentileSwap/cc1bd811-ec88-4514-8b47-4140ded4f03d.json b/data/hfopenllm_v2/Aryanne/QwentileSwap/cc1bd811-ec88-4514-8b47-4140ded4f03d.json deleted file mode 100644 index cd69fff0a..000000000 --- a/data/hfopenllm_v2/Aryanne/QwentileSwap/cc1bd811-ec88-4514-8b47-4140ded4f03d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aryanne_QwentileSwap/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwentileSwap", - "id": "Aryanne/QwentileSwap", - "developer": "Aryanne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7008 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4222 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.464 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aryanne/SHBA/3f08155d-8551-4472-86fe-7988cd6df78b.json b/data/hfopenllm_v2/Aryanne/SHBA/3f08155d-8551-4472-86fe-7988cd6df78b.json deleted file mode 100644 index 5001e5134..000000000 --- a/data/hfopenllm_v2/Aryanne/SHBA/3f08155d-8551-4472-86fe-7988cd6df78b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aryanne_SHBA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SHBA", - "id": "Aryanne/SHBA", - "developer": "Aryanne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7817 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5233 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3892 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aryanne/SuperHeart/339e12fb-b4a4-4a4b-bb40-899b4ad833f9.json b/data/hfopenllm_v2/Aryanne/SuperHeart/339e12fb-b4a4-4a4b-bb40-899b4ad833f9.json deleted file mode 100644 index 9df9c1198..000000000 --- a/data/hfopenllm_v2/Aryanne/SuperHeart/339e12fb-b4a4-4a4b-bb40-899b4ad833f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aryanne_SuperHeart/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SuperHeart", - "id": "Aryanne/SuperHeart", - "developer": "Aryanne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5192 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3912 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/4fd60e9c-5c90-492a-b24d-7ca6d1e91eae.json b/data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/4fd60e9c-5c90-492a-b24d-7ca6d1e91eae.json deleted file mode 100644 index c1dcb320d..000000000 --- a/data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/4fd60e9c-5c90-492a-b24d-7ca6d1e91eae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AtAndDev_Qwen2.5-1.5B-continuous-learnt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1.5B-continuous-learnt", - "id": "AtAndDev/Qwen2.5-1.5B-continuous-learnt", - "developer": "AtAndDev", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4605 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4258 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3636 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/7f8d935e-3782-4769-8bd0-ee8a0ce91cd6.json b/data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/7f8d935e-3782-4769-8bd0-ee8a0ce91cd6.json deleted file mode 100644 index cd9fc36f7..000000000 --- a/data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/7f8d935e-3782-4769-8bd0-ee8a0ce91cd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AtAndDev_Qwen2.5-1.5B-continuous-learnt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1.5B-continuous-learnt", - "id": "AtAndDev/Qwen2.5-1.5B-continuous-learnt", - "developer": "AtAndDev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4511 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1473 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3623 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2806 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Ateron/Glowing-Forest-12B/6fa07e60-9f82-4abc-aa45-4dfc0bcf9b8d.json b/data/hfopenllm_v2/Ateron/Glowing-Forest-12B/6fa07e60-9f82-4abc-aa45-4dfc0bcf9b8d.json deleted file mode 100644 index d15a6561f..000000000 --- a/data/hfopenllm_v2/Ateron/Glowing-Forest-12B/6fa07e60-9f82-4abc-aa45-4dfc0bcf9b8d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ateron_Glowing-Forest-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Glowing-Forest-12B", - "id": "Ateron/Glowing-Forest-12B", - "developer": "Ateron", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3592 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3718 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ateron/Lotus-Magpic/99a0022b-3fe7-4612-9cbb-cf082c1f6b70.json b/data/hfopenllm_v2/Ateron/Lotus-Magpic/99a0022b-3fe7-4612-9cbb-cf082c1f6b70.json deleted file mode 100644 index 63c101443..000000000 --- 
a/data/hfopenllm_v2/Ateron/Lotus-Magpic/99a0022b-3fe7-4612-9cbb-cf082c1f6b70.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ateron_Lotus-Magpic/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lotus-Magpic", - "id": "Ateron/Lotus-Magpic", - "developer": "Ateron", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6286 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3491 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ateron/Way_of_MagPicaro/b1153714-d6fe-4ff9-ab8c-85b677d57f8f.json b/data/hfopenllm_v2/Ateron/Way_of_MagPicaro/b1153714-d6fe-4ff9-ab8c-85b677d57f8f.json deleted file mode 100644 index 7a1c07042..000000000 --- a/data/hfopenllm_v2/Ateron/Way_of_MagPicaro/b1153714-d6fe-4ff9-ab8c-85b677d57f8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ateron_Way_of_MagPicaro/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Way_of_MagPicaro", - "id": "Ateron/Way_of_MagPicaro", - "developer": "Ateron", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2637 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5427 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3536 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AuraIndustries/Aura-4B/c3d39b6c-02af-410d-8a5c-224495b04572.json b/data/hfopenllm_v2/AuraIndustries/Aura-4B/c3d39b6c-02af-410d-8a5c-224495b04572.json deleted file mode 100644 index 259ade02b..000000000 --- a/data/hfopenllm_v2/AuraIndustries/Aura-4B/c3d39b6c-02af-410d-8a5c-224495b04572.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AuraIndustries_Aura-4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aura-4B", - "id": "AuraIndustries/Aura-4B", - "developer": "AuraIndustries", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.513 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2706 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AuraIndustries/Aura-8B/0426fcba-3db4-492d-b622-e34ab8d3fc8f.json b/data/hfopenllm_v2/AuraIndustries/Aura-8B/0426fcba-3db4-492d-b622-e34ab8d3fc8f.json deleted file mode 100644 index 2bec15124..000000000 --- a/data/hfopenllm_v2/AuraIndustries/Aura-8B/0426fcba-3db4-492d-b622-e34ab8d3fc8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AuraIndustries_Aura-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aura-8B", - "id": "AuraIndustries/Aura-8B", - "developer": "AuraIndustries", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7205 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5131 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B-v2/aa099cfe-ac9a-42dd-8357-f4d8115133ca.json b/data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B-v2/aa099cfe-ac9a-42dd-8357-f4d8115133ca.json deleted file mode 100644 index 2a673ad65..000000000 --- a/data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B-v2/aa099cfe-ac9a-42dd-8357-f4d8115133ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AuraIndustries_Aura-MoE-2x4B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aura-MoE-2x4B-v2", - "id": "AuraIndustries/Aura-MoE-2x4B-v2", - "developer": "AuraIndustries", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 7.231 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.261 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B/ccbc8a5e-9a97-452a-b023-cc996ffe31f1.json b/data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B/ccbc8a5e-9a97-452a-b023-cc996ffe31f1.json deleted file mode 100644 index 460a41061..000000000 --- a/data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B/ccbc8a5e-9a97-452a-b023-cc996ffe31f1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AuraIndustries_Aura-MoE-2x4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aura-MoE-2x4B", - "id": "AuraIndustries/Aura-MoE-2x4B", - "developer": "AuraIndustries", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 7.231 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4601 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4085 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.265 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aurel9/testmerge-7b/b359a7a3-cf2c-4952-b308-333672dadcec.json b/data/hfopenllm_v2/Aurel9/testmerge-7b/b359a7a3-cf2c-4952-b308-333672dadcec.json deleted file mode 100644 index a8574b05b..000000000 --- a/data/hfopenllm_v2/Aurel9/testmerge-7b/b359a7a3-cf2c-4952-b308-333672dadcec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aurel9_testmerge-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "testmerge-7b", - "id": "Aurel9/testmerge-7b", - "developer": "Aurel9", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4659 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3053 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ayush-Singh/Llama1B-sft-2/0864d5cf-d6fe-42bc-9059-9f2e5ff06b60.json b/data/hfopenllm_v2/Ayush-Singh/Llama1B-sft-2/0864d5cf-d6fe-42bc-9059-9f2e5ff06b60.json deleted file mode 100644 index cf11b9c2a..000000000 --- a/data/hfopenllm_v2/Ayush-Singh/Llama1B-sft-2/0864d5cf-d6fe-42bc-9059-9f2e5ff06b60.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ayush-Singh_Llama1B-sft-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama1B-sft-2", - "id": "Ayush-Singh/Llama1B-sft-2", - "developer": "Ayush-Singh", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1374 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2834 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { 
- "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3552 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Azure99/Blossom-V6-14B/e6ef2559-8a63-43e3-a60b-0d2b7256ad3d.json b/data/hfopenllm_v2/Azure99/Blossom-V6-14B/e6ef2559-8a63-43e3-a60b-0d2b7256ad3d.json deleted file mode 100644 index 374b26fce..000000000 --- a/data/hfopenllm_v2/Azure99/Blossom-V6-14B/e6ef2559-8a63-43e3-a60b-0d2b7256ad3d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Azure99_Blossom-V6-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Blossom-V6-14B", - "id": "Azure99/Blossom-V6-14B", - "developer": "Azure99", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5069 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4035 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4544 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Azure99/Blossom-V6-7B/45d019ab-b23c-4fc3-baf5-d57576e9945c.json b/data/hfopenllm_v2/Azure99/Blossom-V6-7B/45d019ab-b23c-4fc3-baf5-d57576e9945c.json deleted file mode 100644 index e807205a9..000000000 --- a/data/hfopenllm_v2/Azure99/Blossom-V6-7B/45d019ab-b23c-4fc3-baf5-d57576e9945c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Azure99_Blossom-V6-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Blossom-V6-7B", - "id": "Azure99/Blossom-V6-7B", - "developer": "Azure99", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5538 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4974 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Azure99/blossom-v5-32b/e3cd7c32-e5a1-4cd6-a9dc-95364a8abe75.json b/data/hfopenllm_v2/Azure99/blossom-v5-32b/e3cd7c32-e5a1-4cd6-a9dc-95364a8abe75.json deleted file mode 100644 index c1499a18c..000000000 --- a/data/hfopenllm_v2/Azure99/blossom-v5-32b/e3cd7c32-e5a1-4cd6-a9dc-95364a8abe75.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Azure99_blossom-v5-32b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "blossom-v5-32b", - "id": "Azure99/blossom-v5-32b", - "developer": "Azure99", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.512 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1866 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Azure99/blossom-v5-llama3-8b/9be442e8-4b77-43e0-a981-887338e59b78.json b/data/hfopenllm_v2/Azure99/blossom-v5-llama3-8b/9be442e8-4b77-43e0-a981-887338e59b78.json deleted file mode 100644 index 6b1528202..000000000 --- a/data/hfopenllm_v2/Azure99/blossom-v5-llama3-8b/9be442e8-4b77-43e0-a981-887338e59b78.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Azure99_blossom-v5-llama3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "blossom-v5-llama3-8b", - "id": "Azure99/blossom-v5-llama3-8b", - "developer": "Azure99", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2206 - } - } - ] -} 
\ No newline at end of file diff --git a/data/hfopenllm_v2/Azure99/blossom-v5.1-34b/a07b6326-f393-490e-b696-d8b45f593d4b.json b/data/hfopenllm_v2/Azure99/blossom-v5.1-34b/a07b6326-f393-490e-b696-d8b45f593d4b.json deleted file mode 100644 index 108854be5..000000000 --- a/data/hfopenllm_v2/Azure99/blossom-v5.1-34b/a07b6326-f393-490e-b696-d8b45f593d4b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Azure99_blossom-v5.1-34b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "blossom-v5.1-34b", - "id": "Azure99/blossom-v5.1-34b", - "developer": "Azure99", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2591 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4558 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Azure99/blossom-v5.1-9b/b66ed91a-98d5-407c-9896-9c2e2a31e9da.json b/data/hfopenllm_v2/Azure99/blossom-v5.1-9b/b66ed91a-98d5-407c-9896-9c2e2a31e9da.json deleted file mode 100644 index 812870f1b..000000000 --- 
a/data/hfopenllm_v2/Azure99/blossom-v5.1-9b/b66ed91a-98d5-407c-9896-9c2e2a31e9da.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Azure99_blossom-v5.1-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "blossom-v5.1-9b", - "id": "Azure99/blossom-v5.1-9b", - "developer": "Azure99", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5086 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2122 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Gemma2-9B-IT-Simpo-Infinity-Preference/9c70921d-956b-4727-9201-1addbd01bb8b.json b/data/hfopenllm_v2/BAAI/Gemma2-9B-IT-Simpo-Infinity-Preference/9c70921d-956b-4727-9201-1addbd01bb8b.json deleted file mode 100644 index 212643c5b..000000000 --- a/data/hfopenllm_v2/BAAI/Gemma2-9B-IT-Simpo-Infinity-Preference/9c70921d-956b-4727-9201-1addbd01bb8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/BAAI_Gemma2-9B-IT-Simpo-Infinity-Preference/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2-9B-IT-Simpo-Infinity-Preference", - "id": "BAAI/Gemma2-9B-IT-Simpo-Infinity-Preference", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3176 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5979 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Llama3-70B/4ba6d51e-314a-4db4-9552-568a4093e01a.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Llama3-70B/4ba6d51e-314a-4db4-9552-568a4093e01a.json deleted file mode 100644 index 3e94d0dd0..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Llama3-70B/4ba6d51e-314a-4db4-9552-568a4093e01a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-3M-0613-Llama3-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF 
Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-3M-0613-Llama3-70B", - "id": "BAAI/Infinity-Instruct-3M-0613-Llama3-70B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6642 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4523 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Mistral-7B/835f5056-56bf-4a6c-886f-fbe6f263ac07.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Mistral-7B/835f5056-56bf-4a6c-886f-fbe6f263ac07.json deleted file mode 100644 index 9053fed79..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Mistral-7B/835f5056-56bf-4a6c-886f-fbe6f263ac07.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-3M-0613-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Infinity-Instruct-3M-0613-Mistral-7B", - "id": "BAAI/Infinity-Instruct-3M-0613-Mistral-7B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4958 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3161 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-70B/c2a63afa-9d25-41dc-b25f-848f5a640501.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-70B/c2a63afa-9d25-41dc-b25f-848f5a640501.json deleted file mode 100644 index 090eaf46c..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-70B/c2a63afa-9d25-41dc-b25f-848f5a640501.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-3M-0625-Llama3-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-3M-0625-Llama3-70B", - "id": "BAAI/Infinity-Instruct-3M-0625-Llama3-70B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": 
{ - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7442 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2251 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4617 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4586 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-8B/f64f9d24-e448-4bb6-89c3-edb66499bac9.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-8B/f64f9d24-e448-4bb6-89c3-edb66499bac9.json deleted file mode 100644 index 4f8d75121..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-8B/f64f9d24-e448-4bb6-89c3-edb66499bac9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-3M-0625-Llama3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-3M-0625-Llama3-8B", - "id": "BAAI/Infinity-Instruct-3M-0625-Llama3-8B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0884 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3712 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3252 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Mistral-7B/2de14bfb-844a-4711-815e-8f63487a78fd.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Mistral-7B/2de14bfb-844a-4711-815e-8f63487a78fd.json deleted file mode 100644 index ea49688f6..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Mistral-7B/2de14bfb-844a-4711-815e-8f63487a78fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-3M-0625-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-3M-0625-Mistral-7B", - "id": "BAAI/Infinity-Instruct-3M-0625-Mistral-7B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5867 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4272 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Qwen2-7B/f953e0e2-ddca-42a2-a0f6-752a137bc6b5.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Qwen2-7B/f953e0e2-ddca-42a2-a0f6-752a137bc6b5.json deleted file mode 100644 index ff2b44e64..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Qwen2-7B/f953e0e2-ddca-42a2-a0f6-752a137bc6b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-3M-0625-Qwen2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-3M-0625-Qwen2-7B", - "id": "BAAI/Infinity-Instruct-3M-0625-Qwen2-7B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5554 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5346 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3888 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Yi-1.5-9B/98187b98-0cc8-4756-9cb7-c53deb998f90.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Yi-1.5-9B/98187b98-0cc8-4756-9cb7-c53deb998f90.json deleted file mode 100644 index 5553e65e0..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Yi-1.5-9B/98187b98-0cc8-4756-9cb7-c53deb998f90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-3M-0625-Yi-1.5-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-3M-0625-Yi-1.5-9B", - "id": "BAAI/Infinity-Instruct-3M-0625-Yi-1.5-9B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5186 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-Llama3_1-8B/8c79c60d-ebf4-4409-be4f-928a54cedd1d.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-Llama3_1-8B/8c79c60d-ebf4-4409-be4f-928a54cedd1d.json deleted file mode 100644 index de7d61cf3..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-Llama3_1-8B/8c79c60d-ebf4-4409-be4f-928a54cedd1d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-7M-0729-Llama3_1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-7M-0729-Llama3_1-8B", - "id": "BAAI/Infinity-Instruct-7M-0729-Llama3_1-8B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6132 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5077 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-mistral-7B/5d5cebeb-faf0-4fdf-8749-6307080e82f2.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-mistral-7B/5d5cebeb-faf0-4fdf-8749-6307080e82f2.json deleted file mode 100644 index 27fce40f9..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-mistral-7B/5d5cebeb-faf0-4fdf-8749-6307080e82f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-7M-0729-mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-7M-0729-mistral-7B", - "id": "BAAI/Infinity-Instruct-7M-0729-mistral-7B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4964 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, 
- "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3274 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-70B/e926ce8f-45bb-4f3d-b579-ecadb3df6468.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-70B/e926ce8f-45bb-4f3d-b579-ecadb3df6468.json deleted file mode 100644 index c05f8ef93..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-70B/e926ce8f-45bb-4f3d-b579-ecadb3df6468.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-7M-Gen-Llama3_1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-7M-Gen-Llama3_1-70B", - "id": "BAAI/Infinity-Instruct-7M-Gen-Llama3_1-70B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7335 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6695 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2523 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4607 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-8B/070609d6-5f41-4712-9ad7-e215b1a6bb81.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-8B/070609d6-5f41-4712-9ad7-e215b1a6bb81.json deleted file mode 100644 index e21c1ef5d..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-8B/070609d6-5f41-4712-9ad7-e215b1a6bb81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-7M-Gen-Llama3_1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-7M-Gen-Llama3_1-8B", - "id": "BAAI/Infinity-Instruct-7M-Gen-Llama3_1-8B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6132 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5077 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-mistral-7B/8d2909c7-37f2-4198-a1e2-4bf2ebc1444d.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-mistral-7B/8d2909c7-37f2-4198-a1e2-4bf2ebc1444d.json deleted file mode 100644 index 4f2f9d23d..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-mistral-7B/8d2909c7-37f2-4198-a1e2-4bf2ebc1444d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-7M-Gen-mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-7M-Gen-mistral-7B", - "id": "BAAI/Infinity-Instruct-7M-Gen-mistral-7B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6147 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4964 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3274 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/OPI-Llama-3.1-8B-Instruct/53587959-25f9-43aa-a34b-f274d8bc93af.json b/data/hfopenllm_v2/BAAI/OPI-Llama-3.1-8B-Instruct/53587959-25f9-43aa-a34b-f274d8bc93af.json deleted file mode 100644 index e76dc81cd..000000000 --- a/data/hfopenllm_v2/BAAI/OPI-Llama-3.1-8B-Instruct/53587959-25f9-43aa-a34b-f274d8bc93af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_OPI-Llama-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OPI-Llama-3.1-8B-Instruct", - "id": "BAAI/OPI-Llama-3.1-8B-Instruct", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2075 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3233 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/Meta-Llama-3-8Bee/2a7f80ed-d404-4c81-b000-b65c83069121.json b/data/hfopenllm_v2/BEE-spoke-data/Meta-Llama-3-8Bee/2a7f80ed-d404-4c81-b000-b65c83069121.json deleted file mode 100644 index 3edbb6f6f..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/Meta-Llama-3-8Bee/2a7f80ed-d404-4c81-b000-b65c83069121.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_Meta-Llama-3-8Bee/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-8Bee", - "id": "BEE-spoke-data/Meta-Llama-3-8Bee", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1951 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3654 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.322 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-101M-GQA/f0983645-4adb-4ddb-bf2f-33480cb7f421.json b/data/hfopenllm_v2/BEE-spoke-data/smol_llama-101M-GQA/f0983645-4adb-4ddb-bf2f-33480cb7f421.json deleted file mode 100644 index 1bb779c1b..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-101M-GQA/f0983645-4adb-4ddb-bf2f-33480cb7f421.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_smol_llama-101M-GQA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smol_llama-101M-GQA", - "id": "BEE-spoke-data/smol_llama-101M-GQA", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.101 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1384 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3018 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1107 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA-fineweb_edu/161dadfe-4983-4f56-8a7d-9b97f1c5a3c7.json b/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA-fineweb_edu/161dadfe-4983-4f56-8a7d-9b97f1c5a3c7.json deleted file mode 100644 index d7af3cb33..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA-fineweb_edu/161dadfe-4983-4f56-8a7d-9b97f1c5a3c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_smol_llama-220M-GQA-fineweb_edu/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smol_llama-220M-GQA-fineweb_edu", - "id": "BEE-spoke-data/smol_llama-220M-GQA-fineweb_edu", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.218 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1988 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2929 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA/694a02f9-4729-4d0b-97ce-80adaef29be2.json b/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA/694a02f9-4729-4d0b-97ce-80adaef29be2.json deleted file mode 100644 index aa2c6a93b..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA/694a02f9-4729-4d0b-97ce-80adaef29be2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_smol_llama-220M-GQA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smol_llama-220M-GQA", - "id": "BEE-spoke-data/smol_llama-220M-GQA", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.218 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2386 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4059 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1149 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-openhermes/0521f51d-22c1-4821-8f04-23c533411668.json 
b/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-openhermes/0521f51d-22c1-4821-8f04-23c533411668.json deleted file mode 100644 index 7df19fae4..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-openhermes/0521f51d-22c1-4821-8f04-23c533411668.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_smol_llama-220M-openhermes/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smol_llama-220M-openhermes", - "id": "BEE-spoke-data/smol_llama-220M-openhermes", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.218 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024/8fdea71b-5e68-4a78-aefc-8a00650464c4.json b/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024/8fdea71b-5e68-4a78-aefc-8a00650464c4.json deleted file 
mode 100644 index 18703f8e9..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024/8fdea71b-5e68-4a78-aefc-8a00650464c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024", - "id": "BEE-spoke-data/tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 0.887 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4393 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1237 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan/e2ba5674-9251-4a4e-9eb8-046c834da400.json b/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan/e2ba5674-9251-4a4e-9eb8-046c834da400.json deleted file mode 100644 index c330cd864..000000000 --- 
a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan/e2ba5674-9251-4a4e-9eb8-046c834da400.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_tFINE-900m-e16-d32-flan/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tFINE-900m-e16-d32-flan", - "id": "BEE-spoke-data/tFINE-900m-e16-d32-flan", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 0.887 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1506 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2332 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-instruct_2e/4caafdb2-3065-40d4-b5a7-9deb41e1d8a7.json b/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-instruct_2e/4caafdb2-3065-40d4-b5a7-9deb41e1d8a7.json deleted file mode 100644 index 2509ffeaa..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-instruct_2e/4caafdb2-3065-40d4-b5a7-9deb41e1d8a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_tFINE-900m-e16-d32-instruct_2e/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tFINE-900m-e16-d32-instruct_2e", - "id": "BEE-spoke-data/tFINE-900m-e16-d32-instruct_2e", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 0.887 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3135 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1237 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-instruct-orpo/886e0b8b-b2dc-434f-a299-50f668006241.json b/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-instruct-orpo/886e0b8b-b2dc-434f-a299-50f668006241.json deleted file mode 100644 index 2061ed658..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-instruct-orpo/886e0b8b-b2dc-434f-a299-50f668006241.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_tFINE-900m-instruct-orpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tFINE-900m-instruct-orpo", - "id": "BEE-spoke-data/tFINE-900m-instruct-orpo", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 0.887 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.133 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3022 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BSC-LT/salamandra-7b-instruct/7a6a9443-f331-4dfa-acf9-6aa30049bade.json b/data/hfopenllm_v2/BSC-LT/salamandra-7b-instruct/7a6a9443-f331-4dfa-acf9-6aa30049bade.json deleted file mode 100644 index 83c4bafbf..000000000 --- a/data/hfopenllm_v2/BSC-LT/salamandra-7b-instruct/7a6a9443-f331-4dfa-acf9-6aa30049bade.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BSC-LT_salamandra-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"salamandra-7b-instruct", - "id": "BSC-LT/salamandra-7b-instruct", - "developer": "BSC-LT", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.768 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2451 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4134 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BSC-LT/salamandra-7b/6d523da4-ec4a-405b-a25d-afc7b1b5aefd.json b/data/hfopenllm_v2/BSC-LT/salamandra-7b/6d523da4-ec4a-405b-a25d-afc7b1b5aefd.json deleted file mode 100644 index c5d56580f..000000000 --- a/data/hfopenllm_v2/BSC-LT/salamandra-7b/6d523da4-ec4a-405b-a25d-afc7b1b5aefd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BSC-LT_salamandra-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "salamandra-7b", - "id": "BSC-LT/salamandra-7b", - "developer": "BSC-LT", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.768 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3517 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1493 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ba2han/Llama-Phi-3_DoRA/cfecfce3-090d-4c2e-826c-03c0c5337e98.json b/data/hfopenllm_v2/Ba2han/Llama-Phi-3_DoRA/cfecfce3-090d-4c2e-826c-03c0c5337e98.json deleted file mode 100644 index a299a3968..000000000 --- a/data/hfopenllm_v2/Ba2han/Llama-Phi-3_DoRA/cfecfce3-090d-4c2e-826c-03c0c5337e98.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ba2han_Llama-Phi-3_DoRA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Phi-3_DoRA", - "id": "Ba2han/Llama-Phi-3_DoRA", - "developer": "Ba2han", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5131 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3915 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Baptiste-HUVELLE-10/LeTriomphant2.2_ECE_iLAB/5aa124dc-4abd-4c5f-b40a-a8d81af922eb.json b/data/hfopenllm_v2/Baptiste-HUVELLE-10/LeTriomphant2.2_ECE_iLAB/5aa124dc-4abd-4c5f-b40a-a8d81af922eb.json deleted file mode 100644 index b23494890..000000000 --- a/data/hfopenllm_v2/Baptiste-HUVELLE-10/LeTriomphant2.2_ECE_iLAB/5aa124dc-4abd-4c5f-b40a-a8d81af922eb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Baptiste-HUVELLE-10_LeTriomphant2.2_ECE_iLAB/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LeTriomphant2.2_ECE_iLAB", - "id": "Baptiste-HUVELLE-10/LeTriomphant2.2_ECE_iLAB", - "developer": "Baptiste-HUVELLE-10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4449 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4626 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5851 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BenevolenceMessiah/Qwen2.5-72B-2x-Instruct-TIES-v1.0/ec91b122-c8f5-4dfb-94fd-336ef78c3e14.json b/data/hfopenllm_v2/BenevolenceMessiah/Qwen2.5-72B-2x-Instruct-TIES-v1.0/ec91b122-c8f5-4dfb-94fd-336ef78c3e14.json deleted file mode 100644 index 7d796c600..000000000 --- a/data/hfopenllm_v2/BenevolenceMessiah/Qwen2.5-72B-2x-Instruct-TIES-v1.0/ec91b122-c8f5-4dfb-94fd-336ef78c3e14.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BenevolenceMessiah_Qwen2.5-72B-2x-Instruct-TIES-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-72B-2x-Instruct-TIES-v1.0", - "id": "BenevolenceMessiah/Qwen2.5-72B-2x-Instruct-TIES-v1.0", - "developer": "BenevolenceMessiah", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.7 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7273 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5628 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BenevolenceMessiah/Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0/114f246a-6049-40bf-ad86-9a822d13cf74.json b/data/hfopenllm_v2/BenevolenceMessiah/Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0/114f246a-6049-40bf-ad86-9a822d13cf74.json deleted file mode 100644 index 49b763b11..000000000 --- a/data/hfopenllm_v2/BenevolenceMessiah/Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0/114f246a-6049-40bf-ad86-9a822d13cf74.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BenevolenceMessiah_Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0", - "id": "BenevolenceMessiah/Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0", - "developer": "BenevolenceMessiah", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 28.309 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4909 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.268 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/Bloslain-8B-v0.2/82d28a3a-44f2-463f-a1b8-7e9079ec47b7.json b/data/hfopenllm_v2/BlackBeenie/Bloslain-8B-v0.2/82d28a3a-44f2-463f-a1b8-7e9079ec47b7.json deleted file mode 100644 index 40f3caa9f..000000000 --- a/data/hfopenllm_v2/BlackBeenie/Bloslain-8B-v0.2/82d28a3a-44f2-463f-a1b8-7e9079ec47b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_Bloslain-8B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bloslain-8B-v0.2", - "id": "BlackBeenie/Bloslain-8B-v0.2", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5023 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4076 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3654 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-OpenO1-SFT-v0.1/ed3c1349-a154-4866-890f-2b115ffaf127.json b/data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-OpenO1-SFT-v0.1/ed3c1349-a154-4866-890f-2b115ffaf127.json deleted file mode 100644 index ce32e3e07..000000000 --- a/data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-OpenO1-SFT-v0.1/ed3c1349-a154-4866-890f-2b115ffaf127.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_Llama-3.1-8B-OpenO1-SFT-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-OpenO1-SFT-v0.1", - "id": "BlackBeenie/Llama-3.1-8B-OpenO1-SFT-v0.1", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4787 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1526 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3492 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-pythonic-passthrough-merge/47942c55-5ddb-4fda-9c5b-34676ae2046a.json b/data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-pythonic-passthrough-merge/47942c55-5ddb-4fda-9c5b-34676ae2046a.json deleted file mode 100644 index 3447f8e90..000000000 --- a/data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-pythonic-passthrough-merge/47942c55-5ddb-4fda-9c5b-34676ae2046a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_Llama-3.1-8B-pythonic-passthrough-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-pythonic-passthrough-merge", - "id": "BlackBeenie/Llama-3.1-8B-pythonic-passthrough-merge", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 20.245 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2316 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3454 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1332 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/Neos-Gemma-2-9b/d860210b-4c8a-4d15-ad3a-4e39905f91ed.json b/data/hfopenllm_v2/BlackBeenie/Neos-Gemma-2-9b/d860210b-4c8a-4d15-ad3a-4e39905f91ed.json deleted file mode 100644 index 7ae80464f..000000000 --- a/data/hfopenllm_v2/BlackBeenie/Neos-Gemma-2-9b/d860210b-4c8a-4d15-ad3a-4e39905f91ed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_Neos-Gemma-2-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neos-Gemma-2-9b", - "id": "BlackBeenie/Neos-Gemma-2-9b", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5876 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5503 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-8B/d137f429-2b65-4ee9-9d66-3f619b270fad.json b/data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-8B/d137f429-2b65-4ee9-9d66-3f619b270fad.json deleted file mode 100644 index d7f22bbd1..000000000 --- a/data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-8B/d137f429-2b65-4ee9-9d66-3f619b270fad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_Neos-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neos-Llama-3.1-8B", - "id": "BlackBeenie/Neos-Llama-3.1-8B", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4944 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4425 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-base/1da10dfe-b0a3-4cb8-aaa3-e16d48f3aab4.json b/data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-base/1da10dfe-b0a3-4cb8-aaa3-e16d48f3aab4.json deleted file mode 100644 index 4db1b8828..000000000 --- a/data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-base/1da10dfe-b0a3-4cb8-aaa3-e16d48f3aab4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_Neos-Llama-3.1-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neos-Llama-3.1-base", - "id": "BlackBeenie/Neos-Llama-3.1-base", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.65 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1751 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2374 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3499 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/Neos-Phi-3-14B-v0.1/6156a0d2-4c32-40b2-9624-ef0c7a6a95bb.json b/data/hfopenllm_v2/BlackBeenie/Neos-Phi-3-14B-v0.1/6156a0d2-4c32-40b2-9624-ef0c7a6a95bb.json deleted file mode 100644 index 5db8afdb2..000000000 --- a/data/hfopenllm_v2/BlackBeenie/Neos-Phi-3-14B-v0.1/6156a0d2-4c32-40b2-9624-ef0c7a6a95bb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_Neos-Phi-3-14B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neos-Phi-3-14B-v0.1", - "id": "BlackBeenie/Neos-Phi-3-14B-v0.1", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6212 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4564 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/llama-3-luminous-merged/676342d2-f37a-4b6a-967d-3ac750243470.json b/data/hfopenllm_v2/BlackBeenie/llama-3-luminous-merged/676342d2-f37a-4b6a-967d-3ac750243470.json deleted file mode 100644 index 2c979d2a9..000000000 --- a/data/hfopenllm_v2/BlackBeenie/llama-3-luminous-merged/676342d2-f37a-4b6a-967d-3ac750243470.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_llama-3-luminous-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-luminous-merged", - "id": "BlackBeenie/llama-3-luminous-merged", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4149 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3773 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/BlackBeenie/llama-3.1-8B-Galore-openassistant-guanaco/950b7108-0192-4875-b4e9-c3e43ab71e08.json b/data/hfopenllm_v2/BlackBeenie/llama-3.1-8B-Galore-openassistant-guanaco/950b7108-0192-4875-b4e9-c3e43ab71e08.json deleted file mode 100644 index 776b081c0..000000000 --- a/data/hfopenllm_v2/BlackBeenie/llama-3.1-8B-Galore-openassistant-guanaco/950b7108-0192-4875-b4e9-c3e43ab71e08.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_llama-3.1-8B-Galore-openassistant-guanaco/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3.1-8B-Galore-openassistant-guanaco", - "id": "BlackBeenie/llama-3.1-8B-Galore-openassistant-guanaco", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2635 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5213 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4406 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3206 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Bllossom/llama-3.2-Korean-Bllossom-AICA-5B/85672df5-2f35-43be-8648-9937c66872dc.json 
b/data/hfopenllm_v2/Bllossom/llama-3.2-Korean-Bllossom-AICA-5B/85672df5-2f35-43be-8648-9937c66872dc.json deleted file mode 100644 index 2f5bfbf50..000000000 --- a/data/hfopenllm_v2/Bllossom/llama-3.2-Korean-Bllossom-AICA-5B/85672df5-2f35-43be-8648-9937c66872dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Bllossom_llama-3.2-Korean-Bllossom-AICA-5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3.2-Korean-Bllossom-AICA-5B", - "id": "Bllossom/llama-3.2-Korean-Bllossom-AICA-5B", - "developer": "Bllossom", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 5.199 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5172 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BoltMonkey/DreadMix/051c5642-3b23-4879-9d10-639d1b3127d7.json b/data/hfopenllm_v2/BoltMonkey/DreadMix/051c5642-3b23-4879-9d10-639d1b3127d7.json deleted file mode 100644 index 7ccd4a48e..000000000 --- 
a/data/hfopenllm_v2/BoltMonkey/DreadMix/051c5642-3b23-4879-9d10-639d1b3127d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BoltMonkey_DreadMix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DreadMix", - "id": "BoltMonkey/DreadMix", - "developer": "BoltMonkey", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7095 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4212 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/2acf0d12-7e0c-46dc-a079-ebc48a8818d3.json b/data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/2acf0d12-7e0c-46dc-a079-ebc48a8818d3.json deleted file mode 100644 index 7730b26d1..000000000 --- a/data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/2acf0d12-7e0c-46dc-a079-ebc48a8818d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/BoltMonkey_NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated", - "id": "BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated", - "developer": "BoltMonkey", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7999 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5152 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/8ce42090-006e-4e08-8d3f-5b1eb0b8da0b.json b/data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/8ce42090-006e-4e08-8d3f-5b1eb0b8da0b.json deleted file mode 100644 index 0ea3c09c9..000000000 --- a/data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/8ce42090-006e-4e08-8d3f-5b1eb0b8da0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/BoltMonkey_NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated", - "id": "BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated", - "developer": "BoltMonkey", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4083 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BoltMonkey/SuperNeuralDreadDevil-8b/703df6c3-dae4-437f-9379-f8c264797adc.json b/data/hfopenllm_v2/BoltMonkey/SuperNeuralDreadDevil-8b/703df6c3-dae4-437f-9379-f8c264797adc.json deleted file mode 100644 index 00d308164..000000000 --- a/data/hfopenllm_v2/BoltMonkey/SuperNeuralDreadDevil-8b/703df6c3-dae4-437f-9379-f8c264797adc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BoltMonkey_SuperNeuralDreadDevil-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SuperNeuralDreadDevil-8b", - "id": "BoltMonkey/SuperNeuralDreadDevil-8b", - "developer": "BoltMonkey", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.771 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5286 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BrainWave-ML/llama3.2-3B-maths-orpo/1e349ad3-d29b-4a4b-97e7-b82055e41b07.json b/data/hfopenllm_v2/BrainWave-ML/llama3.2-3B-maths-orpo/1e349ad3-d29b-4a4b-97e7-b82055e41b07.json deleted file mode 100644 index 4d4697ad4..000000000 --- a/data/hfopenllm_v2/BrainWave-ML/llama3.2-3B-maths-orpo/1e349ad3-d29b-4a4b-97e7-b82055e41b07.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BrainWave-ML_llama3.2-3B-maths-orpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"llama3.2-3B-maths-orpo", - "id": "BrainWave-ML/llama3.2-3B-maths-orpo", - "developer": "BrainWave-ML", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2049 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BramVanroy/GEITje-7B-ultra/8f677a76-932c-4c35-9708-4b723226aa19.json b/data/hfopenllm_v2/BramVanroy/GEITje-7B-ultra/8f677a76-932c-4c35-9708-4b723226aa19.json deleted file mode 100644 index 876af8a9c..000000000 --- a/data/hfopenllm_v2/BramVanroy/GEITje-7B-ultra/8f677a76-932c-4c35-9708-4b723226aa19.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BramVanroy_GEITje-7B-ultra/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GEITje-7B-ultra", - "id": "BramVanroy/GEITje-7B-ultra", - "developer": "BramVanroy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2011 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BramVanroy/fietje-2-chat/ebfe625f-ff1f-45f9-826c-9351ea4134e1.json b/data/hfopenllm_v2/BramVanroy/fietje-2-chat/ebfe625f-ff1f-45f9-826c-9351ea4134e1.json deleted file mode 100644 index 3c12f59b5..000000000 --- a/data/hfopenllm_v2/BramVanroy/fietje-2-chat/ebfe625f-ff1f-45f9-826c-9351ea4134e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BramVanroy_fietje-2-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fietje-2-chat", - "id": "BramVanroy/fietje-2-chat", - "developer": "BramVanroy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "PhiForCausalLM", - "params_billions": 2.775 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2917 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2055 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BramVanroy/fietje-2-instruct/66e6a757-ac22-47f3-82ce-81af45e1d3cf.json b/data/hfopenllm_v2/BramVanroy/fietje-2-instruct/66e6a757-ac22-47f3-82ce-81af45e1d3cf.json deleted file mode 100644 index 667f588bf..000000000 --- a/data/hfopenllm_v2/BramVanroy/fietje-2-instruct/66e6a757-ac22-47f3-82ce-81af45e1d3cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BramVanroy_fietje-2-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fietje-2-instruct", - "id": "BramVanroy/fietje-2-instruct", - "developer": "BramVanroy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "PhiForCausalLM", - "params_billions": 2.775 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.279 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2332 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2104 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BramVanroy/fietje-2/1cd840c7-d432-495c-a3df-af1fa6264259.json b/data/hfopenllm_v2/BramVanroy/fietje-2/1cd840c7-d432-495c-a3df-af1fa6264259.json deleted file mode 100644 index 9d714f7c1..000000000 --- a/data/hfopenllm_v2/BramVanroy/fietje-2/1cd840c7-d432-495c-a3df-af1fa6264259.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BramVanroy_fietje-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fietje-2", - "id": "BramVanroy/fietje-2", - "developer": "BramVanroy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2098 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4036 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH 
Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3696 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1986 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-base/066f520f-9a64-4564-abfc-6435732c3585.json b/data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-base/066f520f-9a64-4564-abfc-6435732c3585.json deleted file mode 100644 index 72e310100..000000000 --- a/data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-base/066f520f-9a64-4564-abfc-6435732c3585.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CYFRAGOVPL_Llama-PLLuM-8B-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-PLLuM-8B-base", - "id": "CYFRAGOVPL/Llama-PLLuM-8B-base", - "developer": "CYFRAGOVPL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2899 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2757 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-chat/aced5181-040a-48c0-bc5f-78d0de3afae8.json b/data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-chat/aced5181-040a-48c0-bc5f-78d0de3afae8.json deleted file mode 100644 index 8c1a7d263..000000000 --- a/data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-chat/aced5181-040a-48c0-bc5f-78d0de3afae8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CYFRAGOVPL_Llama-PLLuM-8B-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-PLLuM-8B-chat", - "id": "CYFRAGOVPL/Llama-PLLuM-8B-chat", - "developer": "CYFRAGOVPL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-base/a4889a38-84d2-4ae1-b8a9-297b4400602d.json b/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-base/a4889a38-84d2-4ae1-b8a9-297b4400602d.json deleted file mode 100644 index ec8ff80a8..000000000 --- a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-base/a4889a38-84d2-4ae1-b8a9-297b4400602d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CYFRAGOVPL_PLLuM-12B-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PLLuM-12B-base", - "id": "CYFRAGOVPL/PLLuM-12B-base", - "developer": "CYFRAGOVPL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2821 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4391 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { 
- "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4142 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.274 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-chat/d540505a-c67b-4b72-a53a-c03aa6f8d3e7.json b/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-chat/d540505a-c67b-4b72-a53a-c03aa6f8d3e7.json deleted file mode 100644 index b607ed69c..000000000 --- a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-chat/d540505a-c67b-4b72-a53a-c03aa6f8d3e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CYFRAGOVPL_PLLuM-12B-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PLLuM-12B-chat", - "id": "CYFRAGOVPL/PLLuM-12B-chat", - "developer": "CYFRAGOVPL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4115 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2872 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-base/9859afee-02ca-4c48-acc8-acfd20c37e4e.json b/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-base/9859afee-02ca-4c48-acc8-acfd20c37e4e.json deleted file mode 100644 index d200117b7..000000000 --- a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-base/9859afee-02ca-4c48-acc8-acfd20c37e4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CYFRAGOVPL_PLLuM-12B-nc-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PLLuM-12B-nc-base", - "id": "CYFRAGOVPL/PLLuM-12B-nc-base", - "developer": "CYFRAGOVPL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2405 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3645 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-chat/e222d12b-c796-4890-a584-cd689bae7ea6.json b/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-chat/e222d12b-c796-4890-a584-cd689bae7ea6.json deleted file mode 100644 index 2eaeab787..000000000 --- a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-chat/e222d12b-c796-4890-a584-cd689bae7ea6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CYFRAGOVPL_PLLuM-12B-nc-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PLLuM-12B-nc-chat", - "id": "CYFRAGOVPL/PLLuM-12B-nc-chat", - "developer": "CYFRAGOVPL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2834 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4576 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2597 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct-2412/c16850f8-0b80-4455-8f38-8ec453cd1d41.json b/data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct-2412/c16850f8-0b80-4455-8f38-8ec453cd1d41.json deleted file mode 100644 index 49e80d94c..000000000 --- a/data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct-2412/c16850f8-0b80-4455-8f38-8ec453cd1d41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CarrotAI_Llama-3.2-Rabbit-Ko-3B-Instruct-2412/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-Rabbit-Ko-3B-Instruct-2412", - "id": "CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct-2412", - "developer": "CarrotAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4782 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct/0d400b0f-cc82-4c86-b600-93a31b133f9d.json 
b/data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct/0d400b0f-cc82-4c86-b600-93a31b133f9d.json deleted file mode 100644 index 727b90c2f..000000000 --- a/data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct/0d400b0f-cc82-4c86-b600-93a31b133f9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CarrotAI_Llama-3.2-Rabbit-Ko-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-Rabbit-Ko-3B-Instruct", - "id": "CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct", - "developer": "CarrotAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4427 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2822 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Casual-Autopsy/L3-Umbral-Mind-RP-v2.0-8B/90f6f8f1-02fc-425a-8499-e9b43ae8ac59.json b/data/hfopenllm_v2/Casual-Autopsy/L3-Umbral-Mind-RP-v2.0-8B/90f6f8f1-02fc-425a-8499-e9b43ae8ac59.json deleted file mode 100644 index ea628405b..000000000 --- 
a/data/hfopenllm_v2/Casual-Autopsy/L3-Umbral-Mind-RP-v2.0-8B/90f6f8f1-02fc-425a-8499-e9b43ae8ac59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Casual-Autopsy_L3-Umbral-Mind-RP-v2.0-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Umbral-Mind-RP-v2.0-8B", - "id": "Casual-Autopsy/L3-Umbral-Mind-RP-v2.0-8B", - "developer": "Casual-Autopsy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7123 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CausalLM/14B/6704d6bc-6d38-4c59-87a4-81d3eacde3b1.json b/data/hfopenllm_v2/CausalLM/14B/6704d6bc-6d38-4c59-87a4-81d3eacde3b1.json deleted file mode 100644 index aaa13fb97..000000000 --- a/data/hfopenllm_v2/CausalLM/14B/6704d6bc-6d38-4c59-87a4-81d3eacde3b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CausalLM_14B/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "14B", - "id": "CausalLM/14B", - "developer": "CausalLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2788 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CausalLM/34b-beta/e8ad6ce4-7efc-499e-a2c9-9e0df898fbb9.json b/data/hfopenllm_v2/CausalLM/34b-beta/e8ad6ce4-7efc-499e-a2c9-9e0df898fbb9.json deleted file mode 100644 index 19e23bb71..000000000 --- a/data/hfopenllm_v2/CausalLM/34b-beta/e8ad6ce4-7efc-499e-a2c9-9e0df898fbb9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CausalLM_34b-beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "34b-beta", - "id": "CausalLM/34b-beta", - "developer": "CausalLM", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5591 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3749 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5325 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CausalLM/preview-1-hf/5e9c1273-536d-4280-8fff-9931f46dc968.json b/data/hfopenllm_v2/CausalLM/preview-1-hf/5e9c1273-536d-4280-8fff-9931f46dc968.json deleted file mode 100644 index 571a2c125..000000000 --- a/data/hfopenllm_v2/CausalLM/preview-1-hf/5e9c1273-536d-4280-8fff-9931f46dc968.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CausalLM_preview-1-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "preview-1-hf", - "id": "CausalLM/preview-1-hf", - "developer": "CausalLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GlmForCausalLM", - "params_billions": 9.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5559 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3597 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Changgil/K2S3-14b-v0.2/460ca160-ac34-4091-ba2d-986b53532b55.json b/data/hfopenllm_v2/Changgil/K2S3-14b-v0.2/460ca160-ac34-4091-ba2d-986b53532b55.json deleted file mode 100644 index c087ab2dc..000000000 --- a/data/hfopenllm_v2/Changgil/K2S3-14b-v0.2/460ca160-ac34-4091-ba2d-986b53532b55.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Changgil_K2S3-14b-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "K2S3-14b-v0.2", - "id": "Changgil/K2S3-14b-v0.2", - "developer": "Changgil", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 14.352 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3243 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4613 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3923 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Changgil/K2S3-v0.1/ef9d2fab-07a2-44e2-aae2-ede5a2ff31d9.json b/data/hfopenllm_v2/Changgil/K2S3-v0.1/ef9d2fab-07a2-44e2-aae2-ede5a2ff31d9.json deleted file mode 100644 index efefe2dc0..000000000 --- a/data/hfopenllm_v2/Changgil/K2S3-v0.1/ef9d2fab-07a2-44e2-aae2-ede5a2ff31d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Changgil_K2S3-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "K2S3-v0.1", - "id": "Changgil/K2S3-v0.1", - "developer": "Changgil", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 14.352 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4014 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ClaudioItaly/Albacus/a29a69d3-d64e-4463-aa52-0a9d6d012c98.json b/data/hfopenllm_v2/ClaudioItaly/Albacus/a29a69d3-d64e-4463-aa52-0a9d6d012c98.json deleted file mode 100644 index 8b6f83423..000000000 --- a/data/hfopenllm_v2/ClaudioItaly/Albacus/a29a69d3-d64e-4463-aa52-0a9d6d012c98.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ClaudioItaly_Albacus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Albacus", - "id": "ClaudioItaly/Albacus", - "developer": "ClaudioItaly", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.987 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5113 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3165 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ClaudioItaly/Book-Gut12B/4539c16e-1ac6-47f4-88eb-a09842497330.json b/data/hfopenllm_v2/ClaudioItaly/Book-Gut12B/4539c16e-1ac6-47f4-88eb-a09842497330.json deleted file mode 100644 index c0c13e7aa..000000000 --- a/data/hfopenllm_v2/ClaudioItaly/Book-Gut12B/4539c16e-1ac6-47f4-88eb-a09842497330.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ClaudioItaly_Book-Gut12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Book-Gut12B", - "id": "ClaudioItaly/Book-Gut12B", - "developer": "ClaudioItaly", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5417 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4635 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ClaudioItaly/Evolutionstory-7B-v2.2/2ff33c55-1236-4c57-8809-2d3076e43cc7.json b/data/hfopenllm_v2/ClaudioItaly/Evolutionstory-7B-v2.2/2ff33c55-1236-4c57-8809-2d3076e43cc7.json deleted file mode 100644 index da6fa6915..000000000 --- a/data/hfopenllm_v2/ClaudioItaly/Evolutionstory-7B-v2.2/2ff33c55-1236-4c57-8809-2d3076e43cc7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ClaudioItaly_Evolutionstory-7B-v2.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Evolutionstory-7B-v2.2", - "id": "ClaudioItaly/Evolutionstory-7B-v2.2", - "developer": "ClaudioItaly", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5108 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } 
- }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3159 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ClaudioItaly/intelligence-cod-rag-7b-v3/281ba822-49a2-4746-bc04-8de046439508.json b/data/hfopenllm_v2/ClaudioItaly/intelligence-cod-rag-7b-v3/281ba822-49a2-4746-bc04-8de046439508.json deleted file mode 100644 index 5635777d0..000000000 --- a/data/hfopenllm_v2/ClaudioItaly/intelligence-cod-rag-7b-v3/281ba822-49a2-4746-bc04-8de046439508.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ClaudioItaly_intelligence-cod-rag-7b-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "intelligence-cod-rag-7b-v3", - "id": "ClaudioItaly/intelligence-cod-rag-7b-v3", - "developer": "ClaudioItaly", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6898 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5366 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4153 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4195 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/aya-23-35B/0606d916-95ea-4318-af0c-3942329071c6.json b/data/hfopenllm_v2/CohereForAI/aya-23-35B/0606d916-95ea-4318-af0c-3942329071c6.json deleted file mode 100644 index 3eb7bd011..000000000 --- a/data/hfopenllm_v2/CohereForAI/aya-23-35B/0606d916-95ea-4318-af0c-3942329071c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CohereForAI_aya-23-35B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "aya-23-35B", - "id": "CohereForAI/aya-23-35B", - "developer": "CohereForAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "CohereForCausalLM", - "params_billions": 34.981 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/aya-23-8B/005159f0-da68-480d-972c-c160d145a682.json b/data/hfopenllm_v2/CohereForAI/aya-23-8B/005159f0-da68-480d-972c-c160d145a682.json deleted file mode 100644 index ff20417ff..000000000 --- a/data/hfopenllm_v2/CohereForAI/aya-23-8B/005159f0-da68-480d-972c-c160d145a682.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CohereForAI_aya-23-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "aya-23-8B", - "id": "CohereForAI/aya-23-8B", - "developer": "CohereForAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "CohereForCausalLM", - "params_billions": 8.028 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4699 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4296 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3941 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2278 - } - } - ] 
-} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/aya-expanse-32b/2f6abb5d-52b3-44b0-b960-115793485fb1.json b/data/hfopenllm_v2/CohereForAI/aya-expanse-32b/2f6abb5d-52b3-44b0-b960-115793485fb1.json deleted file mode 100644 index 398bb4dca..000000000 --- a/data/hfopenllm_v2/CohereForAI/aya-expanse-32b/2f6abb5d-52b3-44b0-b960-115793485fb1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CohereForAI_aya-expanse-32b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "aya-expanse-32b", - "id": "CohereForAI/aya-expanse-32b", - "developer": "CohereForAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "CohereForCausalLM", - "params_billions": 32.296 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5649 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/aya-expanse-8b/6ffacad9-1a4d-472e-bbbf-0d64d068dd0d.json b/data/hfopenllm_v2/CohereForAI/aya-expanse-8b/6ffacad9-1a4d-472e-bbbf-0d64d068dd0d.json deleted file mode 100644 index 
724fe03bd..000000000 --- a/data/hfopenllm_v2/CohereForAI/aya-expanse-8b/6ffacad9-1a4d-472e-bbbf-0d64d068dd0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CohereForAI_aya-expanse-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "aya-expanse-8b", - "id": "CohereForAI/aya-expanse-8b", - "developer": "CohereForAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "CohereForCausalLM", - "params_billions": 8.028 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6359 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4977 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3729 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus-08-2024/26eadaf8-bfb8-4aad-a8a4-90699b6f0fcd.json b/data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus-08-2024/26eadaf8-bfb8-4aad-a8a4-90699b6f0fcd.json deleted file mode 100644 index 294a861d5..000000000 --- a/data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus-08-2024/26eadaf8-bfb8-4aad-a8a4-90699b6f0fcd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/CohereForAI_c4ai-command-r-plus-08-2024/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "c4ai-command-r-plus-08-2024", - "id": "CohereForAI/c4ai-command-r-plus-08-2024", - "developer": "CohereForAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "CohereForCausalLM", - "params_billions": 103.811 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5996 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4829 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus/d4536913-5708-45e4-a024-45ae37fdae13.json b/data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus/d4536913-5708-45e4-a024-45ae37fdae13.json deleted file mode 100644 index e568ee283..000000000 --- a/data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus/d4536913-5708-45e4-a024-45ae37fdae13.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CohereForAI_c4ai-command-r-plus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "c4ai-command-r-plus", - "id": "CohereForAI/c4ai-command-r-plus", - "developer": "CohereForAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "CohereForCausalLM", - "params_billions": 103.811 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7664 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5815 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3992 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/c4ai-command-r-v01/848860aa-7de3-4fae-afca-ac11224b96c5.json b/data/hfopenllm_v2/CohereForAI/c4ai-command-r-v01/848860aa-7de3-4fae-afca-ac11224b96c5.json deleted file mode 100644 index 5b2083324..000000000 --- a/data/hfopenllm_v2/CohereForAI/c4ai-command-r-v01/848860aa-7de3-4fae-afca-ac11224b96c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CohereForAI_c4ai-command-r-v01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "c4ai-command-r-v01", - "id": "CohereForAI/c4ai-command-r-v01", - "developer": "CohereForAI", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "float16", - "architecture": "CohereForCausalLM", - "params_billions": 34.981 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6748 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/c4ai-command-r7b-12-2024/0241a8e3-d6e5-4ba5-afb9-862bde2ba851.json b/data/hfopenllm_v2/CohereForAI/c4ai-command-r7b-12-2024/0241a8e3-d6e5-4ba5-afb9-862bde2ba851.json deleted file mode 100644 index 0cc392d32..000000000 --- a/data/hfopenllm_v2/CohereForAI/c4ai-command-r7b-12-2024/0241a8e3-d6e5-4ba5-afb9-862bde2ba851.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CohereForAI_c4ai-command-r7b-12-2024/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "c4ai-command-r7b-12-2024", - "id": "CohereForAI/c4ai-command-r7b-12-2024", - "developer": "CohereForAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Cohere2ForCausalLM", - "params_billions": 8.028 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7713 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5503 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2991 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/20b69120-d476-4e34-b3c6-8cef11d6ee78.json b/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/20b69120-d476-4e34-b3c6-8cef11d6ee78.json deleted file mode 100644 index 047b3a3d1..000000000 --- a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/20b69120-d476-4e34-b3c6-8cef11d6ee78.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Columbia-NLP_LION-Gemma-2b-dpo-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LION-Gemma-2b-dpo-v1.0", - "id": "Columbia-NLP/LION-Gemma-2b-dpo-v1.0", - "developer": "Columbia-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3102 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1665 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/696bbbfc-49dd-444e-a90b-76821845a726.json b/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/696bbbfc-49dd-444e-a90b-76821845a726.json deleted file mode 100644 index 9deccf82b..000000000 --- a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/696bbbfc-49dd-444e-a90b-76821845a726.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Columbia-NLP_LION-Gemma-2b-dpo-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LION-Gemma-2b-dpo-v1.0", - "id": "Columbia-NLP/LION-Gemma-2b-dpo-v1.0", - "developer": "Columbia-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3278 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1666 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-odpo-v1.0/e6d974d3-467e-4fe7-bd84-79fc7c72cde2.json b/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-odpo-v1.0/e6d974d3-467e-4fe7-bd84-79fc7c72cde2.json deleted file mode 100644 index 7b19f887e..000000000 --- a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-odpo-v1.0/e6d974d3-467e-4fe7-bd84-79fc7c72cde2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Columbia-NLP_LION-Gemma-2b-odpo-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LION-Gemma-2b-odpo-v1.0", - "id": "Columbia-NLP/LION-Gemma-2b-odpo-v1.0", - "developer": "Columbia-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1692 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-sft-v1.0/b26ba2b7-1365-4b1c-a1be-35d588e02d36.json b/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-sft-v1.0/b26ba2b7-1365-4b1c-a1be-35d588e02d36.json deleted file mode 100644 index 2d900597c..000000000 --- a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-sft-v1.0/b26ba2b7-1365-4b1c-a1be-35d588e02d36.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Columbia-NLP_LION-Gemma-2b-sft-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LION-Gemma-2b-sft-v1.0", - "id": "Columbia-NLP/LION-Gemma-2b-sft-v1.0", - "developer": "Columbia-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3692 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-dpo-v1.0/64bd755d-ba4b-4559-ad8e-f56c697b1ae6.json b/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-dpo-v1.0/64bd755d-ba4b-4559-ad8e-f56c697b1ae6.json deleted file mode 100644 index f22d9e63b..000000000 --- a/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-dpo-v1.0/64bd755d-ba4b-4559-ad8e-f56c697b1ae6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Columbia-NLP_LION-LLaMA-3-8b-dpo-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LION-LLaMA-3-8b-dpo-v1.0", - "id": "Columbia-NLP/LION-LLaMA-3-8b-dpo-v1.0", - "developer": "Columbia-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4957 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4097 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3219 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-odpo-v1.0/c4e572cb-1d12-4baf-a4d8-a55422692207.json b/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-odpo-v1.0/c4e572cb-1d12-4baf-a4d8-a55422692207.json deleted file mode 100644 index 6e2795510..000000000 --- a/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-odpo-v1.0/c4e572cb-1d12-4baf-a4d8-a55422692207.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Columbia-NLP_LION-LLaMA-3-8b-odpo-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LION-LLaMA-3-8b-odpo-v1.0", - "id": "Columbia-NLP/LION-LLaMA-3-8b-odpo-v1.0", - "developer": "Columbia-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5024 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4057 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-sft-v1.0/c6123e10-b1f9-49dc-888b-083881e6ef09.json b/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-sft-v1.0/c6123e10-b1f9-49dc-888b-083881e6ef09.json deleted file mode 100644 index 1377d56ef..000000000 --- a/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-sft-v1.0/c6123e10-b1f9-49dc-888b-083881e6ef09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Columbia-NLP_LION-LLaMA-3-8b-sft-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LION-LLaMA-3-8b-sft-v1.0", - "id": "Columbia-NLP/LION-LLaMA-3-8b-sft-v1.0", - "developer": "Columbia-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5088 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4503 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3237 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CombinHorizon/Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES/e1647f10-fec5-463d-b8e5-6b2b880bd687.json b/data/hfopenllm_v2/CombinHorizon/Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES/e1647f10-fec5-463d-b8e5-6b2b880bd687.json deleted file mode 100644 index 739ac6a43..000000000 --- a/data/hfopenllm_v2/CombinHorizon/Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES/e1647f10-fec5-463d-b8e5-6b2b880bd687.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CombinHorizon_Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES", - "id": "CombinHorizon/Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES", - "developer": "CombinHorizon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CombinHorizon/Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES/6d5fa235-8d69-456e-9f23-0f702760baf4.json b/data/hfopenllm_v2/CombinHorizon/Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES/6d5fa235-8d69-456e-9f23-0f702760baf4.json deleted file mode 100644 index e1668a135..000000000 --- a/data/hfopenllm_v2/CombinHorizon/Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES/6d5fa235-8d69-456e-9f23-0f702760baf4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CombinHorizon_Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES", - "id": "CombinHorizon/Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES", - "developer": "CombinHorizon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5402 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4932 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4342 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CombinHorizon/YiSM-blossom5.1-34B-SLERP/e8709a6a-a2b8-4b09-9342-d1aeae89de1f.json b/data/hfopenllm_v2/CombinHorizon/YiSM-blossom5.1-34B-SLERP/e8709a6a-a2b8-4b09-9342-d1aeae89de1f.json deleted file mode 100644 index 0da86caa5..000000000 --- a/data/hfopenllm_v2/CombinHorizon/YiSM-blossom5.1-34B-SLERP/e8709a6a-a2b8-4b09-9342-d1aeae89de1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CombinHorizon_YiSM-blossom5.1-34B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "YiSM-blossom5.1-34B-SLERP", - "id": "CombinHorizon/YiSM-blossom5.1-34B-SLERP", - "developer": "CombinHorizon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6208 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4741 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CombinHorizon/huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES/603e95c9-7e7f-4892-93f7-92f92b256865.json b/data/hfopenllm_v2/CombinHorizon/huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES/603e95c9-7e7f-4892-93f7-92f92b256865.json deleted file mode 100644 index f4ca1a794..000000000 --- a/data/hfopenllm_v2/CombinHorizon/huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES/603e95c9-7e7f-4892-93f7-92f92b256865.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CombinHorizon_huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES", - "id": "CombinHorizon/huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES", - "developer": "CombinHorizon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8206 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6929 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5721 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CombinHorizon/huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES/3e2fd38a-186e-49aa-915c-7eb3cde50562.json b/data/hfopenllm_v2/CombinHorizon/huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES/3e2fd38a-186e-49aa-915c-7eb3cde50562.json deleted file mode 100644 index 859925816..000000000 --- a/data/hfopenllm_v2/CombinHorizon/huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES/3e2fd38a-186e-49aa-915c-7eb3cde50562.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CombinHorizon_huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES", - "id": "CombinHorizon/huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES", - "developer": "CombinHorizon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8176 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6336 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.491 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CombinHorizon/zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES/16d55e66-9015-4d72-81e4-3f14c42b0368.json b/data/hfopenllm_v2/CombinHorizon/zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES/16d55e66-9015-4d72-81e4-3f14c42b0368.json deleted file mode 100644 index f7f8dee1c..000000000 --- a/data/hfopenllm_v2/CombinHorizon/zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES/16d55e66-9015-4d72-81e4-3f14c42b0368.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CombinHorizon_zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES", - "id": "CombinHorizon/zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES", - "developer": "CombinHorizon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5685 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ContactDoctor/Bio-Medical-3B-CoT-012025/696644b9-bd40-4047-bb85-0cb19510a96c.json b/data/hfopenllm_v2/ContactDoctor/Bio-Medical-3B-CoT-012025/696644b9-bd40-4047-bb85-0cb19510a96c.json deleted file mode 100644 index ea7e83334..000000000 --- a/data/hfopenllm_v2/ContactDoctor/Bio-Medical-3B-CoT-012025/696644b9-bd40-4047-bb85-0cb19510a96c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ContactDoctor_Bio-Medical-3B-CoT-012025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bio-Medical-3B-CoT-012025", - "id": "ContactDoctor/Bio-Medical-3B-CoT-012025", - "developer": "ContactDoctor", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2934 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ContactDoctor/Bio-Medical-Llama-3-8B/cbae8c39-0aec-4859-98bc-3b2d065833ad.json b/data/hfopenllm_v2/ContactDoctor/Bio-Medical-Llama-3-8B/cbae8c39-0aec-4859-98bc-3b2d065833ad.json deleted file mode 100644 index 0ff3b9020..000000000 --- a/data/hfopenllm_v2/ContactDoctor/Bio-Medical-Llama-3-8B/cbae8c39-0aec-4859-98bc-3b2d065833ad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ContactDoctor_Bio-Medical-Llama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bio-Medical-Llama-3-8B", - "id": "ContactDoctor/Bio-Medical-Llama-3-8B", - "developer": "ContactDoctor", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4422 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4863 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge2/15fb3cc7-1ba5-4ba5-ba02-8e8a9d2029d0.json b/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge2/15fb3cc7-1ba5-4ba5-ba02-8e8a9d2029d0.json deleted file mode 100644 index 23736286b..000000000 --- a/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge2/15fb3cc7-1ba5-4ba5-ba02-8e8a9d2029d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CoolSpring_Qwen2-0.5B-Abyme-merge2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-0.5B-Abyme-merge2", - "id": "CoolSpring/Qwen2-0.5B-Abyme-merge2", - "developer": "CoolSpring", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2994 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1489 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge3/357f6051-b880-48bb-8e68-e4b0a7a0cbcc.json b/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge3/357f6051-b880-48bb-8e68-e4b0a7a0cbcc.json deleted file mode 100644 index a155e6c18..000000000 --- a/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge3/357f6051-b880-48bb-8e68-e4b0a7a0cbcc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CoolSpring_Qwen2-0.5B-Abyme-merge3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-0.5B-Abyme-merge3", - "id": "CoolSpring/Qwen2-0.5B-Abyme-merge3", - "developer": "CoolSpring", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2386 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.15 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme/a50a542b-668e-47b1-a37e-805a58eea3d1.json b/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme/a50a542b-668e-47b1-a37e-805a58eea3d1.json deleted file mode 100644 index 
43ce6bc2b..000000000 --- a/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme/a50a542b-668e-47b1-a37e-805a58eea3d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CoolSpring_Qwen2-0.5B-Abyme/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-0.5B-Abyme", - "id": "CoolSpring/Qwen2-0.5B-Abyme", - "developer": "CoolSpring", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1915 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2862 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1333 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Corianas/Neural-Mistral-7B/00f7bd51-0b31-446d-be8c-1e0dc0d82e54.json b/data/hfopenllm_v2/Corianas/Neural-Mistral-7B/00f7bd51-0b31-446d-be8c-1e0dc0d82e54.json deleted file mode 100644 index 002cf8d4c..000000000 --- a/data/hfopenllm_v2/Corianas/Neural-Mistral-7B/00f7bd51-0b31-446d-be8c-1e0dc0d82e54.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Corianas_Neural-Mistral-7B/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neural-Mistral-7B", - "id": "Corianas/Neural-Mistral-7B", - "developer": "Corianas", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5489 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4428 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Corianas/Quokka_2.7b/26782941-b918-44c5-a7f6-5f770e47c3d6.json b/data/hfopenllm_v2/Corianas/Quokka_2.7b/26782941-b918-44c5-a7f6-5f770e47c3d6.json deleted file mode 100644 index 04534ef20..000000000 --- a/data/hfopenllm_v2/Corianas/Quokka_2.7b/26782941-b918-44c5-a7f6-5f770e47c3d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Corianas_Quokka_2.7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Quokka_2.7b", - "id": 
"Corianas/Quokka_2.7b", - "developer": "Corianas", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPT2LMHeadModel", - "params_billions": 2.786 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1749 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3055 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3908 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Corianas/llama-3-reactor/5547ddaf-8fbb-4259-8b88-e946fc3d2404.json b/data/hfopenllm_v2/Corianas/llama-3-reactor/5547ddaf-8fbb-4259-8b88-e946fc3d2404.json deleted file mode 100644 index 0f76b0c3e..000000000 --- a/data/hfopenllm_v2/Corianas/llama-3-reactor/5547ddaf-8fbb-4259-8b88-e946fc3d2404.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Corianas_llama-3-reactor/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-reactor", - "id": "Corianas/llama-3-reactor", - "developer": "Corianas", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": -1.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.23 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4457 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2801 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CortexLM/btlm-7b-base-v0.2/bee5ea59-b97a-4783-b763-b6bd432d4558.json b/data/hfopenllm_v2/CortexLM/btlm-7b-base-v0.2/bee5ea59-b97a-4783-b763-b6bd432d4558.json deleted file mode 100644 index 9f5b604cb..000000000 --- a/data/hfopenllm_v2/CortexLM/btlm-7b-base-v0.2/bee5ea59-b97a-4783-b763-b6bd432d4558.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CortexLM_btlm-7b-base-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "btlm-7b-base-v0.2", - "id": "CortexLM/btlm-7b-base-v0.2", - "developer": "CortexLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.885 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1483 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4006 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Cran-May/SCE-2-24B/8150333f-8e79-4230-af8b-7ddb1d5eeb21.json b/data/hfopenllm_v2/Cran-May/SCE-2-24B/8150333f-8e79-4230-af8b-7ddb1d5eeb21.json deleted file mode 100644 index d8282b048..000000000 --- a/data/hfopenllm_v2/Cran-May/SCE-2-24B/8150333f-8e79-4230-af8b-7ddb1d5eeb21.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Cran-May_SCE-2-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SCE-2-24B", - "id": "Cran-May/SCE-2-24B", - "developer": "Cran-May", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5866 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1896 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4612 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Cran-May/SCE-3-24B/be8510a9-ecd4-4ac7-9930-3200cacb7b50.json b/data/hfopenllm_v2/Cran-May/SCE-3-24B/be8510a9-ecd4-4ac7-9930-3200cacb7b50.json deleted file mode 100644 index 17424927b..000000000 --- a/data/hfopenllm_v2/Cran-May/SCE-3-24B/be8510a9-ecd4-4ac7-9930-3200cacb7b50.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Cran-May_SCE-3-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SCE-3-24B", - "id": "Cran-May/SCE-3-24B", - "developer": "Cran-May", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5973 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1881 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Cran-May/T.E-8.1/887e4574-f876-4e75-afb8-e543bcb30020.json b/data/hfopenllm_v2/Cran-May/T.E-8.1/887e4574-f876-4e75-afb8-e543bcb30020.json deleted file mode 100644 index 18db9bb98..000000000 --- a/data/hfopenllm_v2/Cran-May/T.E-8.1/887e4574-f876-4e75-afb8-e543bcb30020.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Cran-May_T.E-8.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T.E-8.1", - "id": "Cran-May/T.E-8.1", - "developer": "Cran-May", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5582 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Cran-May/merge_model_20250308_2/fd21d8bd-28cf-4b91-8075-c38a61f5f32a.json b/data/hfopenllm_v2/Cran-May/merge_model_20250308_2/fd21d8bd-28cf-4b91-8075-c38a61f5f32a.json deleted file mode 100644 index e21b806dc..000000000 --- a/data/hfopenllm_v2/Cran-May/merge_model_20250308_2/fd21d8bd-28cf-4b91-8075-c38a61f5f32a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Cran-May_merge_model_20250308_2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merge_model_20250308_2", - "id": "Cran-May/merge_model_20250308_2", - "developer": "Cran-May", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5932 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6585 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Cran-May/merge_model_20250308_3/c0f05e38-6592-478a-9c46-26567f24ff85.json b/data/hfopenllm_v2/Cran-May/merge_model_20250308_3/c0f05e38-6592-478a-9c46-26567f24ff85.json deleted file mode 100644 index 2aca4048b..000000000 --- a/data/hfopenllm_v2/Cran-May/merge_model_20250308_3/c0f05e38-6592-478a-9c46-26567f24ff85.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Cran-May_merge_model_20250308_3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merge_model_20250308_3", - "id": "Cran-May/merge_model_20250308_3", - "developer": "Cran-May", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6018 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2545 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Cran-May/merge_model_20250308_4/06cc2913-8e05-44bf-a128-9a7c4aeff536.json b/data/hfopenllm_v2/Cran-May/merge_model_20250308_4/06cc2913-8e05-44bf-a128-9a7c4aeff536.json deleted file mode 100644 index 9d92eeeaa..000000000 --- a/data/hfopenllm_v2/Cran-May/merge_model_20250308_4/06cc2913-8e05-44bf-a128-9a7c4aeff536.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Cran-May_merge_model_20250308_4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merge_model_20250308_4", - "id": "Cran-May/merge_model_20250308_4", - "developer": "Cran-May", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6664 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Cran-May/tempmotacilla-cinerea-0308/86368d5b-0509-4b52-b988-58bcf7e1043e.json b/data/hfopenllm_v2/Cran-May/tempmotacilla-cinerea-0308/86368d5b-0509-4b52-b988-58bcf7e1043e.json deleted file mode 100644 index 4adc26fd6..000000000 --- a/data/hfopenllm_v2/Cran-May/tempmotacilla-cinerea-0308/86368d5b-0509-4b52-b988-58bcf7e1043e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Cran-May_tempmotacilla-cinerea-0308/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempmotacilla-cinerea-0308", - "id": "Cran-May/tempmotacilla-cinerea-0308", - "developer": "Cran-May", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8085 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6551 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CreitinGameplays/Llama-3.1-8B-R1-v0.1/77b89fe6-464b-4017-a77f-8750e2668a82.json b/data/hfopenllm_v2/CreitinGameplays/Llama-3.1-8B-R1-v0.1/77b89fe6-464b-4017-a77f-8750e2668a82.json deleted file mode 100644 index 213246109..000000000 --- a/data/hfopenllm_v2/CreitinGameplays/Llama-3.1-8B-R1-v0.1/77b89fe6-464b-4017-a77f-8750e2668a82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CreitinGameplays_Llama-3.1-8B-R1-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-R1-v0.1", - "id": "CreitinGameplays/Llama-3.1-8B-R1-v0.1", - "developer": "CreitinGameplays", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3622 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1252 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Broca/d2e47d86-23dd-4c95-a7fb-99518615d09f.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Broca/d2e47d86-23dd-4c95-a7fb-99518615d09f.json deleted file mode 100644 index b78a75058..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Broca/d2e47d86-23dd-4c95-a7fb-99518615d09f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Broca/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Broca", - "id": "CultriX/Qwen2.5-14B-Broca", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6527 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-BrocaV9/0a09891e-ac97-4c3a-8364-7106a851f1a8.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-BrocaV9/0a09891e-ac97-4c3a-8364-7106a851f1a8.json deleted file mode 100644 index 31c50cf81..000000000 --- 
a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-BrocaV9/0a09891e-ac97-4c3a-8364-7106a851f1a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-BrocaV9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-BrocaV9", - "id": "CultriX/Qwen2.5-14B-BrocaV9", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6391 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.469 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav3/eb41fe62-ac46-4630-bb2d-6b907f271737.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav3/eb41fe62-ac46-4630-bb2d-6b907f271737.json deleted file mode 100644 index e98b2f9be..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav3/eb41fe62-ac46-4630-bb2d-6b907f271737.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Brocav3/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Brocav3", - "id": "CultriX/Qwen2.5-14B-Brocav3", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6952 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4756 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav6/d540a6c8-e9ec-4413-b9d2-dee68533c377.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav6/d540a6c8-e9ec-4413-b9d2-dee68533c377.json deleted file mode 100644 index 9a923a074..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav6/d540a6c8-e9ec-4413-b9d2-dee68533c377.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Brocav6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Brocav6", - 
"id": "CultriX/Qwen2.5-14B-Brocav6", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6995 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6389 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4742 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5319 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav7/5b1f413a-05c4-43be-bdbc-9de5728e8d0a.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav7/5b1f413a-05c4-43be-bdbc-9de5728e8d0a.json deleted file mode 100644 index 3819754c9..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav7/5b1f413a-05c4-43be-bdbc-9de5728e8d0a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Brocav7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Brocav7", - "id": "CultriX/Qwen2.5-14B-Brocav7", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6724 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6444 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4796 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5258 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emerged/6701738c-27e4-4bbd-b614-fbc297c3164f.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emerged/6701738c-27e4-4bbd-b614-fbc297c3164f.json deleted file mode 100644 index 84b4c7784..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emerged/6701738c-27e4-4bbd-b614-fbc297c3164f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Emerged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Emerged", - "id": "CultriX/Qwen2.5-14B-Emerged", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4691 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5186 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emergedv3/7f4563b4-0b25-49e7-ac1c-afaa28b0eda2.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emergedv3/7f4563b4-0b25-49e7-ac1c-afaa28b0eda2.json deleted file mode 100644 index 0ba7935ad..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emergedv3/7f4563b4-0b25-49e7-ac1c-afaa28b0eda2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Emergedv3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Emergedv3", - "id": "CultriX/Qwen2.5-14B-Emergedv3", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6191 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5174 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-FinalMerge/32b6e4af-69ba-49b7-9367-dfafe3e390e8.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-FinalMerge/32b6e4af-69ba-49b7-9367-dfafe3e390e8.json deleted file mode 100644 index 481bd01ec..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-FinalMerge/32b6e4af-69ba-49b7-9367-dfafe3e390e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-FinalMerge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-FinalMerge", - "id": "CultriX/Qwen2.5-14B-FinalMerge", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4891 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5715 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyper/e16deaf7-da55-40ba-ac18-860fa3f14d34.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyper/e16deaf7-da55-40ba-ac18-860fa3f14d34.json deleted file mode 100644 index e702043d7..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyper/e16deaf7-da55-40ba-ac18-860fa3f14d34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Hyper/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Hyper", - "id": "CultriX/Qwen2.5-14B-Hyper", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3437 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4898 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-HyperMarck-dl/8a7a5886-0618-4615-9cdf-46f5d19a29fe.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-HyperMarck-dl/8a7a5886-0618-4615-9cdf-46f5d19a29fe.json deleted file mode 100644 index 33af3ceb9..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-HyperMarck-dl/8a7a5886-0618-4615-9cdf-46f5d19a29fe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-HyperMarck-dl/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-HyperMarck-dl", - "id": "CultriX/Qwen2.5-14B-HyperMarck-dl", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.665 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5091 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv3/66d18e5b-9ebc-4ab6-94fb-6d5c23c58672.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv3/66d18e5b-9ebc-4ab6-94fb-6d5c23c58672.json deleted file mode 100644 index ad906fa69..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv3/66d18e5b-9ebc-4ab6-94fb-6d5c23c58672.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Hyperionv3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Hyperionv3", - "id": "CultriX/Qwen2.5-14B-Hyperionv3", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6836 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6522 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.3708 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv4/a36aaaf6-2478-4b98-ad0c-2b06ddb8c308.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv4/a36aaaf6-2478-4b98-ad0c-2b06ddb8c308.json deleted file mode 100644 index 2baa36524..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv4/a36aaaf6-2478-4b98-ad0c-2b06ddb8c308.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Hyperionv4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Hyperionv4", - "id": "CultriX/Qwen2.5-14B-Hyperionv4", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6472 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3474 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4832 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv5/4a6237a7-019c-4310-971e-84b08d1b5067.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv5/4a6237a7-019c-4310-971e-84b08d1b5067.json deleted file mode 100644 index bdfbba563..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv5/4a6237a7-019c-4310-971e-84b08d1b5067.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Hyperionv5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Hyperionv5", - "id": "CultriX/Qwen2.5-14B-Hyperionv5", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6443 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3822 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-MegaMerge-pt2/996e781e-5939-41ac-b347-95c99037c34a.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-MegaMerge-pt2/996e781e-5939-41ac-b347-95c99037c34a.json deleted file mode 100644 index 0815bb180..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-MegaMerge-pt2/996e781e-5939-41ac-b347-95c99037c34a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-MegaMerge-pt2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-MegaMerge-pt2", - "id": "CultriX/Qwen2.5-14B-MegaMerge-pt2", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5683 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6578 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4729 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-MergeStock/e880fa0e-ae49-4398-91bd-eadf8695425f.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-MergeStock/e880fa0e-ae49-4398-91bd-eadf8695425f.json deleted file mode 100644 index 64f818a07..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-MergeStock/e880fa0e-ae49-4398-91bd-eadf8695425f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-MergeStock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-MergeStock", - "id": "CultriX/Qwen2.5-14B-MergeStock", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6579 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4147 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4676 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-ReasoningMerge/da04ff51-fbeb-41a8-ae5e-8ddf5925b792.json 
b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-ReasoningMerge/da04ff51-fbeb-41a8-ae5e-8ddf5925b792.json deleted file mode 100644 index a2f6c4850..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-ReasoningMerge/da04ff51-fbeb-41a8-ae5e-8ddf5925b792.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-ReasoningMerge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-ReasoningMerge", - "id": "CultriX/Qwen2.5-14B-ReasoningMerge", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4605 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6578 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5166 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Ultimav2/6d709396-1ae1-4e5c-a03c-13c1e9425202.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Ultimav2/6d709396-1ae1-4e5c-a03c-13c1e9425202.json deleted file mode 100644 index 0693a1c39..000000000 --- 
a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Ultimav2/6d709396-1ae1-4e5c-a03c-13c1e9425202.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Ultimav2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Ultimav2", - "id": "CultriX/Qwen2.5-14B-Ultimav2", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6555 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4966 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5417 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Unity/5b616df9-e15a-4f84-98b4-c2cb532c1b95.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Unity/5b616df9-e15a-4f84-98b4-c2cb532c1b95.json deleted file mode 100644 index 5dd3c6ef1..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Unity/5b616df9-e15a-4f84-98b4-c2cb532c1b95.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Unity/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Unity", - "id": "CultriX/Qwen2.5-14B-Unity", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6739 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4679 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SFT/0f6552d9-3cbe-447e-909b-068e5ceed4c9.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SFT/0f6552d9-3cbe-447e-909b-068e5ceed4c9.json deleted file mode 100644 index 05fdd29f4..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SFT/0f6552d9-3cbe-447e-909b-068e5ceed4c9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Wernicke-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Qwen2.5-14B-Wernicke-SFT", - "id": "CultriX/Qwen2.5-14B-Wernicke-SFT", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4937 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6461 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3595 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SLERP/2861aae0-d2ec-48f5-bd20-9e7bcaf8dabd.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SLERP/2861aae0-d2ec-48f5-bd20-9e7bcaf8dabd.json deleted file mode 100644 index 19347fafa..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SLERP/2861aae0-d2ec-48f5-bd20-9e7bcaf8dabd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Wernicke-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Wernicke-SLERP", - "id": "CultriX/Qwen2.5-14B-Wernicke-SLERP", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"Qwen2ForCausalLM", - "params_billions": 14.491 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5589 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke/51a64f37-256c-4fe7-b28c-6117520f04ec.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke/51a64f37-256c-4fe7-b28c-6117520f04ec.json deleted file mode 100644 index 62b9abb48..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke/51a64f37-256c-4fe7-b28c-6117520f04ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Wernicke/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Wernicke", - "id": "CultriX/Qwen2.5-14B-Wernicke", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6568 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4689 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernickev3/03ce9c1d-38e8-4a6c-b293-57428a9d7c0e.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernickev3/03ce9c1d-38e8-4a6c-b293-57428a9d7c0e.json deleted file mode 100644 index ece044f89..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernickev3/03ce9c1d-38e8-4a6c-b293-57428a9d7c0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Wernickev3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Wernickev3", - "id": "CultriX/Qwen2.5-14B-Wernickev3", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7048 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6184 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4717 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5151 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-partialmergept1/3b0f5dea-db9b-4657-9807-6b3e56d38823.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-partialmergept1/3b0f5dea-db9b-4657-9807-6b3e56d38823.json deleted file mode 100644 index a8c01a5f1..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-partialmergept1/3b0f5dea-db9b-4657-9807-6b3e56d38823.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-partialmergept1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-partialmergept1", - "id": "CultriX/Qwen2.5-14B-partialmergept1", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6337 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6151 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5208 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwenfinity-2.5-14B/2d19e9ff-e331-4171-ae90-47e44f3f8885.json b/data/hfopenllm_v2/CultriX/Qwenfinity-2.5-14B/2d19e9ff-e331-4171-ae90-47e44f3f8885.json deleted file mode 100644 index 4b70d0648..000000000 --- a/data/hfopenllm_v2/CultriX/Qwenfinity-2.5-14B/2d19e9ff-e331-4171-ae90-47e44f3f8885.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwenfinity-2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenfinity-2.5-14B", - "id": "CultriX/Qwenfinity-2.5-14B", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4506 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4498 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwestion-14B/6bfb8b24-1abd-405b-b01d-7d7111705dbb.json b/data/hfopenllm_v2/CultriX/Qwestion-14B/6bfb8b24-1abd-405b-b01d-7d7111705dbb.json deleted file mode 100644 index 27242621d..000000000 --- a/data/hfopenllm_v2/CultriX/Qwestion-14B/6bfb8b24-1abd-405b-b01d-7d7111705dbb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwestion-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwestion-14B", - "id": "CultriX/Qwestion-14B", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6318 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4636 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMerge/c83e6b6c-c8be-4d97-9c65-2d883f88f37f.json b/data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMerge/c83e6b6c-c8be-4d97-9c65-2d883f88f37f.json deleted file mode 100644 index 2ff8194ea..000000000 --- a/data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMerge/c83e6b6c-c8be-4d97-9c65-2d883f88f37f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_SeQwence-14B-EvolMerge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeQwence-14B-EvolMerge", - "id": "CultriX/SeQwence-14B-EvolMerge", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6572 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4821 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMergev1/72569796-1b11-48cc-ada7-e8c09522dd54.json b/data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMergev1/72569796-1b11-48cc-ada7-e8c09522dd54.json deleted file mode 100644 index eb95ee2a0..000000000 --- a/data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMergev1/72569796-1b11-48cc-ada7-e8c09522dd54.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_SeQwence-14B-EvolMergev1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeQwence-14B-EvolMergev1", - "id": "CultriX/SeQwence-14B-EvolMergev1", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6546 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4215 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4623 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/SeQwence-14B-v5/58403e30-bd2b-4f4c-ad41-daa890c77d40.json b/data/hfopenllm_v2/CultriX/SeQwence-14B-v5/58403e30-bd2b-4f4c-ad41-daa890c77d40.json deleted file mode 100644 index 3cbe2fbd3..000000000 --- a/data/hfopenllm_v2/CultriX/SeQwence-14B-v5/58403e30-bd2b-4f4c-ad41-daa890c77d40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_SeQwence-14B-v5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeQwence-14B-v5", - "id": "CultriX/SeQwence-14B-v5", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.592 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6517 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3308 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/SeQwence-14B/eb8e1f1d-c6b3-407c-b172-d240553d2f89.json b/data/hfopenllm_v2/CultriX/SeQwence-14B/eb8e1f1d-c6b3-407c-b172-d240553d2f89.json deleted file mode 100644 index c20b9bd39..000000000 --- a/data/hfopenllm_v2/CultriX/SeQwence-14B/eb8e1f1d-c6b3-407c-b172-d240553d2f89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_SeQwence-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeQwence-14B", - "id": "CultriX/SeQwence-14B", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4666 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5419 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/SeQwence-14Bv1/356d75a0-6520-46c1-afa9-7dbb2596a5c1.json b/data/hfopenllm_v2/CultriX/SeQwence-14Bv1/356d75a0-6520-46c1-afa9-7dbb2596a5c1.json deleted file mode 100644 index 4bc3ce623..000000000 --- a/data/hfopenllm_v2/CultriX/SeQwence-14Bv1/356d75a0-6520-46c1-afa9-7dbb2596a5c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_SeQwence-14Bv1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeQwence-14Bv1", - "id": "CultriX/SeQwence-14Bv1", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6678 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6345 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4704 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/SeQwence-14Bv2/78681e0c-5fe2-4920-af7b-99345cea3efe.json b/data/hfopenllm_v2/CultriX/SeQwence-14Bv2/78681e0c-5fe2-4920-af7b-99345cea3efe.json deleted file mode 100644 
index a0e28a509..000000000 --- a/data/hfopenllm_v2/CultriX/SeQwence-14Bv2/78681e0c-5fe2-4920-af7b-99345cea3efe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_SeQwence-14Bv2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeQwence-14Bv2", - "id": "CultriX/SeQwence-14Bv2", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4758 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4601 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5334 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/SeQwence-14Bv3/ba0ee5b4-070a-461d-a3d2-cd4036387cc9.json b/data/hfopenllm_v2/CultriX/SeQwence-14Bv3/ba0ee5b4-070a-461d-a3d2-cd4036387cc9.json deleted file mode 100644 index ff855fb3e..000000000 --- a/data/hfopenllm_v2/CultriX/SeQwence-14Bv3/ba0ee5b4-070a-461d-a3d2-cd4036387cc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_SeQwence-14Bv3/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeQwence-14Bv3", - "id": "CultriX/SeQwence-14Bv3", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5719 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5335 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DRXD1000/Atlas-7B/17d0d377-bca4-411c-be11-6c5cfce07798.json b/data/hfopenllm_v2/DRXD1000/Atlas-7B/17d0d377-bca4-411c-be11-6c5cfce07798.json deleted file mode 100644 index 3ae174b80..000000000 --- a/data/hfopenllm_v2/DRXD1000/Atlas-7B/17d0d377-bca4-411c-be11-6c5cfce07798.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DRXD1000_Atlas-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Atlas-7B", - "id": "DRXD1000/Atlas-7B", - "developer": "DRXD1000", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.768 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DRXD1000/Phoenix-7B/d01a56a1-1eb9-4ccf-8c09-348b6ba5480b.json b/data/hfopenllm_v2/DRXD1000/Phoenix-7B/d01a56a1-1eb9-4ccf-8c09-348b6ba5480b.json deleted file mode 100644 index 9ea88c590..000000000 --- a/data/hfopenllm_v2/DRXD1000/Phoenix-7B/d01a56a1-1eb9-4ccf-8c09-348b6ba5480b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DRXD1000_Phoenix-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phoenix-7B", - "id": "DRXD1000/Phoenix-7B", - "developer": "DRXD1000", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3932 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3849 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DUAL-GPO/zephyr-7b-ipo-0k-15k-i1/389821ff-d8e2-4d1d-8fb2-57a689867ac5.json b/data/hfopenllm_v2/DUAL-GPO/zephyr-7b-ipo-0k-15k-i1/389821ff-d8e2-4d1d-8fb2-57a689867ac5.json deleted file mode 100644 index 1cd2505fb..000000000 --- a/data/hfopenllm_v2/DUAL-GPO/zephyr-7b-ipo-0k-15k-i1/389821ff-d8e2-4d1d-8fb2-57a689867ac5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DUAL-GPO_zephyr-7b-ipo-0k-15k-i1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-7b-ipo-0k-15k-i1", - "id": "DUAL-GPO/zephyr-7b-ipo-0k-15k-i1", - "developer": "DUAL-GPO", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 14.483 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2756 
- } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.313 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DZgas/GIGABATEMAN-7B/7913f782-29b0-48bd-bc62-37da9a5ac7d9.json b/data/hfopenllm_v2/DZgas/GIGABATEMAN-7B/7913f782-29b0-48bd-bc62-37da9a5ac7d9.json deleted file mode 100644 index 150aa45c9..000000000 --- a/data/hfopenllm_v2/DZgas/GIGABATEMAN-7B/7913f782-29b0-48bd-bc62-37da9a5ac7d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DZgas_GIGABATEMAN-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GIGABATEMAN-7B", - "id": "DZgas/GIGABATEMAN-7B", - "developer": "DZgas", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4607 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3177 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/AetherDrake-SFT/b0930974-999e-4372-9d21-b9790e0bad4c.json b/data/hfopenllm_v2/Daemontatox/AetherDrake-SFT/b0930974-999e-4372-9d21-b9790e0bad4c.json deleted file mode 100644 index e4aedebfc..000000000 --- a/data/hfopenllm_v2/Daemontatox/AetherDrake-SFT/b0930974-999e-4372-9d21-b9790e0bad4c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_AetherDrake-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AetherDrake-SFT", - "id": "Daemontatox/AetherDrake-SFT", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4872 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1511 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4088 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3499 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/AetherSett/8265f577-f504-4a56-9cf0-42c34766559a.json b/data/hfopenllm_v2/Daemontatox/AetherSett/8265f577-f504-4a56-9cf0-42c34766559a.json deleted file mode 100644 index 66d790f66..000000000 --- a/data/hfopenllm_v2/Daemontatox/AetherSett/8265f577-f504-4a56-9cf0-42c34766559a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_AetherSett/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AetherSett", - "id": "Daemontatox/AetherSett", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3973 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4603 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/AetherTOT/82044cd2-1a46-406e-bc68-397ce41b29ea.json b/data/hfopenllm_v2/Daemontatox/AetherTOT/82044cd2-1a46-406e-bc68-397ce41b29ea.json deleted file mode 100644 index 859888bd9..000000000 --- a/data/hfopenllm_v2/Daemontatox/AetherTOT/82044cd2-1a46-406e-bc68-397ce41b29ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_AetherTOT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AetherTOT", - "id": "Daemontatox/AetherTOT", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 10.67 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1443 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/AetherTOT/de09e323-8cf1-4aa9-9537-e8ad30a8c297.json b/data/hfopenllm_v2/Daemontatox/AetherTOT/de09e323-8cf1-4aa9-9537-e8ad30a8c297.json deleted file mode 100644 index e4d9830af..000000000 --- a/data/hfopenllm_v2/Daemontatox/AetherTOT/de09e323-8cf1-4aa9-9537-e8ad30a8c297.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_AetherTOT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AetherTOT", - "id": "Daemontatox/AetherTOT", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 10.67 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1488 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/AetherUncensored/bfe543b4-ec38-488e-ae04-125cd358b61f.json b/data/hfopenllm_v2/Daemontatox/AetherUncensored/bfe543b4-ec38-488e-ae04-125cd358b61f.json deleted file mode 100644 index a8cb84de3..000000000 --- a/data/hfopenllm_v2/Daemontatox/AetherUncensored/bfe543b4-ec38-488e-ae04-125cd358b61f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_AetherUncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AetherUncensored", - "id": "Daemontatox/AetherUncensored", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4042 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4463 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3747 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Cogito-MIS/be36d8ae-b81c-4b4e-aa2f-5999c7582237.json b/data/hfopenllm_v2/Daemontatox/Cogito-MIS/be36d8ae-b81c-4b4e-aa2f-5999c7582237.json deleted file mode 100644 index 8a0c7af29..000000000 --- a/data/hfopenllm_v2/Daemontatox/Cogito-MIS/be36d8ae-b81c-4b4e-aa2f-5999c7582237.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Cogito-MIS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cogito-MIS", - "id": "Daemontatox/Cogito-MIS", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1815 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/Daemontatox/CogitoDistil/342b435f-89e9-48ad-ab0f-2c1f52f4571a.json b/data/hfopenllm_v2/Daemontatox/CogitoDistil/342b435f-89e9-48ad-ab0f-2c1f52f4571a.json deleted file mode 100644 index c37151cbd..000000000 --- a/data/hfopenllm_v2/Daemontatox/CogitoDistil/342b435f-89e9-48ad-ab0f-2c1f52f4571a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_CogitoDistil/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CogitoDistil", - "id": "Daemontatox/CogitoDistil", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2776 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3677 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3755 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2625 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/CogitoZ/b0c8737d-d838-4da1-909b-b218e22119dc.json b/data/hfopenllm_v2/Daemontatox/CogitoZ/b0c8737d-d838-4da1-909b-b218e22119dc.json deleted file mode 100644 index f78c0c39c..000000000 --- 
a/data/hfopenllm_v2/Daemontatox/CogitoZ/b0c8737d-d838-4da1-909b-b218e22119dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_CogitoZ/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CogitoZ", - "id": "Daemontatox/CogitoZ", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3967 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6734 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5593 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/CogitoZ14/4cd40f28-842f-44d5-9eb2-86238077fc55.json b/data/hfopenllm_v2/Daemontatox/CogitoZ14/4cd40f28-842f-44d5-9eb2-86238077fc55.json deleted file mode 100644 index e16a58952..000000000 --- a/data/hfopenllm_v2/Daemontatox/CogitoZ14/4cd40f28-842f-44d5-9eb2-86238077fc55.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_CogitoZ14/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CogitoZ14", - "id": "Daemontatox/CogitoZ14", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6637 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4222 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4059 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3999 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/DocumentCogito/0758051c-2d75-402e-af0e-769096cbb17c.json b/data/hfopenllm_v2/Daemontatox/DocumentCogito/0758051c-2d75-402e-af0e-769096cbb17c.json deleted file mode 100644 index b59d83df4..000000000 --- a/data/hfopenllm_v2/Daemontatox/DocumentCogito/0758051c-2d75-402e-af0e-769096cbb17c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_DocumentCogito/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DocumentCogito", - "id": "Daemontatox/DocumentCogito", - "developer": "Daemontatox", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 10.67 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5187 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/DocumentCogito/c93f610b-fb97-4ad1-b8af-fc41c6d8da33.json b/data/hfopenllm_v2/Daemontatox/DocumentCogito/c93f610b-fb97-4ad1-b8af-fc41c6d8da33.json deleted file mode 100644 index 4744e7468..000000000 --- a/data/hfopenllm_v2/Daemontatox/DocumentCogito/c93f610b-fb97-4ad1-b8af-fc41c6d8da33.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_DocumentCogito/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DocumentCogito", - "id": "Daemontatox/DocumentCogito", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 10.67 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { 
- "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3973 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3802 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Llama3.3-70B-CogniLink/b8467118-d895-41fa-81c7-89892e1844d5.json b/data/hfopenllm_v2/Daemontatox/Llama3.3-70B-CogniLink/b8467118-d895-41fa-81c7-89892e1844d5.json deleted file mode 100644 index 7fa8b968c..000000000 --- a/data/hfopenllm_v2/Daemontatox/Llama3.3-70B-CogniLink/b8467118-d895-41fa-81c7-89892e1844d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Llama3.3-70B-CogniLink/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.3-70B-CogniLink", - "id": "Daemontatox/Llama3.3-70B-CogniLink", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6931 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6668 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4455 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4877 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Llama_cot/30d867bb-63c6-48d1-8d43-6c24f4cf44ba.json b/data/hfopenllm_v2/Daemontatox/Llama_cot/30d867bb-63c6-48d1-8d43-6c24f4cf44ba.json deleted file mode 100644 index e654aab90..000000000 --- a/data/hfopenllm_v2/Daemontatox/Llama_cot/30d867bb-63c6-48d1-8d43-6c24f4cf44ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Llama_cot/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_cot", - "id": "Daemontatox/Llama_cot", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 10.67 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4838 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/MawaredT1/89b92cda-c5b6-45ed-a534-361c9d34794a.json b/data/hfopenllm_v2/Daemontatox/MawaredT1/89b92cda-c5b6-45ed-a534-361c9d34794a.json deleted file mode 100644 index 9ac085f5f..000000000 --- a/data/hfopenllm_v2/Daemontatox/MawaredT1/89b92cda-c5b6-45ed-a534-361c9d34794a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_MawaredT1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MawaredT1", - "id": "Daemontatox/MawaredT1", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3021 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4702 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4718 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Mini_QwQ/48cdf76a-886d-41ec-8580-00ed4232b601.json b/data/hfopenllm_v2/Daemontatox/Mini_QwQ/48cdf76a-886d-41ec-8580-00ed4232b601.json deleted file mode 100644 index e1b1e912d..000000000 --- a/data/hfopenllm_v2/Daemontatox/Mini_QwQ/48cdf76a-886d-41ec-8580-00ed4232b601.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Mini_QwQ/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mini_QwQ", - "id": "Daemontatox/Mini_QwQ", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4497 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5549 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4192 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/NemoR/116272d4-d25d-49cb-80cb-ff26a0fb3cf4.json b/data/hfopenllm_v2/Daemontatox/NemoR/116272d4-d25d-49cb-80cb-ff26a0fb3cf4.json deleted file mode 100644 index 450d53bc4..000000000 --- a/data/hfopenllm_v2/Daemontatox/NemoR/116272d4-d25d-49cb-80cb-ff26a0fb3cf4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_NemoR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NemoR", - "id": "Daemontatox/NemoR", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2287 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3908 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/PathFinderAI2.0/bb103828-70fe-4767-9302-6750d839129e.json b/data/hfopenllm_v2/Daemontatox/PathFinderAI2.0/bb103828-70fe-4767-9302-6750d839129e.json deleted file mode 100644 index 32487f192..000000000 --- a/data/hfopenllm_v2/Daemontatox/PathFinderAI2.0/bb103828-70fe-4767-9302-6750d839129e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_PathFinderAI2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PathFinderAI2.0", - "id": "Daemontatox/PathFinderAI2.0", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4541 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6658 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4216 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5547 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/PathFinderAi3.0/7b58ab54-239b-4e49-93f1-c3940df61474.json b/data/hfopenllm_v2/Daemontatox/PathFinderAi3.0/7b58ab54-239b-4e49-93f1-c3940df61474.json deleted file mode 100644 index 1fd11dd1d..000000000 --- a/data/hfopenllm_v2/Daemontatox/PathFinderAi3.0/7b58ab54-239b-4e49-93f1-c3940df61474.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_PathFinderAi3.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PathFinderAi3.0", - "id": "Daemontatox/PathFinderAi3.0", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4271 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6884 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5757 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/PathfinderAI/559067a2-816c-4091-893e-b1c7860171ec.json b/data/hfopenllm_v2/Daemontatox/PathfinderAI/559067a2-816c-4091-893e-b1c7860171ec.json deleted file mode 100644 index a2410c22f..000000000 --- a/data/hfopenllm_v2/Daemontatox/PathfinderAI/559067a2-816c-4091-893e-b1c7860171ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_PathfinderAI/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PathfinderAI", - "id": "Daemontatox/PathfinderAI", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4855 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6627 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4841 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5542 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/PathfinderAI/ec502619-880b-4b7c-acfe-c43cf6514e3f.json b/data/hfopenllm_v2/Daemontatox/PathfinderAI/ec502619-880b-4b7c-acfe-c43cf6514e3f.json deleted file mode 100644 index 02616b562..000000000 --- a/data/hfopenllm_v2/Daemontatox/PathfinderAI/ec502619-880b-4b7c-acfe-c43cf6514e3f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_PathfinderAI/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PathfinderAI", - "id": "Daemontatox/PathfinderAI", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3745 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6668 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4758 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4858 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5593 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Phi-4-COT/6941a5dd-2a70-4846-a5f6-b16ef2d56a03.json b/data/hfopenllm_v2/Daemontatox/Phi-4-COT/6941a5dd-2a70-4846-a5f6-b16ef2d56a03.json deleted file mode 
100644 index d40331320..000000000 --- a/data/hfopenllm_v2/Daemontatox/Phi-4-COT/6941a5dd-2a70-4846-a5f6-b16ef2d56a03.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Phi-4-COT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-COT", - "id": "Daemontatox/Phi-4-COT", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1793 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6173 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.453 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5005 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/PixelParse_AI/636e2f93-3242-491c-9df5-003aa1dacecf.json b/data/hfopenllm_v2/Daemontatox/PixelParse_AI/636e2f93-3242-491c-9df5-003aa1dacecf.json deleted file mode 100644 index c6518eaf3..000000000 --- a/data/hfopenllm_v2/Daemontatox/PixelParse_AI/636e2f93-3242-491c-9df5-003aa1dacecf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_PixelParse_AI/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PixelParse_AI", - "id": "Daemontatox/PixelParse_AI", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 10.67 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1473 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/RA2.0/1f4efa23-816d-49be-8659-feb003f4b3ef.json b/data/hfopenllm_v2/Daemontatox/RA2.0/1f4efa23-816d-49be-8659-feb003f4b3ef.json deleted file mode 100644 index cdbe9004f..000000000 --- a/data/hfopenllm_v2/Daemontatox/RA2.0/1f4efa23-816d-49be-8659-feb003f4b3ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_RA2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RA2.0", - "id": 
"Daemontatox/RA2.0", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4889 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3837 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2616 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/RA_Reasoner/d05be1e4-bcac-4b4a-bbde-8b17a5a71243.json b/data/hfopenllm_v2/Daemontatox/RA_Reasoner/d05be1e4-bcac-4b4a-bbde-8b17a5a71243.json deleted file mode 100644 index 01700c90a..000000000 --- a/data/hfopenllm_v2/Daemontatox/RA_Reasoner/d05be1e4-bcac-4b4a-bbde-8b17a5a71243.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_RA_Reasoner/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RA_Reasoner", - "id": "Daemontatox/RA_Reasoner", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5592 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2122 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/RA_Reasoner2.0/9ab53055-86f5-4a88-976f-015dd9c9e832.json b/data/hfopenllm_v2/Daemontatox/RA_Reasoner2.0/9ab53055-86f5-4a88-976f-015dd9c9e832.json deleted file mode 100644 index beab11efc..000000000 --- a/data/hfopenllm_v2/Daemontatox/RA_Reasoner2.0/9ab53055-86f5-4a88-976f-015dd9c9e832.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_RA_Reasoner2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RA_Reasoner2.0", - "id": "Daemontatox/RA_Reasoner2.0", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6062 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2311 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/ReasonTest/ba34083a-9b13-46d9-8f36-aa3ddd586711.json b/data/hfopenllm_v2/Daemontatox/ReasonTest/ba34083a-9b13-46d9-8f36-aa3ddd586711.json deleted file mode 100644 index 60c753137..000000000 --- a/data/hfopenllm_v2/Daemontatox/ReasonTest/ba34083a-9b13-46d9-8f36-aa3ddd586711.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_ReasonTest/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasonTest", - "id": "Daemontatox/ReasonTest", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.808 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2137 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4272 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Research_PathfinderAI/6a39d734-ad73-4c4a-9583-3563e336d4b3.json b/data/hfopenllm_v2/Daemontatox/Research_PathfinderAI/6a39d734-ad73-4c4a-9583-3563e336d4b3.json deleted file mode 100644 index fbb52784d..000000000 --- a/data/hfopenllm_v2/Daemontatox/Research_PathfinderAI/6a39d734-ad73-4c4a-9583-3563e336d4b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Research_PathfinderAI/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Research_PathfinderAI", - "id": "Daemontatox/Research_PathfinderAI", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2872 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1699 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/SphinX/2af71e88-4931-4359-b92a-c64fa33df802.json b/data/hfopenllm_v2/Daemontatox/SphinX/2af71e88-4931-4359-b92a-c64fa33df802.json deleted file mode 100644 index 4195d2bc9..000000000 --- a/data/hfopenllm_v2/Daemontatox/SphinX/2af71e88-4931-4359-b92a-c64fa33df802.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_SphinX/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SphinX", - "id": "Daemontatox/SphinX", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3082 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4405 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Sphinx2.0/bf9336a7-a7c4-420a-9dd0-68d8e0c815c4.json b/data/hfopenllm_v2/Daemontatox/Sphinx2.0/bf9336a7-a7c4-420a-9dd0-68d8e0c815c4.json deleted file mode 100644 index ab374a609..000000000 --- a/data/hfopenllm_v2/Daemontatox/Sphinx2.0/bf9336a7-a7c4-420a-9dd0-68d8e0c815c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Sphinx2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sphinx2.0", - "id": "Daemontatox/Sphinx2.0", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7123 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6473 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/TinySphinx/2de872b2-10c7-44dd-91c3-f20205207da6.json b/data/hfopenllm_v2/Daemontatox/TinySphinx/2de872b2-10c7-44dd-91c3-f20205207da6.json deleted file mode 100644 index 8e4831568..000000000 --- a/data/hfopenllm_v2/Daemontatox/TinySphinx/2de872b2-10c7-44dd-91c3-f20205207da6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_TinySphinx/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinySphinx", - "id": "Daemontatox/TinySphinx", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1698 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/TinySphinx2.0/5cabed09-d8ea-46c2-bb78-012dac954d6b.json b/data/hfopenllm_v2/Daemontatox/TinySphinx2.0/5cabed09-d8ea-46c2-bb78-012dac954d6b.json deleted file mode 100644 index 7a83a9405..000000000 --- a/data/hfopenllm_v2/Daemontatox/TinySphinx2.0/5cabed09-d8ea-46c2-bb78-012dac954d6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_TinySphinx2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinySphinx2.0", - "id": "Daemontatox/TinySphinx2.0", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2535 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3168 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1731 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Zirel-7B-Math/8236db6a-ff8a-4237-af5a-03bb258f8e59.json b/data/hfopenllm_v2/Daemontatox/Zirel-7B-Math/8236db6a-ff8a-4237-af5a-03bb258f8e59.json deleted file mode 100644 index 00c47b014..000000000 --- a/data/hfopenllm_v2/Daemontatox/Zirel-7B-Math/8236db6a-ff8a-4237-af5a-03bb258f8e59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Zirel-7B-Math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zirel-7B-Math", - "id": "Daemontatox/Zirel-7B-Math", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6639 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5448 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4789 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4237 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Zirel_1.5/1a7b078e-bc1f-400f-a0cd-f7b535548f23.json b/data/hfopenllm_v2/Daemontatox/Zirel_1.5/1a7b078e-bc1f-400f-a0cd-f7b535548f23.json deleted file mode 100644 index 160e661a4..000000000 --- a/data/hfopenllm_v2/Daemontatox/Zirel_1.5/1a7b078e-bc1f-400f-a0cd-f7b535548f23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Zirel_1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zirel_1.5", - "id": "Daemontatox/Zirel_1.5", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4168 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3985 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/mini-Cogito-R1/fdaf561c-567c-416d-a74a-ac3c07c5be5b.json b/data/hfopenllm_v2/Daemontatox/mini-Cogito-R1/fdaf561c-567c-416d-a74a-ac3c07c5be5b.json deleted file mode 100644 
index a9a68a873..000000000 --- a/data/hfopenllm_v2/Daemontatox/mini-Cogito-R1/fdaf561c-567c-416d-a74a-ac3c07c5be5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_mini-Cogito-R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mini-Cogito-R1", - "id": "Daemontatox/mini-Cogito-R1", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2298 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1482 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/mini_Pathfinder/58900b3b-303b-49c8-b807-7b8d06601568.json b/data/hfopenllm_v2/Daemontatox/mini_Pathfinder/58900b3b-303b-49c8-b807-7b8d06601568.json deleted file mode 100644 index 08611e986..000000000 --- a/data/hfopenllm_v2/Daemontatox/mini_Pathfinder/58900b3b-303b-49c8-b807-7b8d06601568.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Daemontatox_mini_Pathfinder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mini_Pathfinder", - "id": "Daemontatox/mini_Pathfinder", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3956 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4751 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dampfinchen/Llama-3.1-8B-Ultra-Instruct/7ac5a45a-7b41-4f63-8556-8737638a00ea.json b/data/hfopenllm_v2/Dampfinchen/Llama-3.1-8B-Ultra-Instruct/7ac5a45a-7b41-4f63-8556-8737638a00ea.json deleted file mode 100644 index d6ac101ed..000000000 --- a/data/hfopenllm_v2/Dampfinchen/Llama-3.1-8B-Ultra-Instruct/7ac5a45a-7b41-4f63-8556-8737638a00ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dampfinchen_Llama-3.1-8B-Ultra-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Ultra-Instruct", - "id": "Dampfinchen/Llama-3.1-8B-Ultra-Instruct", - "developer": "Dampfinchen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8081 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5258 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Danielbrdz/Barcenas-10b/3cb55475-30c8-43c8-8d7d-394450fdc117.json b/data/hfopenllm_v2/Danielbrdz/Barcenas-10b/3cb55475-30c8-43c8-8d7d-394450fdc117.json deleted file mode 100644 index 104efe008..000000000 --- a/data/hfopenllm_v2/Danielbrdz/Barcenas-10b/3cb55475-30c8-43c8-8d7d-394450fdc117.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Danielbrdz_Barcenas-10b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barcenas-10b", - "id": "Danielbrdz/Barcenas-10b", - "developer": "Danielbrdz", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6608 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6121 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-Phi-3-medium-ORPO/f5e140ff-0c0e-4769-8116-63cf50255773.json b/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-Phi-3-medium-ORPO/f5e140ff-0c0e-4769-8116-63cf50255773.json deleted file mode 100644 index 4dc6150c8..000000000 --- a/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-Phi-3-medium-ORPO/f5e140ff-0c0e-4769-8116-63cf50255773.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Danielbrdz_Barcenas-14b-Phi-3-medium-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barcenas-14b-Phi-3-medium-ORPO", - "id": "Danielbrdz/Barcenas-14b-Phi-3-medium-ORPO", - "developer": "Danielbrdz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4799 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4723 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4-v2/df85ec6e-1325-40ce-8087-d960a1d767dd.json b/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4-v2/df85ec6e-1325-40ce-8087-d960a1d767dd.json deleted file mode 100644 index 604f4b71a..000000000 --- a/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4-v2/df85ec6e-1325-40ce-8087-d960a1d767dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Danielbrdz_Barcenas-14b-phi-4-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barcenas-14b-phi-4-v2", - "id": "Danielbrdz/Barcenas-14b-phi-4-v2", - "developer": "Danielbrdz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2775 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6573 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4399 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5244 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4/a7bd3fff-f01e-46ca-af85-5b4ac6ae7320.json b/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4/a7bd3fff-f01e-46ca-af85-5b4ac6ae7320.json deleted file mode 100644 index e976af74e..000000000 --- a/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4/a7bd3fff-f01e-46ca-af85-5b4ac6ae7320.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Danielbrdz_Barcenas-14b-phi-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barcenas-14b-phi-4", - "id": "Danielbrdz/Barcenas-14b-phi-4", - "developer": "Danielbrdz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6769 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2583 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5175 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Danielbrdz/Barcenas-3b-GRPO/11842dd9-0572-41ef-aaa0-8d19f3420efc.json b/data/hfopenllm_v2/Danielbrdz/Barcenas-3b-GRPO/11842dd9-0572-41ef-aaa0-8d19f3420efc.json deleted file mode 100644 index 85744ceae..000000000 --- a/data/hfopenllm_v2/Danielbrdz/Barcenas-3b-GRPO/11842dd9-0572-41ef-aaa0-8d19f3420efc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Danielbrdz_Barcenas-3b-GRPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barcenas-3b-GRPO", - "id": "Danielbrdz/Barcenas-3b-GRPO", - "developer": "Danielbrdz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5444 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4414 - 
} - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3576 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Danielbrdz/Barcenas-Llama3-8b-ORPO/01abccec-1cea-4060-89be-289987d0a2ce.json b/data/hfopenllm_v2/Danielbrdz/Barcenas-Llama3-8b-ORPO/01abccec-1cea-4060-89be-289987d0a2ce.json deleted file mode 100644 index a208f17df..000000000 --- a/data/hfopenllm_v2/Danielbrdz/Barcenas-Llama3-8b-ORPO/01abccec-1cea-4060-89be-289987d0a2ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Danielbrdz_Barcenas-Llama3-8b-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barcenas-Llama3-8b-ORPO", - "id": "Danielbrdz/Barcenas-Llama3-8b-ORPO", - "developer": "Danielbrdz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7372 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4987 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.419 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Danielbrdz/Barcenas-R1-Qwen-1.5b/dce8226c-57bd-4255-b813-8a70494f0a1a.json b/data/hfopenllm_v2/Danielbrdz/Barcenas-R1-Qwen-1.5b/dce8226c-57bd-4255-b813-8a70494f0a1a.json deleted file mode 100644 index 03948309d..000000000 --- a/data/hfopenllm_v2/Danielbrdz/Barcenas-R1-Qwen-1.5b/dce8226c-57bd-4255-b813-8a70494f0a1a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Danielbrdz_Barcenas-R1-Qwen-1.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barcenas-R1-Qwen-1.5b", - "id": "Danielbrdz/Barcenas-R1-Qwen-1.5b", - "developer": "Danielbrdz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2428 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3587 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1909 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-2/7f80e69c-eec6-49ac-a088-6248ee25f736.json b/data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-2/7f80e69c-eec6-49ac-a088-6248ee25f736.json deleted file mode 100644 index f352222ea..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-2/7f80e69c-eec6-49ac-a088-6248ee25f736.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_12b-mn-dans-reasoning-test-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "12b-mn-dans-reasoning-test-2", - "id": "Dans-DiscountModels/12b-mn-dans-reasoning-test-2", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4807 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2507 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-3/e0267a2c-dfc5-456e-864d-b5b0ad1fa508.json b/data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-3/e0267a2c-dfc5-456e-864d-b5b0ad1fa508.json deleted file mode 100644 index 570947dbd..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-3/e0267a2c-dfc5-456e-864d-b5b0ad1fa508.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_12b-mn-dans-reasoning-test-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "12b-mn-dans-reasoning-test-3", - "id": "Dans-DiscountModels/12b-mn-dans-reasoning-test-3", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5053 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4839 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4168 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2516 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-CoreCurriculum-12b-ChatML/e6ad37be-28f4-43b4-9df1-b7b47d31232e.json b/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-CoreCurriculum-12b-ChatML/e6ad37be-28f4-43b4-9df1-b7b47d31232e.json deleted file mode 100644 index fc2a36d87..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-CoreCurriculum-12b-ChatML/e6ad37be-28f4-43b4-9df1-b7b47d31232e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_Dans-Instruct-CoreCurriculum-12b-ChatML/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-Instruct-CoreCurriculum-12b-ChatML", - "id": "Dans-DiscountModels/Dans-Instruct-CoreCurriculum-12b-ChatML", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2111 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4792 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3606 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2805 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.0/5514368a-1f7d-4cd0-b7f7-d116b753f975.json b/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.0/5514368a-1f7d-4cd0-b7f7-d116b753f975.json deleted file mode 100644 index 83903ba4f..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.0/5514368a-1f7d-4cd0-b7f7-d116b753f975.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-Instruct-Mix-8b-ChatML-V0.1.0", - "id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.0", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0668 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4775 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3786 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3284 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.1/c0e29cf8-897f-4e07-abb4-71c801d34301.json b/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.1/c0e29cf8-897f-4e07-abb4-71c801d34301.json deleted file mode 100644 index 4a453f833..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.1/c0e29cf8-897f-4e07-abb4-71c801d34301.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-Instruct-Mix-8b-ChatML-V0.1.1", - "id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.1", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0911 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4749 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3279 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.2.0/68310379-65b2-482d-892b-f76547bce2b0.json b/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.2.0/68310379-65b2-482d-892b-f76547bce2b0.json deleted file mode 100644 index a2a7418c6..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.2.0/68310379-65b2-482d-892b-f76547bce2b0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-Instruct-Mix-8b-ChatML-V0.2.0", - "id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.2.0", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML/a034c4ec-d4cd-439b-8dbd-e67685ea7616.json b/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML/a034c4ec-d4cd-439b-8dbd-e67685ea7616.json deleted file mode 100644 index d03dd14df..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML/a034c4ec-d4cd-439b-8dbd-e67685ea7616.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-Instruct-Mix-8b-ChatML", - "id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4738 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MMLU-PRO", 
- "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/Mistral-7b-v0.3-Test-E0.7/e4b761d3-bb84-4433-b9fb-4c92ecae6279.json b/data/hfopenllm_v2/Dans-DiscountModels/Mistral-7b-v0.3-Test-E0.7/e4b761d3-bb84-4433-b9fb-4c92ecae6279.json deleted file mode 100644 index 17f92f2fb..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/Mistral-7b-v0.3-Test-E0.7/e4b761d3-bb84-4433-b9fb-4c92ecae6279.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_Mistral-7b-v0.3-Test-E0.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7b-v0.3-Test-E0.7", - "id": "Dans-DiscountModels/Mistral-7b-v0.3-Test-E0.7", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4005 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2744 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/mistral-7b-test-merged/38d78d30-be6d-476c-a3aa-d9a40f570a56.json b/data/hfopenllm_v2/Dans-DiscountModels/mistral-7b-test-merged/38d78d30-be6d-476c-a3aa-d9a40f570a56.json deleted file mode 100644 index 530b81d04..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/mistral-7b-test-merged/38d78d30-be6d-476c-a3aa-d9a40f570a56.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_mistral-7b-test-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-7b-test-merged", - "id": "Dans-DiscountModels/mistral-7b-test-merged", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6678 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4898 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2978 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Darkknight535/OpenCrystal-12B-L3/36e60f6c-60f7-4b17-88fe-82810e195fc7.json b/data/hfopenllm_v2/Darkknight535/OpenCrystal-12B-L3/36e60f6c-60f7-4b17-88fe-82810e195fc7.json deleted file mode 100644 index 62f99b5b5..000000000 --- a/data/hfopenllm_v2/Darkknight535/OpenCrystal-12B-L3/36e60f6c-60f7-4b17-88fe-82810e195fc7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Darkknight535_OpenCrystal-12B-L3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenCrystal-12B-L3", - "id": "Darkknight535/OpenCrystal-12B-L3", - "developer": "Darkknight535", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 11.52 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5223 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3657 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm/a6c647e8-ed24-4150-8563-dd9b20e21498.json 
b/data/hfopenllm_v2/DavidAU/DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm/a6c647e8-ed24-4150-8563-dd9b20e21498.json deleted file mode 100644 index 61ffee2d2..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm/a6c647e8-ed24-4150-8563-dd9b20e21498.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm", - "id": "DavidAU/DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 16.537 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4762 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1057 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B/b5a366ac-d736-4447-a2f1-98d0b84ba3bd.json 
b/data/hfopenllm_v2/DavidAU/DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B/b5a366ac-d736-4447-a2f1-98d0b84ba3bd.json deleted file mode 100644 index 408f2c43a..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B/b5a366ac-d736-4447-a2f1-98d0b84ba3bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B", - "id": "DavidAU/DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4887 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2976 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B/5d098dc6-8124-4d26-86ec-d54e6e09c3a6.json 
b/data/hfopenllm_v2/DavidAU/DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B/5d098dc6-8124-4d26-86ec-d54e6e09c3a6.json deleted file mode 100644 index ba72ae399..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B/5d098dc6-8124-4d26-86ec-d54e6e09c3a6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B", - "id": "DavidAU/DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 15.664 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2507 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4164 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2709 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B/1137cbc4-d80b-4e21-bfeb-feab41dc80b2.json 
b/data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B/1137cbc4-d80b-4e21-bfeb-feab41dc80b2.json deleted file mode 100644 index 55cc9cff5..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B/1137cbc4-d80b-4e21-bfeb-feab41dc80b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B", - "id": "DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.942 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3024 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B/097bbfbc-0ccd-4fd4-9e0c-9c192cba9e8b.json b/data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B/097bbfbc-0ccd-4fd4-9e0c-9c192cba9e8b.json deleted file mode 100644 index e4618c8df..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B/097bbfbc-0ccd-4fd4-9e0c-9c192cba9e8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B", - "id": "DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.942 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4769 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DavidAU/DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm/db8c6169-bfc1-48bb-be53-fa93c673f051.json b/data/hfopenllm_v2/DavidAU/DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm/db8c6169-bfc1-48bb-be53-fa93c673f051.json deleted file mode 100644 index a94c29f09..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm/db8c6169-bfc1-48bb-be53-fa93c673f051.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm", - "id": "DavidAU/DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 25.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5807 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DavidAU/DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B/41437fc9-6d48-4317-a8de-ab4e63b2cf46.json b/data/hfopenllm_v2/DavidAU/DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B/41437fc9-6d48-4317-a8de-ab4e63b2cf46.json deleted file mode 100644 index 0f3796fae..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B/41437fc9-6d48-4317-a8de-ab4e63b2cf46.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B", - "id": "DavidAU/DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 16.537 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2853 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2778 - } - } - ] -} \ No newline at 
end of file diff --git a/data/hfopenllm_v2/DavidAU/DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B/e075f4fe-95e0-48f4-94c4-f6ebd3f4edaa.json b/data/hfopenllm_v2/DavidAU/DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B/e075f4fe-95e0-48f4-94c4-f6ebd3f4edaa.json deleted file mode 100644 index 774de1d5e..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B/e075f4fe-95e0-48f4-94c4-f6ebd3f4edaa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B", - "id": "DavidAU/DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 18.405 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.272 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-9B/3349d66c-e12b-49c1-a406-e0e77b697458.json b/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-9B/3349d66c-e12b-49c1-a406-e0e77b697458.json deleted file mode 100644 index 1978b8349..000000000 --- a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-9B/3349d66c-e12b-49c1-a406-e0e77b697458.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_Gemma-The-Writer-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-The-Writer-9B", - "id": "DavidAU/Gemma-The-Writer-9B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5905 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4099 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-DEADLINE-10B/7aa0ff6b-11a9-4554-a27f-e477a0ff77c7.json b/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-DEADLINE-10B/7aa0ff6b-11a9-4554-a27f-e477a0ff77c7.json deleted file mode 100644 index 5af39ed94..000000000 --- 
a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-DEADLINE-10B/7aa0ff6b-11a9-4554-a27f-e477a0ff77c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_Gemma-The-Writer-DEADLINE-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-The-Writer-DEADLINE-10B", - "id": "DavidAU/Gemma-The-Writer-DEADLINE-10B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.952 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2332 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5896 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4189 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-J.GutenBerg-10B/ac749485-df6d-485e-8fa7-63bdfd744167.json b/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-J.GutenBerg-10B/ac749485-df6d-485e-8fa7-63bdfd744167.json deleted file mode 100644 index f0e5db406..000000000 --- a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-J.GutenBerg-10B/ac749485-df6d-485e-8fa7-63bdfd744167.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/DavidAU_Gemma-The-Writer-J.GutenBerg-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-The-Writer-J.GutenBerg-10B", - "id": "DavidAU/Gemma-The-Writer-J.GutenBerg-10B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.034 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2858 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5909 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4176 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-Mighty-Sword-9B/54363a4b-312b-4035-a1c3-b5321311cec4.json b/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-Mighty-Sword-9B/54363a4b-312b-4035-a1c3-b5321311cec4.json deleted file mode 100644 index fdb1da8cf..000000000 --- a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-Mighty-Sword-9B/54363a4b-312b-4035-a1c3-b5321311cec4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_Gemma-The-Writer-Mighty-Sword-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", 
- "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-The-Writer-Mighty-Sword-9B", - "id": "DavidAU/Gemma-The-Writer-Mighty-Sword-9B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1911 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4112 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-N-Restless-Quill-10B-Uncensored/aa9e2b9e-cd25-4492-9801-eba7d40b4365.json b/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-N-Restless-Quill-10B-Uncensored/aa9e2b9e-cd25-4492-9801-eba7d40b4365.json deleted file mode 100644 index 949b96b5e..000000000 --- a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-N-Restless-Quill-10B-Uncensored/aa9e2b9e-cd25-4492-9801-eba7d40b4365.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_Gemma-The-Writer-N-Restless-Quill-10B-Uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Gemma-The-Writer-N-Restless-Quill-10B-Uncensored", - "id": "DavidAU/Gemma-The-Writer-N-Restless-Quill-10B-Uncensored", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.034 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7071 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5922 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2296 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4163 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3-DARKEST-PLANET-16.5B/c6b484b8-f6f3-4516-aff5-c2f6438c9047.json b/data/hfopenllm_v2/DavidAU/L3-DARKEST-PLANET-16.5B/c6b484b8-f6f3-4516-aff5-c2f6438c9047.json deleted file mode 100644 index adcdba506..000000000 --- a/data/hfopenllm_v2/DavidAU/L3-DARKEST-PLANET-16.5B/c6b484b8-f6f3-4516-aff5-c2f6438c9047.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3-DARKEST-PLANET-16.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-DARKEST-PLANET-16.5B", - "id": "DavidAU/L3-DARKEST-PLANET-16.5B", - "developer": "DavidAU", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 16.537 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6231 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3-Dark-Planet-8B/c6c760c9-a345-4e25-b333-b403bf6db389.json b/data/hfopenllm_v2/DavidAU/L3-Dark-Planet-8B/c6c760c9-a345-4e25-b333-b403bf6db389.json deleted file mode 100644 index 2857bf738..000000000 --- a/data/hfopenllm_v2/DavidAU/L3-Dark-Planet-8B/c6c760c9-a345-4e25-b333-b403bf6db389.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3-Dark-Planet-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Dark-Planet-8B", - "id": "DavidAU/L3-Dark-Planet-8B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4134 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3-Jamet-12.2B-MK.V-Blackroot-Instruct/65b2aa58-2c04-48f2-9ea3-c8fd97cb9dde.json b/data/hfopenllm_v2/DavidAU/L3-Jamet-12.2B-MK.V-Blackroot-Instruct/65b2aa58-2c04-48f2-9ea3-c8fd97cb9dde.json deleted file mode 100644 index d2d3fcadd..000000000 --- a/data/hfopenllm_v2/DavidAU/L3-Jamet-12.2B-MK.V-Blackroot-Instruct/65b2aa58-2c04-48f2-9ea3-c8fd97cb9dde.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3-Jamet-12.2B-MK.V-Blackroot-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Jamet-12.2B-MK.V-Blackroot-Instruct", - "id": "DavidAU/L3-Jamet-12.2B-MK.V-Blackroot-Instruct", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 12.174 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3291 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3-Lumimaid-12.2B-v0.1-OAS-Instruct/92903344-0dde-4f5a-a7d2-749a1ffe9cd3.json b/data/hfopenllm_v2/DavidAU/L3-Lumimaid-12.2B-v0.1-OAS-Instruct/92903344-0dde-4f5a-a7d2-749a1ffe9cd3.json deleted file mode 100644 index 0b5fe1f21..000000000 --- a/data/hfopenllm_v2/DavidAU/L3-Lumimaid-12.2B-v0.1-OAS-Instruct/92903344-0dde-4f5a-a7d2-749a1ffe9cd3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3-Lumimaid-12.2B-v0.1-OAS-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Lumimaid-12.2B-v0.1-OAS-Instruct", - "id": "DavidAU/L3-Lumimaid-12.2B-v0.1-OAS-Instruct", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 12.174 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4693 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4194 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3-SMB-Instruct-12.2B-F32/59ddd478-c1cd-4bd8-80c3-fdebe762414a.json b/data/hfopenllm_v2/DavidAU/L3-SMB-Instruct-12.2B-F32/59ddd478-c1cd-4bd8-80c3-fdebe762414a.json deleted file mode 100644 index 39a911ded..000000000 --- a/data/hfopenllm_v2/DavidAU/L3-SMB-Instruct-12.2B-F32/59ddd478-c1cd-4bd8-80c3-fdebe762414a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3-SMB-Instruct-12.2B-F32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-SMB-Instruct-12.2B-F32", - "id": "DavidAU/L3-SMB-Instruct-12.2B-F32", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 12.174 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4303 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4786 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3-Stheno-Maid-Blackroot-Grand-HORROR-16B/02f63fc6-9376-4fb5-b067-63493238cc27.json b/data/hfopenllm_v2/DavidAU/L3-Stheno-Maid-Blackroot-Grand-HORROR-16B/02f63fc6-9376-4fb5-b067-63493238cc27.json deleted file mode 100644 index abc99f61c..000000000 --- a/data/hfopenllm_v2/DavidAU/L3-Stheno-Maid-Blackroot-Grand-HORROR-16B/02f63fc6-9376-4fb5-b067-63493238cc27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3-Stheno-Maid-Blackroot-Grand-HORROR-16B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Stheno-Maid-Blackroot-Grand-HORROR-16B", - "id": "DavidAU/L3-Stheno-Maid-Blackroot-Grand-HORROR-16B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 16.537 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3439 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3-Stheno-v3.2-12.2B-Instruct/dd7597fd-27f5-4e77-a44f-b01d0db82719.json b/data/hfopenllm_v2/DavidAU/L3-Stheno-v3.2-12.2B-Instruct/dd7597fd-27f5-4e77-a44f-b01d0db82719.json deleted file mode 100644 index e2f7047b8..000000000 --- a/data/hfopenllm_v2/DavidAU/L3-Stheno-v3.2-12.2B-Instruct/dd7597fd-27f5-4e77-a44f-b01d0db82719.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3-Stheno-v3.2-12.2B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Stheno-v3.2-12.2B-Instruct", - "id": "DavidAU/L3-Stheno-v3.2-12.2B-Instruct", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 12.174 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4028 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4846 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3.1-Dark-Planet-SpinFire-Uncensored-8B/20cd0d60-eb0d-41bd-b37f-910a03dd7f82.json b/data/hfopenllm_v2/DavidAU/L3.1-Dark-Planet-SpinFire-Uncensored-8B/20cd0d60-eb0d-41bd-b37f-910a03dd7f82.json deleted file mode 100644 index c256c420f..000000000 --- a/data/hfopenllm_v2/DavidAU/L3.1-Dark-Planet-SpinFire-Uncensored-8B/20cd0d60-eb0d-41bd-b37f-910a03dd7f82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3.1-Dark-Planet-SpinFire-Uncensored-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Dark-Planet-SpinFire-Uncensored-8B", - "id": "DavidAU/L3.1-Dark-Planet-SpinFire-Uncensored-8B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5261 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B/c4e9d045-3769-4828-a2ca-7fa508873089.json b/data/hfopenllm_v2/DavidAU/L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B/c4e9d045-3769-4828-a2ca-7fa508873089.json deleted file mode 100644 index 0420b014e..000000000 --- a/data/hfopenllm_v2/DavidAU/L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B/c4e9d045-3769-4828-a2ca-7fa508873089.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B", - "id": "DavidAU/L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 13.668 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3345 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2606 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3749 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2892 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B/0a0501ec-4ecd-47c1-914b-d473f795cef2.json b/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B/0a0501ec-4ecd-47c1-914b-d473f795cef2.json deleted file mode 100644 index 0a0b5cf10..000000000 --- a/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B/0a0501ec-4ecd-47c1-914b-d473f795cef2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B", - "id": "DavidAU/Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 4.089 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1783 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3033 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3715 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B/beca755f-203f-4bc8-b5cf-f9a9e3f8bd8f.json b/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B/beca755f-203f-4bc8-b5cf-f9a9e3f8bd8f.json deleted file mode 100644 index 3d03444da..000000000 --- a/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B/beca755f-203f-4bc8-b5cf-f9a9e3f8bd8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B", - "id": "DavidAU/Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 19.022 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2835 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3592 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2417 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1636 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32/79e1e1c6-cbe0-43a9-a593-8e2119baaf77.json b/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32/79e1e1c6-cbe0-43a9-a593-8e2119baaf77.json deleted file mode 100644 index 60721bfc8..000000000 --- a/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32/79e1e1c6-cbe0-43a9-a593-8e2119baaf77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32", - "id": "DavidAU/Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 8.714 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3286 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3404 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Davidsv/SUONG-1/def80b44-3d9a-46ba-bf5f-ffc81e50af2e.json b/data/hfopenllm_v2/Davidsv/SUONG-1/def80b44-3d9a-46ba-bf5f-ffc81e50af2e.json deleted file mode 100644 index 392a9563b..000000000 --- a/data/hfopenllm_v2/Davidsv/SUONG-1/def80b44-3d9a-46ba-bf5f-ffc81e50af2e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Davidsv_SUONG-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SUONG-1", - "id": "Davidsv/SUONG-1", - "developer": "Davidsv", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 2.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2497 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1085 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/5e1aa809-ef20-445e-a05b-eccd585d5991.json b/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/5e1aa809-ef20-445e-a05b-eccd585d5991.json deleted file mode 100644 index a70497e8f..000000000 --- a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/5e1aa809-ef20-445e-a05b-eccd585d5991.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavieLion_Llama-3.2-1B-SPIN-iter0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-SPIN-iter0", - "id": "DavieLion/Llama-3.2-1B-SPIN-iter0", - "developer": "DavieLion", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1507 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/7c2be651-ca56-4285-afc7-1bfe1c8ce11e.json b/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/7c2be651-ca56-4285-afc7-1bfe1c8ce11e.json deleted file mode 100644 index ff2f7e65f..000000000 --- a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/7c2be651-ca56-4285-afc7-1bfe1c8ce11e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavieLion_Llama-3.2-1B-SPIN-iter0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-SPIN-iter0", - "id": "DavieLion/Llama-3.2-1B-SPIN-iter0", - "developer": "DavieLion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter1/cfe4ea72-ddb9-49b5-9599-99f215e112e5.json b/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter1/cfe4ea72-ddb9-49b5-9599-99f215e112e5.json deleted file mode 100644 index 6cf1f5440..000000000 --- a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter1/cfe4ea72-ddb9-49b5-9599-99f215e112e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavieLion_Llama-3.2-1B-SPIN-iter1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-SPIN-iter1", - "id": "DavieLion/Llama-3.2-1B-SPIN-iter1", - "developer": "DavieLion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.294 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3646 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - } - ] 
-} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter2/81d63d8e-88dd-4b16-b9b8-d07604878f8f.json b/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter2/81d63d8e-88dd-4b16-b9b8-d07604878f8f.json deleted file mode 100644 index b5a6f68d3..000000000 --- a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter2/81d63d8e-88dd-4b16-b9b8-d07604878f8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavieLion_Llama-3.2-1B-SPIN-iter2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-SPIN-iter2", - "id": "DavieLion/Llama-3.2-1B-SPIN-iter2", - "developer": "DavieLion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1376 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3553 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/81f8208b-f7e7-4685-bb84-321d9e097470.json 
b/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/81f8208b-f7e7-4685-bb84-321d9e097470.json deleted file mode 100644 index fe2d84160..000000000 --- a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/81f8208b-f7e7-4685-bb84-321d9e097470.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavieLion_Llama-3.2-1B-SPIN-iter3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-SPIN-iter3", - "id": "DavieLion/Llama-3.2-1B-SPIN-iter3", - "developer": "DavieLion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1324 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2972 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/a0c9a434-9b8c-47c5-b511-9daac7901686.json b/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/a0c9a434-9b8c-47c5-b511-9daac7901686.json deleted file mode 100644 index e22bf8212..000000000 --- 
a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/a0c9a434-9b8c-47c5-b511-9daac7901686.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavieLion_Llama-3.2-1B-SPIN-iter3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-SPIN-iter3", - "id": "DavieLion/Llama-3.2-1B-SPIN-iter3", - "developer": "DavieLion", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2975 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavieLion/Lllma-3.2-1B/28b60eae-1b38-4404-8db1-3fb2997583f4.json b/data/hfopenllm_v2/DavieLion/Lllma-3.2-1B/28b60eae-1b38-4404-8db1-3fb2997583f4.json deleted file mode 100644 index 9df57bb74..000000000 --- a/data/hfopenllm_v2/DavieLion/Lllma-3.2-1B/28b60eae-1b38-4404-8db1-3fb2997583f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavieLion_Lllma-3.2-1B/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lllma-3.2-1B", - "id": "DavieLion/Lllma-3.2-1B", - "developer": "DavieLion", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2965 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DebateLabKIT/Llama-3.1-Argunaut-1-8B-SFT/746862a2-a90c-4612-91d0-f989b9eed1a5.json b/data/hfopenllm_v2/DebateLabKIT/Llama-3.1-Argunaut-1-8B-SFT/746862a2-a90c-4612-91d0-f989b9eed1a5.json deleted file mode 100644 index 64711247e..000000000 --- a/data/hfopenllm_v2/DebateLabKIT/Llama-3.1-Argunaut-1-8B-SFT/746862a2-a90c-4612-91d0-f989b9eed1a5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DebateLabKIT_Llama-3.1-Argunaut-1-8B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Llama-3.1-Argunaut-1-8B-SFT", - "id": "DebateLabKIT/Llama-3.1-Argunaut-1-8B-SFT", - "developer": "DebateLabKIT", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5519 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4824 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4503 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3472 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Deci/DeciLM-7B-instruct/715ee057-9c9a-4e04-991c-7040b1eef65b.json b/data/hfopenllm_v2/Deci/DeciLM-7B-instruct/715ee057-9c9a-4e04-991c-7040b1eef65b.json deleted file mode 100644 index 19a0bd337..000000000 --- a/data/hfopenllm_v2/Deci/DeciLM-7B-instruct/715ee057-9c9a-4e04-991c-7040b1eef65b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Deci_DeciLM-7B-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeciLM-7B-instruct", - "id": "Deci/DeciLM-7B-instruct", - "developer": "Deci", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"DeciLMForCausalLM", - "params_billions": 7.044 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2608 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Deci/DeciLM-7B/4dc1d103-3458-4b8c-9e63-b98effd69667.json b/data/hfopenllm_v2/Deci/DeciLM-7B/4dc1d103-3458-4b8c-9e63-b98effd69667.json deleted file mode 100644 index 7a095aa90..000000000 --- a/data/hfopenllm_v2/Deci/DeciLM-7B/4dc1d103-3458-4b8c-9e63-b98effd69667.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Deci_DeciLM-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeciLM-7B", - "id": "Deci/DeciLM-7B", - "developer": "Deci", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "DeciLMForCausalLM", - "params_billions": 7.044 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4423 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4359 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2692 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.1-8B-Inst/070ff2a5-9a5d-48cf-8517-1ad9b6642d59.json b/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.1-8B-Inst/070ff2a5-9a5d-48cf-8517-1ad9b6642d59.json deleted file mode 100644 index 693d2501c..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.1-8B-Inst/070ff2a5-9a5d-48cf-8517-1ad9b6642d59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_Explore_Llama-3.1-8B-Inst/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Explore_Llama-3.1-8B-Inst", - "id": "DeepAutoAI/Explore_Llama-3.1-8B-Inst", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5117 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2009 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst/8406a5b8-a87d-489b-b75b-00e9f675f09f.json b/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst/8406a5b8-a87d-489b-b75b-00e9f675f09f.json deleted file mode 100644 index 2d0dd9463..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst/8406a5b8-a87d-489b-b75b-00e9f675f09f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_Explore_Llama-3.2-1B-Inst/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Explore_Llama-3.2-1B-Inst", - "id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3505 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3183 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v0/11e8f9b6-32ab-4b83-a601-e5644c0b2c39.json b/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v0/11e8f9b6-32ab-4b83-a601-e5644c0b2c39.json deleted file mode 100644 index 6773c3179..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v0/11e8f9b6-32ab-4b83-a601-e5644c0b2c39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Explore_Llama-3.2-1B-Inst_v0", - "id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst_v0", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5597 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3365 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3103 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1804 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1.1/6b542f5a-ea62-45ce-8e98-436a4d058877.json b/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1.1/6b542f5a-ea62-45ce-8e98-436a4d058877.json deleted file mode 100644 index a3c9bf2aa..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1.1/6b542f5a-ea62-45ce-8e98-436a4d058877.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Explore_Llama-3.2-1B-Inst_v1.1", - "id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1.1", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5844 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3513 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3117 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1818 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1/9b280640-bfee-4730-acc3-386a54b2434c.json b/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1/9b280640-bfee-4730-acc3-386a54b2434c.json deleted file mode 100644 index 0e0834213..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1/9b280640-bfee-4730-acc3-386a54b2434c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Explore_Llama-3.2-1B-Inst_v1", - "id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4999 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/causal_gpt2/eff5171b-6119-4013-8aa8-8a4f0215b045.json b/data/hfopenllm_v2/DeepAutoAI/causal_gpt2/eff5171b-6119-4013-8aa8-8a4f0215b045.json deleted file mode 100644 index f4355b69c..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/causal_gpt2/eff5171b-6119-4013-8aa8-8a4f0215b045.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_causal_gpt2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "causal_gpt2", - "id": "DeepAutoAI/causal_gpt2", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.427 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/d2nwg_Llama-3.1-8B-Instruct-v0.0/471c5fed-f155-4521-9d9c-b5370ca91bec.json b/data/hfopenllm_v2/DeepAutoAI/d2nwg_Llama-3.1-8B-Instruct-v0.0/471c5fed-f155-4521-9d9c-b5370ca91bec.json deleted file mode 100644 index db64d4ff1..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/d2nwg_Llama-3.1-8B-Instruct-v0.0/471c5fed-f155-4521-9d9c-b5370ca91bec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_d2nwg_Llama-3.1-8B-Instruct-v0.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "d2nwg_Llama-3.1-8B-Instruct-v0.0", - "id": "DeepAutoAI/d2nwg_Llama-3.1-8B-Instruct-v0.0", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3877 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2/690be099-3ace-484f-b01f-2fe6b324d12a.json b/data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2/690be099-3ace-484f-b01f-2fe6b324d12a.json deleted file mode 100644 index e1f3baeef..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2/690be099-3ace-484f-b01f-2fe6b324d12a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_d2nwg_causal_gpt2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "d2nwg_causal_gpt2", - "id": "DeepAutoAI/d2nwg_causal_gpt2", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1916 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3027 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4297 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1151 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2_v1/71fbd15f-5eec-40d9-84e8-07323f3ffac6.json b/data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2_v1/71fbd15f-5eec-40d9-84e8-07323f3ffac6.json deleted file mode 100644 index 97cbbb48b..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2_v1/71fbd15f-5eec-40d9-84e8-07323f3ffac6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_d2nwg_causal_gpt2_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "d2nwg_causal_gpt2_v1", - "id": "DeepAutoAI/d2nwg_causal_gpt2_v1", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1989 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2992 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4337 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1135 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Inst/eb93dd3e-3d13-4234-bb66-f6177648aa2b.json b/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Inst/eb93dd3e-3d13-4234-bb66-f6177648aa2b.json deleted file mode 100644 index 02684506f..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Inst/eb93dd3e-3d13-4234-bb66-f6177648aa2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_ldm_soup_Llama-3.1-8B-Inst/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ldm_soup_Llama-3.1-8B-Inst", - "id": "DeepAutoAI/ldm_soup_Llama-3.1-8B-Inst", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5121 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3886 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.0/f7ec1ed7-cc30-4879-8ab1-4909011553d5.json b/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.0/f7ec1ed7-cc30-4879-8ab1-4909011553d5.json deleted file mode 100644 index 78c4abb58..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.0/f7ec1ed7-cc30-4879-8ab1-4909011553d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_ldm_soup_Llama-3.1-8B-Instruct-v0.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ldm_soup_Llama-3.1-8B-Instruct-v0.0", - "id": "DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.0", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5125 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4121 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3895 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.1/3e100704-dbd3-4d05-b325-5bb4bc90e51c.json 
b/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.1/3e100704-dbd3-4d05-b325-5bb4bc90e51c.json deleted file mode 100644 index dbebcb08a..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.1/3e100704-dbd3-4d05-b325-5bb4bc90e51c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_ldm_soup_Llama-3.1-8B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ldm_soup_Llama-3.1-8B-Instruct-v0.1", - "id": "DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.1", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5125 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4121 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3895 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B/12f003ef-1098-4d3f-aed7-7343034157bc.json b/data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B/12f003ef-1098-4d3f-aed7-7343034157bc.json deleted file mode 100644 index d3e936e6c..000000000 --- 
a/data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B/12f003ef-1098-4d3f-aed7-7343034157bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Lexora-Lite-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lexora-Lite-3B", - "id": "DeepMount00/Lexora-Lite-3B", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5776 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4873 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2304 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3602 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B_v2/9de2e564-3a30-4f1c-80da-6432a245a64f.json b/data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B_v2/9de2e564-3a30-4f1c-80da-6432a245a64f.json deleted file mode 100644 index 6c7c5b9aa..000000000 --- a/data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B_v2/9de2e564-3a30-4f1c-80da-6432a245a64f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Lexora-Lite-3B_v2/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lexora-Lite-3B_v2", - "id": "DeepMount00/Lexora-Lite-3B_v2", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4943 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3822 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3544 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Lexora-Medium-7B/dd5aaa3f-b24b-4a5b-852b-b80f4a6bf366.json b/data/hfopenllm_v2/DeepMount00/Lexora-Medium-7B/dd5aaa3f-b24b-4a5b-852b-b80f4a6bf366.json deleted file mode 100644 index 65cb6dbf1..000000000 --- a/data/hfopenllm_v2/DeepMount00/Lexora-Medium-7B/dd5aaa3f-b24b-4a5b-852b-b80f4a6bf366.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Lexora-Medium-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "Lexora-Medium-7B", - "id": "DeepMount00/Lexora-Medium-7B", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5145 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2221 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4439 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4325 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Llama-3-8b-Ita/8d8b9fd2-43f6-4edc-8340-44d20824a7e7.json b/data/hfopenllm_v2/DeepMount00/Llama-3-8b-Ita/8d8b9fd2-43f6-4edc-8340-44d20824a7e7.json deleted file mode 100644 index c5cf4ed7b..000000000 --- a/data/hfopenllm_v2/DeepMount00/Llama-3-8b-Ita/8d8b9fd2-43f6-4edc-8340-44d20824a7e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3-8b-Ita/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8b-Ita", - "id": "DeepMount00/Llama-3-8b-Ita", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/7fe45c20-a2c0-4acf-9425-651a1ec3b0d0.json b/data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/7fe45c20-a2c0-4acf-9425-651a1ec3b0d0.json deleted file mode 100644 index 3e1809697..000000000 --- a/data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/7fe45c20-a2c0-4acf-9425-651a1ec3b0d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3.1-8b-Ita/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8b-Ita", - "id": "DeepMount00/Llama-3.1-8b-Ita", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Unknown", - "params_billions": 0.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/baf93ef6-56f3-4809-93f6-32dcf4730388.json b/data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/baf93ef6-56f3-4809-93f6-32dcf4730388.json deleted file mode 100644 index 3d7b72221..000000000 --- a/data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/baf93ef6-56f3-4809-93f6-32dcf4730388.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3.1-8b-ITA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8b-ITA", - "id": "DeepMount00/Llama-3.1-8b-ITA", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7917 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Llama-3.1-Distilled/f6df14bd-207c-4fea-b789-c9f9aef749b3.json b/data/hfopenllm_v2/DeepMount00/Llama-3.1-Distilled/f6df14bd-207c-4fea-b789-c9f9aef749b3.json deleted file mode 100644 index 5d76d499b..000000000 --- a/data/hfopenllm_v2/DeepMount00/Llama-3.1-Distilled/f6df14bd-207c-4fea-b789-c9f9aef749b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3.1-Distilled/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Distilled", - "id": "DeepMount00/Llama-3.1-Distilled", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7844 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5101 - } 
- }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4058 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita/97766a7f-cf5b-46ae-b51e-5c5702ae000b.json b/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita/97766a7f-cf5b-46ae-b51e-5c5702ae000b.json deleted file mode 100644 index dfa22cee3..000000000 --- a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita/97766a7f-cf5b-46ae-b51e-5c5702ae000b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Qwen2-1.5B-Ita/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-1.5B-Ita", - "id": "DeepMount00/Qwen2-1.5B-Ita", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5173 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3504 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2772 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v2/d5cd2a1b-3def-4b33-a8fe-4b02e090db27.json b/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v2/d5cd2a1b-3def-4b33-a8fe-4b02e090db27.json deleted file mode 100644 index 28635f94d..000000000 --- a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v2/d5cd2a1b-3def-4b33-a8fe-4b02e090db27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Qwen2-1.5B-Ita_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-1.5B-Ita_v2", - "id": "DeepMount00/Qwen2-1.5B-Ita_v2", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3954 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3032 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v3/275d4bf0-566c-4b50-86b9-38c7f45df143.json b/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v3/275d4bf0-566c-4b50-86b9-38c7f45df143.json deleted file mode 100644 index 4d8e80e4f..000000000 --- a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v3/275d4bf0-566c-4b50-86b9-38c7f45df143.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Qwen2-1.5B-Ita_v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-1.5B-Ita_v3", - "id": "DeepMount00/Qwen2-1.5B-Ita_v3", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.489 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3948 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3018 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v5/aa504db9-81f3-424f-b7d9-683ebe31f5d8.json b/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v5/aa504db9-81f3-424f-b7d9-683ebe31f5d8.json deleted file mode 100644 index d9344d807..000000000 --- a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v5/aa504db9-81f3-424f-b7d9-683ebe31f5d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Qwen2-1.5B-Ita_v5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-1.5B-Ita_v5", - "id": "DeepMount00/Qwen2-1.5B-Ita_v5", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4987 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2943 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v6/2cc209b7-ef10-435d-a840-b904ab741491.json b/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v6/2cc209b7-ef10-435d-a840-b904ab741491.json deleted file mode 100644 index a299046f1..000000000 --- a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v6/2cc209b7-ef10-435d-a840-b904ab741491.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Qwen2-1.5B-Ita_v6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-1.5B-Ita_v6", - "id": "DeepMount00/Qwen2-1.5B-Ita_v6", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.497 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2999 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4249 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3755 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2872 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Qwen2.5-7B-Instruct-MathCoder/9b9390ac-fd65-4a58-9834-5352aa340cdc.json b/data/hfopenllm_v2/DeepMount00/Qwen2.5-7B-Instruct-MathCoder/9b9390ac-fd65-4a58-9834-5352aa340cdc.json deleted file mode 100644 index b0e72b4e2..000000000 --- a/data/hfopenllm_v2/DeepMount00/Qwen2.5-7B-Instruct-MathCoder/9b9390ac-fd65-4a58-9834-5352aa340cdc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Qwen2.5-7B-Instruct-MathCoder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-MathCoder", - "id": "DeepMount00/Qwen2.5-7B-Instruct-MathCoder", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.153 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2998 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3806 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/mergekit-ties-okvgjfz/4efe5cd4-6b8a-4951-a63a-4c7dc390bbec.json b/data/hfopenllm_v2/DeepMount00/mergekit-ties-okvgjfz/4efe5cd4-6b8a-4951-a63a-4c7dc390bbec.json deleted file mode 100644 index 9daeaca32..000000000 --- a/data/hfopenllm_v2/DeepMount00/mergekit-ties-okvgjfz/4efe5cd4-6b8a-4951-a63a-4c7dc390bbec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_mergekit-ties-okvgjfz/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-ties-okvgjfz", - "id": "DeepMount00/mergekit-ties-okvgjfz", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.153 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2998 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3806 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Delta-Vector/Baldur-8B/4bc5a0db-1c88-4c61-9343-1d340305ecc5.json b/data/hfopenllm_v2/Delta-Vector/Baldur-8B/4bc5a0db-1c88-4c61-9343-1d340305ecc5.json deleted file mode 100644 index 4eaef8e92..000000000 --- a/data/hfopenllm_v2/Delta-Vector/Baldur-8B/4bc5a0db-1c88-4c61-9343-1d340305ecc5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Delta-Vector_Baldur-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Baldur-8B", - "id": "Delta-Vector/Baldur-8B", - "developer": "Delta-Vector", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4782 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3654 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Delta-Vector/Control-8B-V1.1/74527f51-dcec-4b82-8ba8-075c933404f5.json b/data/hfopenllm_v2/Delta-Vector/Control-8B-V1.1/74527f51-dcec-4b82-8ba8-075c933404f5.json deleted file mode 100644 index 5f8e589b6..000000000 --- 
a/data/hfopenllm_v2/Delta-Vector/Control-8B-V1.1/74527f51-dcec-4b82-8ba8-075c933404f5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Delta-Vector_Control-8B-V1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Control-8B-V1.1", - "id": "Delta-Vector/Control-8B-V1.1", - "developer": "Delta-Vector", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4993 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3745 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Delta-Vector/Control-8B/ac31bc90-3854-4d38-925d-ef8dc7e75d24.json b/data/hfopenllm_v2/Delta-Vector/Control-8B/ac31bc90-3854-4d38-925d-ef8dc7e75d24.json deleted file mode 100644 index e9f8fccf8..000000000 --- a/data/hfopenllm_v2/Delta-Vector/Control-8B/ac31bc90-3854-4d38-925d-ef8dc7e75d24.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Delta-Vector_Control-8B/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Control-8B", - "id": "Delta-Vector/Control-8B", - "developer": "Delta-Vector", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5041 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3732 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Delta-Vector/Darkens-8B/88583cff-1adc-4b1b-8e68-07f0074d0ae2.json b/data/hfopenllm_v2/Delta-Vector/Darkens-8B/88583cff-1adc-4b1b-8e68-07f0074d0ae2.json deleted file mode 100644 index da2477729..000000000 --- a/data/hfopenllm_v2/Delta-Vector/Darkens-8B/88583cff-1adc-4b1b-8e68-07f0074d0ae2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Delta-Vector_Darkens-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Darkens-8B", - "id": "Delta-Vector/Darkens-8B", - 
"developer": "Delta-Vector", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.414 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2548 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5251 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4106 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3736 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Delta-Vector/Henbane-7b-attempt2/fadbac9e-7224-41d1-abfa-7039cbcba9f6.json b/data/hfopenllm_v2/Delta-Vector/Henbane-7b-attempt2/fadbac9e-7224-41d1-abfa-7039cbcba9f6.json deleted file mode 100644 index 18bdc7508..000000000 --- a/data/hfopenllm_v2/Delta-Vector/Henbane-7b-attempt2/fadbac9e-7224-41d1-abfa-7039cbcba9f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Delta-Vector_Henbane-7b-attempt2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Henbane-7b-attempt2", - "id": "Delta-Vector/Henbane-7b-attempt2", - "developer": "Delta-Vector", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4157 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5061 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3973 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4028 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Delta-Vector/Odin-9B/1fb90540-0fa0-44ca-ad67-1e3503f6b729.json b/data/hfopenllm_v2/Delta-Vector/Odin-9B/1fb90540-0fa0-44ca-ad67-1e3503f6b729.json deleted file mode 100644 index e6b6d563d..000000000 --- a/data/hfopenllm_v2/Delta-Vector/Odin-9B/1fb90540-0fa0-44ca-ad67-1e3503f6b729.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Delta-Vector_Odin-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Odin-9B", - "id": "Delta-Vector/Odin-9B", - "developer": "Delta-Vector", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3692 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4648 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4047 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Delta-Vector/Tor-8B/047784e2-c1ee-40d9-a60d-e43504825801.json b/data/hfopenllm_v2/Delta-Vector/Tor-8B/047784e2-c1ee-40d9-a60d-e43504825801.json deleted file mode 100644 index 226245e5e..000000000 --- a/data/hfopenllm_v2/Delta-Vector/Tor-8B/047784e2-c1ee-40d9-a60d-e43504825801.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Delta-Vector_Tor-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tor-8B", - "id": "Delta-Vector/Tor-8B", - "developer": "Delta-Vector", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.414 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DevQuasar/DevQuasar-R1-Uncensored-Llama-8B/ee60453d-2d51-46f7-8a18-c651d590f0e7.json b/data/hfopenllm_v2/DevQuasar/DevQuasar-R1-Uncensored-Llama-8B/ee60453d-2d51-46f7-8a18-c651d590f0e7.json deleted file mode 100644 index 0c6a93423..000000000 --- a/data/hfopenllm_v2/DevQuasar/DevQuasar-R1-Uncensored-Llama-8B/ee60453d-2d51-46f7-8a18-c651d590f0e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DevQuasar_DevQuasar-R1-Uncensored-Llama-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DevQuasar-R1-Uncensored-Llama-8B", - "id": "DevQuasar/DevQuasar-R1-Uncensored-Llama-8B", - "developer": "DevQuasar", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3849 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5118 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3308 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dongwei/DeepSeek-R1-Distill-Qwen-7B-GRPO/b0ac4b11-f7b4-4753-baae-310a92f08259.json b/data/hfopenllm_v2/Dongwei/DeepSeek-R1-Distill-Qwen-7B-GRPO/b0ac4b11-f7b4-4753-baae-310a92f08259.json deleted file mode 100644 index c68c27f32..000000000 --- a/data/hfopenllm_v2/Dongwei/DeepSeek-R1-Distill-Qwen-7B-GRPO/b0ac4b11-f7b4-4753-baae-310a92f08259.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dongwei_DeepSeek-R1-Distill-Qwen-7B-GRPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-7B-GRPO", - "id": "Dongwei/DeepSeek-R1-Distill-Qwen-7B-GRPO", - "developer": "Dongwei", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4038 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3443 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1956 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2322 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore-V1.5-test/324db8b3-38c7-4a2c-82e8-7bebfa38e760.json b/data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore-V1.5-test/324db8b3-38c7-4a2c-82e8-7bebfa38e760.json deleted file mode 100644 index 735b5e369..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore-V1.5-test/324db8b3-38c7-4a2c-82e8-7bebfa38e760.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_L3-8B-R1-WolfCore-V1.5-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-R1-WolfCore-V1.5-test", - "id": "DoppelReflEx/L3-8B-R1-WolfCore-V1.5-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3955 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3841 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore/54dd9033-61b9-4f26-9cde-e04c7136524b.json b/data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore/54dd9033-61b9-4f26-9cde-e04c7136524b.json deleted file mode 100644 index 806bf87b7..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore/54dd9033-61b9-4f26-9cde-e04c7136524b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_L3-8B-R1-WolfCore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-R1-WolfCore", - "id": "DoppelReflEx/L3-8B-R1-WolfCore", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3717 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/L3-8B-WolfCore/d0973d6c-373c-41cd-9e62-52470c044dac.json b/data/hfopenllm_v2/DoppelReflEx/L3-8B-WolfCore/d0973d6c-373c-41cd-9e62-52470c044dac.json deleted file mode 100644 index 1553f5b9c..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/L3-8B-WolfCore/d0973d6c-373c-41cd-9e62-52470c044dac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_L3-8B-WolfCore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-WolfCore", - "id": "DoppelReflEx/L3-8B-WolfCore", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3973 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3705 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame-test/da15da67-b316-4c2e-86a5-c1f88eece9cb.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame-test/da15da67-b316-4c2e-86a5-c1f88eece9cb.json deleted file mode 100644 index 49166bb43..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame-test/da15da67-b316-4c2e-86a5-c1f88eece9cb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-FoxFrame-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-FoxFrame-test", - "id": "DoppelReflEx/MN-12B-FoxFrame-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4222 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5456 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3503 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame2-test/b0c34174-bfd0-4556-a3bf-92ec0ddf5ec4.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame2-test/b0c34174-bfd0-4556-a3bf-92ec0ddf5ec4.json deleted file mode 100644 index b9e5d4976..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame2-test/b0c34174-bfd0-4556-a3bf-92ec0ddf5ec4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-FoxFrame2-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-FoxFrame2-test", - "id": "DoppelReflEx/MN-12B-FoxFrame2-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5485 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1405 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4252 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3569 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame3-test/bce7b15d-1670-46db-bdff-24fb38bc3fd9.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame3-test/bce7b15d-1670-46db-bdff-24fb38bc3fd9.json deleted file mode 100644 index 9a33bd885..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame3-test/bce7b15d-1670-46db-bdff-24fb38bc3fd9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-FoxFrame3-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-FoxFrame3-test", - "id": "DoppelReflEx/MN-12B-FoxFrame3-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4598 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - 
} - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Kakigori/15e5e02f-27b9-4063-b601-42c2b17180f9.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Kakigori/15e5e02f-27b9-4063-b601-42c2b17180f9.json deleted file mode 100644 index 14634831b..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Kakigori/15e5e02f-27b9-4063-b601-42c2b17180f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Kakigori/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Kakigori", - "id": "DoppelReflEx/MN-12B-Kakigori", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3581 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-2/51b0c546-0dde-4668-a8b8-3b9753a31aa0.json 
b/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-2/51b0c546-0dde-4668-a8b8-3b9753a31aa0.json deleted file mode 100644 index 91323e70b..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-2/51b0c546-0dde-4668-a8b8-3b9753a31aa0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-LilithFrame-Experiment-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-LilithFrame-Experiment-2", - "id": "DoppelReflEx/MN-12B-LilithFrame-Experiment-2", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4299 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3276 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-3/45842b1c-cf68-44a7-928f-2da454cdd13f.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-3/45842b1c-cf68-44a7-928f-2da454cdd13f.json deleted file mode 100644 index 
e8a0610db..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-3/45842b1c-cf68-44a7-928f-2da454cdd13f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-LilithFrame-Experiment-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-LilithFrame-Experiment-3", - "id": "DoppelReflEx/MN-12B-LilithFrame-Experiment-3", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4128 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5468 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-4/c15cdefd-dbe3-432e-aab0-3c43540cd320.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-4/c15cdefd-dbe3-432e-aab0-3c43540cd320.json deleted file mode 100644 index 61d3a73e2..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-4/c15cdefd-dbe3-432e-aab0-3c43540cd320.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-LilithFrame-Experiment-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-LilithFrame-Experiment-4", - "id": "DoppelReflEx/MN-12B-LilithFrame-Experiment-4", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5534 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/1f489afa-a01d-40f3-836a-9e386c502d1d.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/1f489afa-a01d-40f3-836a-9e386c502d1d.json deleted file mode 100644 index 87c8dd012..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/1f489afa-a01d-40f3-836a-9e386c502d1d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-LilithFrame/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { 
- "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-LilithFrame", - "id": "DoppelReflEx/MN-12B-LilithFrame", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4944 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3256 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/94bcc87e-eb06-4321-9b72-2f99168cf92a.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/94bcc87e-eb06-4321-9b72-2f99168cf92a.json deleted file mode 100644 index 8a84d871c..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/94bcc87e-eb06-4321-9b72-2f99168cf92a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-LilithFrame/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-LilithFrame", - "id": 
"DoppelReflEx/MN-12B-LilithFrame", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4956 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3237 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-GreenSnake/c0bc9811-4d7c-412f-a12b-3e6eab2e5a6f.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-GreenSnake/c0bc9811-4d7c-412f-a12b-3e6eab2e5a6f.json deleted file mode 100644 index c66fd0b26..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-GreenSnake/c0bc9811-4d7c-412f-a12b-3e6eab2e5a6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-GreenSnake/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-GreenSnake", - "id": "DoppelReflEx/MN-12B-Mimicore-GreenSnake", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4306 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Nocturne/b5a8b278-69e9-41ba-89ee-8fd6b2d90a1c.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Nocturne/b5a8b278-69e9-41ba-89ee-8fd6b2d90a1c.json deleted file mode 100644 index 8af1a665e..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Nocturne/b5a8b278-69e9-41ba-89ee-8fd6b2d90a1c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-Nocturne/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-Nocturne", - "id": "DoppelReflEx/MN-12B-Mimicore-Nocturne", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", 
- "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3957 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5703 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1057 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3634 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v2-Experiment/a3ad7f0f-64bd-42a1-bc7d-d7d4cbbd80fd.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v2-Experiment/a3ad7f0f-64bd-42a1-bc7d-d7d4cbbd80fd.json deleted file mode 100644 index eb3e1d5ef..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v2-Experiment/a3ad7f0f-64bd-42a1-bc7d-d7d4cbbd80fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-Orochi-v2-Experiment/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-Orochi-v2-Experiment", - "id": "DoppelReflEx/MN-12B-Mimicore-Orochi-v2-Experiment", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2842 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5323 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4574 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v3-Experiment/f07c3a4a-2a8e-45c4-a726-be95726df2db.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v3-Experiment/f07c3a4a-2a8e-45c4-a726-be95726df2db.json deleted file mode 100644 index 368728f64..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v3-Experiment/f07c3a4a-2a8e-45c4-a726-be95726df2db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-Orochi-v3-Experiment/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-Orochi-v3-Experiment", - "id": "DoppelReflEx/MN-12B-Mimicore-Orochi-v3-Experiment", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4102 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5438 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4438 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v4-Experiment/f36d56b8-cd77-4d69-a51d-39025bcfcdfd.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v4-Experiment/f36d56b8-cd77-4d69-a51d-39025bcfcdfd.json deleted file mode 100644 index e36c363a4..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v4-Experiment/f36d56b8-cd77-4d69-a51d-39025bcfcdfd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-Orochi-v4-Experiment/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-Orochi-v4-Experiment", - "id": "DoppelReflEx/MN-12B-Mimicore-Orochi-v4-Experiment", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5463 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi/65acabdc-ea5f-426c-820b-2b79f2b20b44.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi/65acabdc-ea5f-426c-820b-2b79f2b20b44.json deleted file mode 100644 index 443a4c901..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi/65acabdc-ea5f-426c-820b-2b79f2b20b44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-Orochi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-Orochi", - "id": "DoppelReflEx/MN-12B-Mimicore-Orochi", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.5498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4546 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-1/96b00cfa-1383-4b36-a043-17eb39678ffc.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-1/96b00cfa-1383-4b36-a043-17eb39678ffc.json deleted file mode 100644 index 995274f5e..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-1/96b00cfa-1383-4b36-a043-17eb39678ffc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-1", - "id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-1", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4866 - } - }, - { - "evaluation_name": "MATH 
Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-2/3b8a796e-6bde-4506-8335-bd3cc72482e1.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-2/3b8a796e-6bde-4506-8335-bd3cc72482e1.json deleted file mode 100644 index 037fde141..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-2/3b8a796e-6bde-4506-8335-bd3cc72482e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-2", - "id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-2", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5126 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-3/a93e99e2-ca13-4cdc-9904-7ae5cc82c623.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-3/a93e99e2-ca13-4cdc-9904-7ae5cc82c623.json deleted file mode 100644 index acc3b57f9..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-3/a93e99e2-ca13-4cdc-9904-7ae5cc82c623.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-3", - "id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-3", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": 
{ - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3198 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-4/65d9e237-2757-459e-94e7-e382213e4eeb.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-4/65d9e237-2757-459e-94e7-e382213e4eeb.json deleted file mode 100644 index bd2328696..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-4/65d9e237-2757-459e-94e7-e382213e4eeb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-4", - "id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-4", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4002 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake/c3f44524-4c75-4cd0-9f5d-79c8b08f6f77.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake/c3f44524-4c75-4cd0-9f5d-79c8b08f6f77.json deleted file mode 100644 index 2aafdb196..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake/c3f44524-4c75-4cd0-9f5d-79c8b08f6f77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-WhiteSnake/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-WhiteSnake", - "id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4438 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5605 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Unleashed-Twilight/2e7d3674-d0b0-4b87-8bd8-8202114b7665.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Unleashed-Twilight/2e7d3674-d0b0-4b87-8bd8-8202114b7665.json deleted file mode 100644 index c22d0071e..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Unleashed-Twilight/2e7d3674-d0b0-4b87-8bd8-8202114b7665.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Unleashed-Twilight/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Unleashed-Twilight", - "id": "DoppelReflEx/MN-12B-Unleashed-Twilight", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5521 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4384 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3678 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-WolFrame/30d21295-beb1-4179-8c6f-7bac79b29474.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-WolFrame/30d21295-beb1-4179-8c6f-7bac79b29474.json deleted file mode 100644 index 4c6187607..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-WolFrame/30d21295-beb1-4179-8c6f-7bac79b29474.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-WolFrame/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-WolFrame", - "id": "DoppelReflEx/MN-12B-WolFrame", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4397 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5117 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-test/e2fc95de-b9d9-4043-b55c-aa2819d4f52f.json b/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-test/e2fc95de-b9d9-4043-b55c-aa2819d4f52f.json deleted file mode 100644 index 3848f175e..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-test/e2fc95de-b9d9-4043-b55c-aa2819d4f52f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MiniusLight-24B-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniusLight-24B-test", - "id": "DoppelReflEx/MiniusLight-24B-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0394 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6334 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4093 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5182 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1b-test/7fbd7f97-baf9-4acd-ba0c-90ffbf0c47a5.json b/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1b-test/7fbd7f97-baf9-4acd-ba0c-90ffbf0c47a5.json deleted file mode 100644 index fa3042ed0..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1b-test/7fbd7f97-baf9-4acd-ba0c-90ffbf0c47a5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MiniusLight-24B-v1b-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniusLight-24B-v1b-test", - "id": "DoppelReflEx/MiniusLight-24B-v1b-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6617 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2394 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4557 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1c-test/336effcd-d8fc-4477-846f-70fc40bdc111.json b/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1c-test/336effcd-d8fc-4477-846f-70fc40bdc111.json deleted file mode 100644 index c424357f5..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1c-test/336effcd-d8fc-4477-846f-70fc40bdc111.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MiniusLight-24B-v1c-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniusLight-24B-v1c-test", - "id": "DoppelReflEx/MiniusLight-24B-v1c-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6753 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2968 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4634 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.5487 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1d-test/28f87820-d587-498e-b713-7c0af0cdc324.json b/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1d-test/28f87820-d587-498e-b713-7c0af0cdc324.json deleted file mode 100644 index f4fb232bb..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1d-test/28f87820-d587-498e-b713-7c0af0cdc324.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MiniusLight-24B-v1d-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniusLight-24B-v1d-test", - "id": "DoppelReflEx/MiniusLight-24B-v1d-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6712 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2946 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5489 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B/f1b671ab-ebb3-43ec-86fa-832982d04cc1.json 
b/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B/f1b671ab-ebb3-43ec-86fa-832982d04cc1.json deleted file mode 100644 index 5cc2e4360..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B/f1b671ab-ebb3-43ec-86fa-832982d04cc1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MiniusLight-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniusLight-24B", - "id": "DoppelReflEx/MiniusLight-24B", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2577 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5091 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Again-8B-Model_Stock/327cde83-d107-4455-bc03-7e03026c52e6.json b/data/hfopenllm_v2/DreadPoor/Again-8B-Model_Stock/327cde83-d107-4455-bc03-7e03026c52e6.json deleted file mode 100644 index e2c3518ef..000000000 --- a/data/hfopenllm_v2/DreadPoor/Again-8B-Model_Stock/327cde83-d107-4455-bc03-7e03026c52e6.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Again-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Again-8B-Model_Stock", - "id": "DreadPoor/Again-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6724 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3987 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Alita99-8B-LINEAR/7497b8fb-9a7d-46dc-868e-1a2bbcdc7860.json b/data/hfopenllm_v2/DreadPoor/Alita99-8B-LINEAR/7497b8fb-9a7d-46dc-868e-1a2bbcdc7860.json deleted file mode 100644 index de70affaf..000000000 --- a/data/hfopenllm_v2/DreadPoor/Alita99-8B-LINEAR/7497b8fb-9a7d-46dc-868e-1a2bbcdc7860.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Alita99-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Alita99-8B-LINEAR", - "id": "DreadPoor/Alita99-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5442 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/AnotherTest/92c8afbe-7735-40c8-af0e-29da687c2070.json b/data/hfopenllm_v2/DreadPoor/AnotherTest/92c8afbe-7735-40c8-af0e-29da687c2070.json deleted file mode 100644 index c4e25da29..000000000 --- a/data/hfopenllm_v2/DreadPoor/AnotherTest/92c8afbe-7735-40c8-af0e-29da687c2070.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_AnotherTest/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AnotherTest", - "id": "DreadPoor/AnotherTest", - "developer": "DreadPoor", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4683 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4213 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2875 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire-8B-model_stock/bca052ac-6556-49d8-94e3-f4bda560a5d3.json b/data/hfopenllm_v2/DreadPoor/Aspire-8B-model_stock/bca052ac-6556-49d8-94e3-f4bda560a5d3.json deleted file mode 100644 index 6e0e74802..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire-8B-model_stock/bca052ac-6556-49d8-94e3-f4bda560a5d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire-8B-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire-8B-model_stock", - "id": "DreadPoor/Aspire-8B-model_stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7141 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1495 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4212 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3763 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_1.3-8B_model-stock/5f74fe6e-8575-4cea-959b-e6ba03c7e273.json b/data/hfopenllm_v2/DreadPoor/Aspire_1.3-8B_model-stock/5f74fe6e-8575-4cea-959b-e6ba03c7e273.json deleted file mode 100644 index 91197dc03..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_1.3-8B_model-stock/5f74fe6e-8575-4cea-959b-e6ba03c7e273.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_1.3-8B_model-stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_1.3-8B_model-stock", - "id": "DreadPoor/Aspire_1.3-8B_model-stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.7062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1692 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_V2-8B-Model_Stock/b0f696f5-ed70-4293-999d-a9121192c137.json b/data/hfopenllm_v2/DreadPoor/Aspire_V2-8B-Model_Stock/b0f696f5-ed70-4293-999d-a9121192c137.json deleted file mode 100644 index 206f0a344..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_V2-8B-Model_Stock/b0f696f5-ed70-4293-999d-a9121192c137.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_V2-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_V2-8B-Model_Stock", - "id": "DreadPoor/Aspire_V2-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.533 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3894 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_V2.1-8B-Model_Stock/18751a6f-062c-4915-bbe0-ae222cf9ae0b.json b/data/hfopenllm_v2/DreadPoor/Aspire_V2.1-8B-Model_Stock/18751a6f-062c-4915-bbe0-ae222cf9ae0b.json deleted file mode 100644 index 2e1a6a761..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_V2.1-8B-Model_Stock/18751a6f-062c-4915-bbe0-ae222cf9ae0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_V2.1-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_V2.1-8B-Model_Stock", - "id": "DreadPoor/Aspire_V2.1-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7238 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5236 - 
} - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1767 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3801 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT-8B-Model_Stock/398ebe04-638f-4a11-b99d-6778ff3ff97b.json b/data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT-8B-Model_Stock/398ebe04-638f-4a11-b99d-6778ff3ff97b.json deleted file mode 100644 index 98689999d..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT-8B-Model_Stock/398ebe04-638f-4a11-b99d-6778ff3ff97b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_V2_ALT-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_V2_ALT-8B-Model_Stock", - "id": "DreadPoor/Aspire_V2_ALT-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3727 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT_ROW-8B-Model_Stock/b4f197f2-3456-4221-b222-10dfbbb50f56.json b/data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT_ROW-8B-Model_Stock/b4f197f2-3456-4221-b222-10dfbbb50f56.json deleted file mode 100644 index 2a86794fd..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT_ROW-8B-Model_Stock/b4f197f2-3456-4221-b222-10dfbbb50f56.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_V2_ALT_ROW-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_V2_ALT_ROW-8B-Model_Stock", - "id": "DreadPoor/Aspire_V2_ALT_ROW-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3727 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_V3-8B-Model_Stock/0a2fa86a-f9b3-4a49-b215-4cd3ee9b4c22.json b/data/hfopenllm_v2/DreadPoor/Aspire_V3-8B-Model_Stock/0a2fa86a-f9b3-4a49-b215-4cd3ee9b4c22.json deleted file mode 100644 index f208a5c89..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_V3-8B-Model_Stock/0a2fa86a-f9b3-4a49-b215-4cd3ee9b4c22.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_V3-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_V3-8B-Model_Stock", - "id": "DreadPoor/Aspire_V3-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5268 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3642 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_V4-8B-Model_Stock/1561ec50-1cb9-47ce-9db1-09efe9c3fc61.json b/data/hfopenllm_v2/DreadPoor/Aspire_V4-8B-Model_Stock/1561ec50-1cb9-47ce-9db1-09efe9c3fc61.json deleted file mode 100644 index b90acd330..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_V4-8B-Model_Stock/1561ec50-1cb9-47ce-9db1-09efe9c3fc61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_V4-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_V4-8B-Model_Stock", - "id": "DreadPoor/Aspire_V4-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - 
{ - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_V4_ALT-8B-Model_Stock/496525ff-394a-4b7b-9d93-f5b38d2a1ee3.json b/data/hfopenllm_v2/DreadPoor/Aspire_V4_ALT-8B-Model_Stock/496525ff-394a-4b7b-9d93-f5b38d2a1ee3.json deleted file mode 100644 index d418f3c08..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_V4_ALT-8B-Model_Stock/496525ff-394a-4b7b-9d93-f5b38d2a1ee3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_V4_ALT-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_V4_ALT-8B-Model_Stock", - "id": "DreadPoor/Aspire_V4_ALT-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5268 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3682 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Asymmetric_Linearity-8B-Model_Stock/37071760-d24c-43cc-9965-d8c7873c0ee8.json b/data/hfopenllm_v2/DreadPoor/Asymmetric_Linearity-8B-Model_Stock/37071760-d24c-43cc-9965-d8c7873c0ee8.json deleted file mode 100644 index 77c551bb3..000000000 --- a/data/hfopenllm_v2/DreadPoor/Asymmetric_Linearity-8B-Model_Stock/37071760-d24c-43cc-9965-d8c7873c0ee8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Asymmetric_Linearity-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Asymmetric_Linearity-8B-Model_Stock", - "id": "DreadPoor/Asymmetric_Linearity-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LINEAR/91a71a49-5dd4-43b1-9e1c-fd9492236712.json b/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LINEAR/91a71a49-5dd4-43b1-9e1c-fd9492236712.json deleted file mode 100644 index ea3d38b72..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LINEAR/91a71a49-5dd4-43b1-9e1c-fd9492236712.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aurora_faustus-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aurora_faustus-8B-LINEAR", - "id": "DreadPoor/Aurora_faustus-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7281 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED/d1d48abb-6dcf-4905-958f-c3a3e75feac6.json b/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED/d1d48abb-6dcf-4905-958f-c3a3e75feac6.json deleted file mode 100644 index 1abc59610..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED/d1d48abb-6dcf-4905-958f-c3a3e75feac6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aurora_faustus-8B-LORABLATED/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aurora_faustus-8B-LORABLATED", - "id": "DreadPoor/Aurora_faustus-8B-LORABLATED", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1488 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } 
- } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED_ALT/68282f29-f56f-420b-bd1e-9cc54783c1a5.json b/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED_ALT/68282f29-f56f-420b-bd1e-9cc54783c1a5.json deleted file mode 100644 index 12017ff39..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED_ALT/68282f29-f56f-420b-bd1e-9cc54783c1a5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aurora_faustus-8B-LORABLATED_ALT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aurora_faustus-8B-LORABLATED_ALT", - "id": "DreadPoor/Aurora_faustus-8B-LORABLATED_ALT", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3694 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Autumn_Dawn-8B-LINEAR/cd1c84dc-6c6e-4789-add7-0e3ca783b0ea.json 
b/data/hfopenllm_v2/DreadPoor/Autumn_Dawn-8B-LINEAR/cd1c84dc-6c6e-4789-add7-0e3ca783b0ea.json deleted file mode 100644 index ab92e181e..000000000 --- a/data/hfopenllm_v2/DreadPoor/Autumn_Dawn-8B-LINEAR/cd1c84dc-6c6e-4789-add7-0e3ca783b0ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Autumn_Dawn-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Autumn_Dawn-8B-LINEAR", - "id": "DreadPoor/Autumn_Dawn-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7293 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5459 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/BaeZel-8B-LINEAR/22a9d3b8-ac45-4433-8926-5d28681af922.json b/data/hfopenllm_v2/DreadPoor/BaeZel-8B-LINEAR/22a9d3b8-ac45-4433-8926-5d28681af922.json deleted file mode 100644 index 52019043a..000000000 --- a/data/hfopenllm_v2/DreadPoor/BaeZel-8B-LINEAR/22a9d3b8-ac45-4433-8926-5d28681af922.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_BaeZel-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BaeZel-8B-LINEAR", - "id": "DreadPoor/BaeZel-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5464 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4227 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3861 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/BaeZel-8B-Model_Stock/57c4b9eb-dffd-4623-a2d5-b2374d3c9109.json b/data/hfopenllm_v2/DreadPoor/BaeZel-8B-Model_Stock/57c4b9eb-dffd-4623-a2d5-b2374d3c9109.json deleted file mode 100644 index 0d62d9248..000000000 --- a/data/hfopenllm_v2/DreadPoor/BaeZel-8B-Model_Stock/57c4b9eb-dffd-4623-a2d5-b2374d3c9109.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_BaeZel-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BaeZel-8B-Model_Stock", - "id": "DreadPoor/BaeZel-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7713 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/BaeZel_V2-8B-Model_Stock/24adbd8c-df3a-4b58-94e6-61a3dfa6828e.json b/data/hfopenllm_v2/DreadPoor/BaeZel_V2-8B-Model_Stock/24adbd8c-df3a-4b58-94e6-61a3dfa6828e.json deleted file mode 100644 index 92150ff2e..000000000 --- a/data/hfopenllm_v2/DreadPoor/BaeZel_V2-8B-Model_Stock/24adbd8c-df3a-4b58-94e6-61a3dfa6828e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_BaeZel_V2-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BaeZel_V2-8B-Model_Stock", - "id": 
"DreadPoor/BaeZel_V2-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7677 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/BaeZel_V2_ALT-8B-Model_Stock/6ed62f64-c2be-4bca-b17d-bd0184a3d498.json b/data/hfopenllm_v2/DreadPoor/BaeZel_V2_ALT-8B-Model_Stock/6ed62f64-c2be-4bca-b17d-bd0184a3d498.json deleted file mode 100644 index 4fbb404d5..000000000 --- a/data/hfopenllm_v2/DreadPoor/BaeZel_V2_ALT-8B-Model_Stock/6ed62f64-c2be-4bca-b17d-bd0184a3d498.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_BaeZel_V2_ALT-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BaeZel_V2_ALT-8B-Model_Stock", - "id": "DreadPoor/BaeZel_V2_ALT-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7677 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/BaeZel_V3-8B-Model_Stock/db9e4d03-03a8-4a10-8739-16bbcfbb06d4.json b/data/hfopenllm_v2/DreadPoor/BaeZel_V3-8B-Model_Stock/db9e4d03-03a8-4a10-8739-16bbcfbb06d4.json deleted file mode 100644 index 6a2b2bd8f..000000000 --- a/data/hfopenllm_v2/DreadPoor/BaeZel_V3-8B-Model_Stock/db9e4d03-03a8-4a10-8739-16bbcfbb06d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_BaeZel_V3-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BaeZel_V3-8B-Model_Stock", - "id": "DreadPoor/BaeZel_V3-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7832 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1896 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3888 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Blunt_Edge-8B-SLERP/7b0fc4fe-51c8-4f01-b07b-5bca05b40859.json b/data/hfopenllm_v2/DreadPoor/Blunt_Edge-8B-SLERP/7b0fc4fe-51c8-4f01-b07b-5bca05b40859.json deleted file mode 100644 index c3a3e4356..000000000 --- a/data/hfopenllm_v2/DreadPoor/Blunt_Edge-8B-SLERP/7b0fc4fe-51c8-4f01-b07b-5bca05b40859.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Blunt_Edge-8B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Blunt_Edge-8B-SLERP", - "id": "DreadPoor/Blunt_Edge-8B-SLERP", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7497 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5389 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/BulkUp/6f286418-d8e3-4c11-8941-cfe5a18b1037.json b/data/hfopenllm_v2/DreadPoor/BulkUp/6f286418-d8e3-4c11-8941-cfe5a18b1037.json deleted file mode 100644 index dfd176ab1..000000000 --- a/data/hfopenllm_v2/DreadPoor/BulkUp/6f286418-d8e3-4c11-8941-cfe5a18b1037.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_BulkUp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BulkUp", - "id": "DreadPoor/BulkUp", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287 
- } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Cadence-8B-LINEAR/b0a83b1f-3af2-45e8-9d88-d7302a529112.json b/data/hfopenllm_v2/DreadPoor/Cadence-8B-LINEAR/b0a83b1f-3af2-45e8-9d88-d7302a529112.json deleted file mode 100644 index 6b7ee9836..000000000 --- a/data/hfopenllm_v2/DreadPoor/Cadence-8B-LINEAR/b0a83b1f-3af2-45e8-9d88-d7302a529112.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Cadence-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cadence-8B-LINEAR", - "id": "DreadPoor/Cadence-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7682 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5433 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 
5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1677 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3803 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Caelid-8B-Model_Stock/0462fce1-51b4-48d8-8278-a90048ffd637.json b/data/hfopenllm_v2/DreadPoor/Caelid-8B-Model_Stock/0462fce1-51b4-48d8-8278-a90048ffd637.json deleted file mode 100644 index 2d3dba8f9..000000000 --- a/data/hfopenllm_v2/DreadPoor/Caelid-8B-Model_Stock/0462fce1-51b4-48d8-8278-a90048ffd637.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Caelid-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Caelid-8B-Model_Stock", - "id": "DreadPoor/Caelid-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7247 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.546 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1511 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4001 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Casuar-9B-Model_Stock/e02f597c-c368-4223-ac90-c99d82c90634.json b/data/hfopenllm_v2/DreadPoor/Casuar-9B-Model_Stock/e02f597c-c368-4223-ac90-c99d82c90634.json deleted file mode 100644 index c97dcca00..000000000 --- a/data/hfopenllm_v2/DreadPoor/Casuar-9B-Model_Stock/e02f597c-c368-4223-ac90-c99d82c90634.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Casuar-9B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Casuar-9B-Model_Stock", - "id": "DreadPoor/Casuar-9B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7765 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4165 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4156 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Condensed_Milk-8B-Model_Stock/32e63ffc-c64e-4562-ba99-14873f5bac2e.json b/data/hfopenllm_v2/DreadPoor/Condensed_Milk-8B-Model_Stock/32e63ffc-c64e-4562-ba99-14873f5bac2e.json deleted file mode 100644 index 3cd2321bc..000000000 --- a/data/hfopenllm_v2/DreadPoor/Condensed_Milk-8B-Model_Stock/32e63ffc-c64e-4562-ba99-14873f5bac2e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Condensed_Milk-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Condensed_Milk-8B-Model_Stock", - "id": "DreadPoor/Condensed_Milk-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7536 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1745 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/CoolerCoder-8B-LINEAR/6af4faad-05c2-488b-9685-e11ae4e1cbf0.json b/data/hfopenllm_v2/DreadPoor/CoolerCoder-8B-LINEAR/6af4faad-05c2-488b-9685-e11ae4e1cbf0.json deleted file mode 100644 index c49036cd8..000000000 --- a/data/hfopenllm_v2/DreadPoor/CoolerCoder-8B-LINEAR/6af4faad-05c2-488b-9685-e11ae4e1cbf0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_CoolerCoder-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CoolerCoder-8B-LINEAR", - "id": "DreadPoor/CoolerCoder-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4519 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4762 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3159 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Damasteel-8B-LINEAR/8aa7701b-7019-44a0-851f-cfc9108fdfbd.json b/data/hfopenllm_v2/DreadPoor/Damasteel-8B-LINEAR/8aa7701b-7019-44a0-851f-cfc9108fdfbd.json deleted file mode 100644 index 7160aa1de..000000000 --- a/data/hfopenllm_v2/DreadPoor/Damasteel-8B-LINEAR/8aa7701b-7019-44a0-851f-cfc9108fdfbd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Damasteel-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Damasteel-8B-LINEAR", - "id": "DreadPoor/Damasteel-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7384 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4212 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Dearly_Beloved-8B-TIES/a2f95fad-5ab5-47d0-b9aa-33358c673caf.json b/data/hfopenllm_v2/DreadPoor/Dearly_Beloved-8B-TIES/a2f95fad-5ab5-47d0-b9aa-33358c673caf.json deleted file mode 100644 index a39f8ede9..000000000 --- a/data/hfopenllm_v2/DreadPoor/Dearly_Beloved-8B-TIES/a2f95fad-5ab5-47d0-b9aa-33358c673caf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Dearly_Beloved-8B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dearly_Beloved-8B-TIES", - "id": "DreadPoor/Dearly_Beloved-8B-TIES", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8267 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4175 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DreadPoor/Decayed-8B-LINEAR/aef73a77-9df7-4d4f-89ef-50905d326198.json b/data/hfopenllm_v2/DreadPoor/Decayed-8B-LINEAR/aef73a77-9df7-4d4f-89ef-50905d326198.json deleted file mode 100644 index 28e908428..000000000 --- a/data/hfopenllm_v2/DreadPoor/Decayed-8B-LINEAR/aef73a77-9df7-4d4f-89ef-50905d326198.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Decayed-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Decayed-8B-LINEAR", - "id": "DreadPoor/Decayed-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5417 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1715 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3763 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Derivative-8B-Model_Stock/e9ffdfb6-6f91-4bac-89d2-40b1eb43f3ee.json b/data/hfopenllm_v2/DreadPoor/Derivative-8B-Model_Stock/e9ffdfb6-6f91-4bac-89d2-40b1eb43f3ee.json deleted file mode 100644 index 5b83a9506..000000000 --- 
a/data/hfopenllm_v2/DreadPoor/Derivative-8B-Model_Stock/e9ffdfb6-6f91-4bac-89d2-40b1eb43f3ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Derivative-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Derivative-8B-Model_Stock", - "id": "DreadPoor/Derivative-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.179 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3811 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Derivative_V2-8B-Model_Stock/8ff39438-907c-465f-ac7a-5a25cfd8d824.json b/data/hfopenllm_v2/DreadPoor/Derivative_V2-8B-Model_Stock/8ff39438-907c-465f-ac7a-5a25cfd8d824.json deleted file mode 100644 index 332287ece..000000000 --- a/data/hfopenllm_v2/DreadPoor/Derivative_V2-8B-Model_Stock/8ff39438-907c-465f-ac7a-5a25cfd8d824.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/DreadPoor_Derivative_V2-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Derivative_V2-8B-Model_Stock", - "id": "DreadPoor/Derivative_V2-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7537 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5393 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3856 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Derivative_V2_ALT-8B-Model_Stock/83d831c5-a74f-4699-9961-664a7a51b7b8.json b/data/hfopenllm_v2/DreadPoor/Derivative_V2_ALT-8B-Model_Stock/83d831c5-a74f-4699-9961-664a7a51b7b8.json deleted file mode 100644 index 2b2d015d1..000000000 --- a/data/hfopenllm_v2/DreadPoor/Derivative_V2_ALT-8B-Model_Stock/83d831c5-a74f-4699-9961-664a7a51b7b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Derivative_V2_ALT-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Derivative_V2_ALT-8B-Model_Stock", - "id": "DreadPoor/Derivative_V2_ALT-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1881 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Derivative_V3-8B-Model_Stock/83fb88ec-f640-4c1e-b71c-53a123fc4c2e.json b/data/hfopenllm_v2/DreadPoor/Derivative_V3-8B-Model_Stock/83fb88ec-f640-4c1e-b71c-53a123fc4c2e.json deleted file mode 100644 index cfc2e036f..000000000 --- a/data/hfopenllm_v2/DreadPoor/Derivative_V3-8B-Model_Stock/83fb88ec-f640-4c1e-b71c-53a123fc4c2e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Derivative_V3-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Derivative_V3-8B-Model_Stock", - 
"id": "DreadPoor/Derivative_V3-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1465 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Elusive_Dragon_Heart-8B-LINEAR/3811cc34-45cb-4932-b862-39bf042331e0.json b/data/hfopenllm_v2/DreadPoor/Elusive_Dragon_Heart-8B-LINEAR/3811cc34-45cb-4932-b862-39bf042331e0.json deleted file mode 100644 index cf8031e7d..000000000 --- a/data/hfopenllm_v2/DreadPoor/Elusive_Dragon_Heart-8B-LINEAR/3811cc34-45cb-4932-b862-39bf042331e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Elusive_Dragon_Heart-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Elusive_Dragon_Heart-8B-LINEAR", - "id": "DreadPoor/Elusive_Dragon_Heart-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7131 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5456 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Emu_Eggs-9B-Model_Stock/5b2a16a1-7a2a-40b7-add6-b99378b6af00.json b/data/hfopenllm_v2/DreadPoor/Emu_Eggs-9B-Model_Stock/5b2a16a1-7a2a-40b7-add6-b99378b6af00.json deleted file mode 100644 index 34eb53856..000000000 --- a/data/hfopenllm_v2/DreadPoor/Emu_Eggs-9B-Model_Stock/5b2a16a1-7a2a-40b7-add6-b99378b6af00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Emu_Eggs-9B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Emu_Eggs-9B-Model_Stock", - "id": "DreadPoor/Emu_Eggs-9B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7607 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6052 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4227 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Eunoia_Vespera-8B-LINEAR/1dc2a5bb-40b6-401e-8f1c-6110cb4c0f0d.json b/data/hfopenllm_v2/DreadPoor/Eunoia_Vespera-8B-LINEAR/1dc2a5bb-40b6-401e-8f1c-6110cb4c0f0d.json deleted file mode 100644 index be6c2967a..000000000 --- a/data/hfopenllm_v2/DreadPoor/Eunoia_Vespera-8B-LINEAR/1dc2a5bb-40b6-401e-8f1c-6110cb4c0f0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Eunoia_Vespera-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Eunoia_Vespera-8B-LINEAR", - "id": "DreadPoor/Eunoia_Vespera-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7235 - } 
- }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3839 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Fu_sion_HA-8B-SLERP/742e0a1c-7496-4076-bdbf-ada0a8e528c2.json b/data/hfopenllm_v2/DreadPoor/Fu_sion_HA-8B-SLERP/742e0a1c-7496-4076-bdbf-ada0a8e528c2.json deleted file mode 100644 index cc65f1f79..000000000 --- a/data/hfopenllm_v2/DreadPoor/Fu_sion_HA-8B-SLERP/742e0a1c-7496-4076-bdbf-ada0a8e528c2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Fu_sion_HA-8B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fu_sion_HA-8B-SLERP", - "id": "DreadPoor/Fu_sion_HA-8B-SLERP", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7609 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5373 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1752 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/HOT_STINKING_GARBAGE/f0664035-3256-444c-b848-ef603e0d46b5.json b/data/hfopenllm_v2/DreadPoor/HOT_STINKING_GARBAGE/f0664035-3256-444c-b848-ef603e0d46b5.json deleted file mode 100644 index 0bd391e53..000000000 --- a/data/hfopenllm_v2/DreadPoor/HOT_STINKING_GARBAGE/f0664035-3256-444c-b848-ef603e0d46b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_HOT_STINKING_GARBAGE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HOT_STINKING_GARBAGE", - "id": "DreadPoor/HOT_STINKING_GARBAGE", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4884 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", 
- "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/H_the_eighth-8B-LINEAR/9159aaa6-8663-491f-901a-74da4c343d20.json b/data/hfopenllm_v2/DreadPoor/H_the_eighth-8B-LINEAR/9159aaa6-8663-491f-901a-74da4c343d20.json deleted file mode 100644 index 506db7e4d..000000000 --- a/data/hfopenllm_v2/DreadPoor/H_the_eighth-8B-LINEAR/9159aaa6-8663-491f-901a-74da4c343d20.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_H_the_eighth-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "H_the_eighth-8B-LINEAR", - "id": "DreadPoor/H_the_eighth-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7469 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3824 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Happy_New_Year-8B-Model_Stock/5179b145-9fdb-4ab5-8cca-87966ecf6519.json b/data/hfopenllm_v2/DreadPoor/Happy_New_Year-8B-Model_Stock/5179b145-9fdb-4ab5-8cca-87966ecf6519.json deleted file mode 100644 index c37605ceb..000000000 --- a/data/hfopenllm_v2/DreadPoor/Happy_New_Year-8B-Model_Stock/5179b145-9fdb-4ab5-8cca-87966ecf6519.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Happy_New_Year-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Happy_New_Year-8B-Model_Stock", - "id": "DreadPoor/Happy_New_Year-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7616 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5368 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Heart_Stolen-8B-Model_Stock/da872193-1d25-4e8e-bc22-9138a9d121ba.json b/data/hfopenllm_v2/DreadPoor/Heart_Stolen-8B-Model_Stock/da872193-1d25-4e8e-bc22-9138a9d121ba.json deleted file mode 100644 index 5afe7e574..000000000 --- a/data/hfopenllm_v2/DreadPoor/Heart_Stolen-8B-Model_Stock/da872193-1d25-4e8e-bc22-9138a9d121ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Heart_Stolen-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Heart_Stolen-8B-Model_Stock", - "id": "DreadPoor/Heart_Stolen-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1722 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Heart_Stolen-ALT-8B-Model_Stock/967fdd26-1f8a-40d6-8f7d-ca731c7ef2e3.json b/data/hfopenllm_v2/DreadPoor/Heart_Stolen-ALT-8B-Model_Stock/967fdd26-1f8a-40d6-8f7d-ca731c7ef2e3.json deleted file mode 100644 index 1724bbb8b..000000000 --- a/data/hfopenllm_v2/DreadPoor/Heart_Stolen-ALT-8B-Model_Stock/967fdd26-1f8a-40d6-8f7d-ca731c7ef2e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Heart_Stolen-ALT-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Heart_Stolen-ALT-8B-Model_Stock", - "id": "DreadPoor/Heart_Stolen-ALT-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7184 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4055 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3772 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Here_We_Go_Again-8B-SLERP/dd615b4c-189e-4361-bcf4-879fd59b28a2.json b/data/hfopenllm_v2/DreadPoor/Here_We_Go_Again-8B-SLERP/dd615b4c-189e-4361-bcf4-879fd59b28a2.json deleted file mode 100644 index 040701af7..000000000 --- a/data/hfopenllm_v2/DreadPoor/Here_We_Go_Again-8B-SLERP/dd615b4c-189e-4361-bcf4-879fd59b28a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Here_We_Go_Again-8B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Here_We_Go_Again-8B-SLERP", - "id": "DreadPoor/Here_We_Go_Again-8B-SLERP", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7442 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.546 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Howdy-8B-LINEAR/0aeee3e8-00ce-4f95-bbd9-307d93a194a4.json b/data/hfopenllm_v2/DreadPoor/Howdy-8B-LINEAR/0aeee3e8-00ce-4f95-bbd9-307d93a194a4.json deleted file mode 100644 index 6da88e2f6..000000000 --- a/data/hfopenllm_v2/DreadPoor/Howdy-8B-LINEAR/0aeee3e8-00ce-4f95-bbd9-307d93a194a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Howdy-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Howdy-8B-LINEAR", - "id": "DreadPoor/Howdy-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4121 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Incidental-8B-Model_Stock/8c583b51-4349-48af-98d9-8eaaf43d60b6.json b/data/hfopenllm_v2/DreadPoor/Incidental-8B-Model_Stock/8c583b51-4349-48af-98d9-8eaaf43d60b6.json deleted file mode 100644 index f986ac603..000000000 --- a/data/hfopenllm_v2/DreadPoor/Incidental-8B-Model_Stock/8c583b51-4349-48af-98d9-8eaaf43d60b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Incidental-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Incidental-8B-Model_Stock", - "id": "DreadPoor/Incidental-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DreadPoor/Irina-8B-model_stock/34aab556-5e97-4ea2-9ada-d17dc3624be2.json b/data/hfopenllm_v2/DreadPoor/Irina-8B-model_stock/34aab556-5e97-4ea2-9ada-d17dc3624be2.json deleted file mode 100644 index b23dc6544..000000000 --- a/data/hfopenllm_v2/DreadPoor/Irina-8B-model_stock/34aab556-5e97-4ea2-9ada-d17dc3624be2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Irina-8B-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Irina-8B-model_stock", - "id": "DreadPoor/Irina-8B-model_stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6799 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5237 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Kindling-8B-Model_Stock/fbd9d5e3-15f7-45ce-92fb-368b3bfcc526.json b/data/hfopenllm_v2/DreadPoor/Kindling-8B-Model_Stock/fbd9d5e3-15f7-45ce-92fb-368b3bfcc526.json deleted file mode 100644 index 26f843f70..000000000 
--- a/data/hfopenllm_v2/DreadPoor/Kindling-8B-Model_Stock/fbd9d5e3-15f7-45ce-92fb-368b3bfcc526.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Kindling-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kindling-8B-Model_Stock", - "id": "DreadPoor/Kindling-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7308 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1752 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/L3.1-BaeZel-8B-Della/b177e329-ce6b-4bc6-aeac-1c01306e6b1f.json b/data/hfopenllm_v2/DreadPoor/L3.1-BaeZel-8B-Della/b177e329-ce6b-4bc6-aeac-1c01306e6b1f.json deleted file mode 100644 index 5d86d24a4..000000000 --- a/data/hfopenllm_v2/DreadPoor/L3.1-BaeZel-8B-Della/b177e329-ce6b-4bc6-aeac-1c01306e6b1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/DreadPoor_L3.1-BaeZel-8B-Della/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-BaeZel-8B-Della", - "id": "DreadPoor/L3.1-BaeZel-8B-Della", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5448 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1745 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3902 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Laughing_Stock-8B-Model_Stock/7f371c11-e8f0-4233-b359-aac39c0a1110.json b/data/hfopenllm_v2/DreadPoor/Laughing_Stock-8B-Model_Stock/7f371c11-e8f0-4233-b359-aac39c0a1110.json deleted file mode 100644 index aec03b176..000000000 --- a/data/hfopenllm_v2/DreadPoor/Laughing_Stock-8B-Model_Stock/7f371c11-e8f0-4233-b359-aac39c0a1110.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Laughing_Stock-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Laughing_Stock-8B-Model_Stock", - "id": "DreadPoor/Laughing_Stock-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1579 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Lava_Lamp-8B-SLERP/9f758d4e-d121-4688-8ece-8dc67a499811.json b/data/hfopenllm_v2/DreadPoor/Lava_Lamp-8B-SLERP/9f758d4e-d121-4688-8ece-8dc67a499811.json deleted file mode 100644 index e0d95bfba..000000000 --- a/data/hfopenllm_v2/DreadPoor/Lava_Lamp-8B-SLERP/9f758d4e-d121-4688-8ece-8dc67a499811.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Lava_Lamp-8B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lava_Lamp-8B-SLERP", - "id": "DreadPoor/Lava_Lamp-8B-SLERP", - "developer": "DreadPoor", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5368 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1737 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/LemonP-8B-Model_Stock/903b8c71-d54d-4ce4-9845-71eb8ca8733a.json b/data/hfopenllm_v2/DreadPoor/LemonP-8B-Model_Stock/903b8c71-d54d-4ce4-9845-71eb8ca8733a.json deleted file mode 100644 index bd0f0baf1..000000000 --- a/data/hfopenllm_v2/DreadPoor/LemonP-8B-Model_Stock/903b8c71-d54d-4ce4-9845-71eb8ca8733a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_LemonP-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LemonP-8B-Model_Stock", - "id": "DreadPoor/LemonP-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5439 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1767 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Lydia_of_Whiterun-8B-LINEAR/9bdc17bf-7b81-49c8-81f5-c6dfa31b449b.json b/data/hfopenllm_v2/DreadPoor/Lydia_of_Whiterun-8B-LINEAR/9bdc17bf-7b81-49c8-81f5-c6dfa31b449b.json deleted file mode 100644 index d1ecbf772..000000000 --- a/data/hfopenllm_v2/DreadPoor/Lydia_of_Whiterun-8B-LINEAR/9bdc17bf-7b81-49c8-81f5-c6dfa31b449b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Lydia_of_Whiterun-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lydia_of_Whiterun-8B-LINEAR", - "id": "DreadPoor/Lydia_of_Whiterun-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7603 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1767 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3801 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Matryoshka-8B-LINEAR/28109e00-87c1-4809-a4fc-dddebba52621.json b/data/hfopenllm_v2/DreadPoor/Matryoshka-8B-LINEAR/28109e00-87c1-4809-a4fc-dddebba52621.json deleted file mode 100644 index 81c216cdf..000000000 --- a/data/hfopenllm_v2/DreadPoor/Matryoshka-8B-LINEAR/28109e00-87c1-4809-a4fc-dddebba52621.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Matryoshka-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Matryoshka-8B-LINEAR", - "id": "DreadPoor/Matryoshka-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7263 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5444 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1752 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4252 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Mercury_In_Retrograde-8b-Model-Stock/6a21381b-426d-4a5d-ad6d-2aeb57ed14c5.json b/data/hfopenllm_v2/DreadPoor/Mercury_In_Retrograde-8b-Model-Stock/6a21381b-426d-4a5d-ad6d-2aeb57ed14c5.json deleted file mode 100644 index 84fcc3ee6..000000000 --- a/data/hfopenllm_v2/DreadPoor/Mercury_In_Retrograde-8b-Model-Stock/6a21381b-426d-4a5d-ad6d-2aeb57ed14c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Mercury_In_Retrograde-8b-Model-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mercury_In_Retrograde-8b-Model-Stock", - "id": "DreadPoor/Mercury_In_Retrograde-8b-Model-Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7296 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Minthy-8B-Model_Stock/03a8091c-473e-4fbe-af70-35f791a23a0f.json b/data/hfopenllm_v2/DreadPoor/Minthy-8B-Model_Stock/03a8091c-473e-4fbe-af70-35f791a23a0f.json deleted file mode 100644 index 8bace15e3..000000000 --- a/data/hfopenllm_v2/DreadPoor/Minthy-8B-Model_Stock/03a8091c-473e-4fbe-af70-35f791a23a0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Minthy-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minthy-8B-Model_Stock", - "id": "DreadPoor/Minthy-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7658 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5353 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4094 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Minthy_ALT-8B-Model_Stock/ed75e9ed-841b-4783-a201-bc72651afd0a.json b/data/hfopenllm_v2/DreadPoor/Minthy_ALT-8B-Model_Stock/ed75e9ed-841b-4783-a201-bc72651afd0a.json deleted file mode 100644 index bb6a90a55..000000000 --- a/data/hfopenllm_v2/DreadPoor/Minthy_ALT-8B-Model_Stock/ed75e9ed-841b-4783-a201-bc72651afd0a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Minthy_ALT-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minthy_ALT-8B-Model_Stock", - "id": "DreadPoor/Minthy_ALT-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6992 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5375 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Minthy_V2-8B-Model_Stock/38cd418c-9770-49d2-8b30-ac47e445cee3.json b/data/hfopenllm_v2/DreadPoor/Minthy_V2-8B-Model_Stock/38cd418c-9770-49d2-8b30-ac47e445cee3.json deleted file mode 100644 index f602119ec..000000000 --- a/data/hfopenllm_v2/DreadPoor/Minthy_V2-8B-Model_Stock/38cd418c-9770-49d2-8b30-ac47e445cee3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Minthy_V2-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minthy_V2-8B-Model_Stock", - "id": "DreadPoor/Minthy_V2-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7126 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5491 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Minus_Penus-8B-Model_Stock/d49b6a48-ae81-467d-87c5-b17f9ca306f8.json b/data/hfopenllm_v2/DreadPoor/Minus_Penus-8B-Model_Stock/d49b6a48-ae81-467d-87c5-b17f9ca306f8.json deleted file mode 100644 index 214eb3bc5..000000000 --- a/data/hfopenllm_v2/DreadPoor/Minus_Penus-8B-Model_Stock/d49b6a48-ae81-467d-87c5-b17f9ca306f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Minus_Penus-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minus_Penus-8B-Model_Stock", - "id": "DreadPoor/Minus_Penus-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7311 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5344 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2002 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Morphing-8B-Model_Stock/39b7e250-9f71-4833-941e-85692a48b6e6.json b/data/hfopenllm_v2/DreadPoor/Morphing-8B-Model_Stock/39b7e250-9f71-4833-941e-85692a48b6e6.json deleted file mode 100644 index 0295f0f91..000000000 --- a/data/hfopenllm_v2/DreadPoor/Morphing-8B-Model_Stock/39b7e250-9f71-4833-941e-85692a48b6e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Morphing-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Morphing-8B-Model_Stock", - "id": "DreadPoor/Morphing-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Not_Even_My_Final_Form-8B-Model_Stock/c0d102a2-ff8c-45ac-a825-31472b98b871.json b/data/hfopenllm_v2/DreadPoor/Not_Even_My_Final_Form-8B-Model_Stock/c0d102a2-ff8c-45ac-a825-31472b98b871.json deleted file mode 100644 index c07ee195d..000000000 --- a/data/hfopenllm_v2/DreadPoor/Not_Even_My_Final_Form-8B-Model_Stock/c0d102a2-ff8c-45ac-a825-31472b98b871.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Not_Even_My_Final_Form-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Not_Even_My_Final_Form-8B-Model_Stock", - "id": "DreadPoor/Not_Even_My_Final_Form-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7722 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5351 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4147 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Nother_One-8B-Model_Stock/7c5674a8-6a1c-483e-be9c-b0a6d00d3ac4.json b/data/hfopenllm_v2/DreadPoor/Nother_One-8B-Model_Stock/7c5674a8-6a1c-483e-be9c-b0a6d00d3ac4.json deleted file mode 100644 index 3a365d7fc..000000000 --- a/data/hfopenllm_v2/DreadPoor/Nother_One-8B-Model_Stock/7c5674a8-6a1c-483e-be9c-b0a6d00d3ac4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Nother_One-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nother_One-8B-Model_Stock", - "id": "DreadPoor/Nother_One-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6863 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5205 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3595 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Noxis-8B-LINEAR/d34b899e-b067-4c9c-9fa2-439f8b2d589d.json b/data/hfopenllm_v2/DreadPoor/Noxis-8B-LINEAR/d34b899e-b067-4c9c-9fa2-439f8b2d589d.json deleted file mode 100644 index 73cd484d6..000000000 --- a/data/hfopenllm_v2/DreadPoor/Noxis-8B-LINEAR/d34b899e-b067-4c9c-9fa2-439f8b2d589d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Noxis-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Noxis-8B-LINEAR", - "id": "DreadPoor/Noxis-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6913 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DreadPoor/Nullsworn-12B-LINEAR/8c7b2332-510b-42d3-bcbb-e177c35d27d5.json b/data/hfopenllm_v2/DreadPoor/Nullsworn-12B-LINEAR/8c7b2332-510b-42d3-bcbb-e177c35d27d5.json deleted file mode 100644 index 4c95be2fd..000000000 --- a/data/hfopenllm_v2/DreadPoor/Nullsworn-12B-LINEAR/8c7b2332-510b-42d3-bcbb-e177c35d27d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Nullsworn-12B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nullsworn-12B-LINEAR", - "id": "DreadPoor/Nullsworn-12B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5483 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3645 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Nwah-8B-Model_Stock/685f107f-e431-4dba-a117-8d6f1dd2c296.json b/data/hfopenllm_v2/DreadPoor/Nwah-8B-Model_Stock/685f107f-e431-4dba-a117-8d6f1dd2c296.json deleted file mode 100644 index ed00a7392..000000000 --- 
a/data/hfopenllm_v2/DreadPoor/Nwah-8B-Model_Stock/685f107f-e431-4dba-a117-8d6f1dd2c296.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Nwah-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nwah-8B-Model_Stock", - "id": "DreadPoor/Nwah-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7716 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/ONeil-model_stock-8B/e1570804-85b6-4518-a099-5f21ab27d12c.json b/data/hfopenllm_v2/DreadPoor/ONeil-model_stock-8B/e1570804-85b6-4518-a099-5f21ab27d12c.json deleted file mode 100644 index 43888a5d9..000000000 --- a/data/hfopenllm_v2/DreadPoor/ONeil-model_stock-8B/e1570804-85b6-4518-a099-5f21ab27d12c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_ONeil-model_stock-8B/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ONeil-model_stock-8B", - "id": "DreadPoor/ONeil-model_stock-8B", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Oh_Boy-8B-LINEAR/a779ebec-76ab-4a1e-aa4f-d1a6adfe2d5c.json b/data/hfopenllm_v2/DreadPoor/Oh_Boy-8B-LINEAR/a779ebec-76ab-4a1e-aa4f-d1a6adfe2d5c.json deleted file mode 100644 index a4b064bc6..000000000 --- a/data/hfopenllm_v2/DreadPoor/Oh_Boy-8B-LINEAR/a779ebec-76ab-4a1e-aa4f-d1a6adfe2d5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Oh_Boy-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Oh_Boy-8B-LINEAR", - "id": "DreadPoor/Oh_Boy-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7503 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5375 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4108 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3849 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/OrangeJ-8B-Model_Stock/1ed7f6ed-d04d-4cfc-a36a-1ef0f72d4814.json b/data/hfopenllm_v2/DreadPoor/OrangeJ-8B-Model_Stock/1ed7f6ed-d04d-4cfc-a36a-1ef0f72d4814.json deleted file mode 100644 index 009d6a3f3..000000000 --- a/data/hfopenllm_v2/DreadPoor/OrangeJ-8B-Model_Stock/1ed7f6ed-d04d-4cfc-a36a-1ef0f72d4814.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_OrangeJ-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OrangeJ-8B-Model_Stock", - "id": "DreadPoor/OrangeJ-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7841 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5413 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4028 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3969 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR-lorablated/c901a9ee-069a-4e3e-ac52-3017d67d8800.json b/data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR-lorablated/c901a9ee-069a-4e3e-ac52-3017d67d8800.json deleted file mode 100644 index fbaecf746..000000000 --- a/data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR-lorablated/c901a9ee-069a-4e3e-ac52-3017d67d8800.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Promissum_Mane-8B-LINEAR-lorablated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Promissum_Mane-8B-LINEAR-lorablated", - "id": "DreadPoor/Promissum_Mane-8B-LINEAR-lorablated", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7156 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR/08317b59-ff74-43c8-bea5-2a266c38816e.json b/data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR/08317b59-ff74-43c8-bea5-2a266c38816e.json deleted file mode 100644 index c14c68ca6..000000000 --- a/data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR/08317b59-ff74-43c8-bea5-2a266c38816e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Promissum_Mane-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Promissum_Mane-8B-LINEAR", - "id": "DreadPoor/Promissum_Mane-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.715 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5458 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/RPMash-8B-Model_Stock/4106d4d3-344a-4c1f-b9ce-a3140d435013.json b/data/hfopenllm_v2/DreadPoor/RPMash-8B-Model_Stock/4106d4d3-344a-4c1f-b9ce-a3140d435013.json deleted file mode 100644 index 4c1772a72..000000000 --- a/data/hfopenllm_v2/DreadPoor/RPMash-8B-Model_Stock/4106d4d3-344a-4c1f-b9ce-a3140d435013.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_RPMash-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RPMash-8B-Model_Stock", - "id": "DreadPoor/RPMash-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5169 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4054 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/RPMash_V3-8B-Model_Stock/2b308fad-8494-4056-8b84-82733cd2710a.json b/data/hfopenllm_v2/DreadPoor/RPMash_V3-8B-Model_Stock/2b308fad-8494-4056-8b84-82733cd2710a.json deleted file mode 100644 index 1c26373d3..000000000 --- a/data/hfopenllm_v2/DreadPoor/RPMash_V3-8B-Model_Stock/2b308fad-8494-4056-8b84-82733cd2710a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_RPMash_V3-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RPMash_V3-8B-Model_Stock", - "id": "DreadPoor/RPMash_V3-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7049 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5217 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3614 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Rusted_Gold-8B-LINEAR/93c867d0-4f10-440c-838c-91d1633fe584.json b/data/hfopenllm_v2/DreadPoor/Rusted_Gold-8B-LINEAR/93c867d0-4f10-440c-838c-91d1633fe584.json deleted file mode 100644 index f8b506d6b..000000000 --- a/data/hfopenllm_v2/DreadPoor/Rusted_Gold-8B-LINEAR/93c867d0-4f10-440c-838c-91d1633fe584.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Rusted_Gold-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rusted_Gold-8B-LINEAR", - "id": "DreadPoor/Rusted_Gold-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7296 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5387 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-LINEAR/1a4a69c5-4acc-4ad9-adb2-bd9cf0fa2875.json b/data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-LINEAR/1a4a69c5-4acc-4ad9-adb2-bd9cf0fa2875.json deleted file mode 100644 index 928a50b08..000000000 --- a/data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-LINEAR/1a4a69c5-4acc-4ad9-adb2-bd9cf0fa2875.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Rusted_Platinum-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rusted_Platinum-8B-LINEAR", - "id": "DreadPoor/Rusted_Platinum-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5428 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1722 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3967 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-Model_Stock/151226ba-9744-45bc-b923-30df57f7aa3e.json b/data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-Model_Stock/151226ba-9744-45bc-b923-30df57f7aa3e.json deleted file mode 100644 index f71f0f094..000000000 --- a/data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-Model_Stock/151226ba-9744-45bc-b923-30df57f7aa3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Rusted_Platinum-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rusted_Platinum-8B-Model_Stock", - "id": "DreadPoor/Rusted_Platinum-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3741 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3546 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Sellen-8B-model_stock/98363657-0793-4eb3-94de-28961afc92ea.json b/data/hfopenllm_v2/DreadPoor/Sellen-8B-model_stock/98363657-0793-4eb3-94de-28961afc92ea.json deleted file mode 100644 index 1c2ffb709..000000000 --- a/data/hfopenllm_v2/DreadPoor/Sellen-8B-model_stock/98363657-0793-4eb3-94de-28961afc92ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Sellen-8B-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sellen-8B-model_stock", - "id": "DreadPoor/Sellen-8B-model_stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5232 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Something-8B-Model_Stock/a32b4ded-6bff-441e-afbd-736e6d8cce5c.json b/data/hfopenllm_v2/DreadPoor/Something-8B-Model_Stock/a32b4ded-6bff-441e-afbd-736e6d8cce5c.json deleted file mode 100644 index 7c20e3cb1..000000000 --- a/data/hfopenllm_v2/DreadPoor/Something-8B-Model_Stock/a32b4ded-6bff-441e-afbd-736e6d8cce5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Something-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Something-8B-Model_Stock", - "id": "DreadPoor/Something-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Spring_Dusk-8B-SCE/326bcf4a-02e9-4218-8bf2-55a94a79435e.json b/data/hfopenllm_v2/DreadPoor/Spring_Dusk-8B-SCE/326bcf4a-02e9-4218-8bf2-55a94a79435e.json deleted file mode 100644 index decc0df8c..000000000 --- a/data/hfopenllm_v2/DreadPoor/Spring_Dusk-8B-SCE/326bcf4a-02e9-4218-8bf2-55a94a79435e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Spring_Dusk-8B-SCE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Spring_Dusk-8B-SCE", - "id": "DreadPoor/Spring_Dusk-8B-SCE", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3436 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Summer_Dawn-8B-SCE/145facc2-ab11-4c68-b841-762e0ad9bd5a.json b/data/hfopenllm_v2/DreadPoor/Summer_Dawn-8B-SCE/145facc2-ab11-4c68-b841-762e0ad9bd5a.json deleted file mode 100644 index 4227020d6..000000000 --- a/data/hfopenllm_v2/DreadPoor/Summer_Dawn-8B-SCE/145facc2-ab11-4c68-b841-762e0ad9bd5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Summer_Dawn-8B-SCE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Summer_Dawn-8B-SCE", - "id": "DreadPoor/Summer_Dawn-8B-SCE", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6642 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1722 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DreadPoor/Summer_Dusk-8B-TIES/d3e6aae6-9284-4309-8d8c-02c9e797a58b.json b/data/hfopenllm_v2/DreadPoor/Summer_Dusk-8B-TIES/d3e6aae6-9284-4309-8d8c-02c9e797a58b.json deleted file mode 100644 index 563ad7232..000000000 --- a/data/hfopenllm_v2/DreadPoor/Summer_Dusk-8B-TIES/d3e6aae6-9284-4309-8d8c-02c9e797a58b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Summer_Dusk-8B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Summer_Dusk-8B-TIES", - "id": "DreadPoor/Summer_Dusk-8B-TIES", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4267 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3856 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-SCE/6ee8537c-90e8-4455-83ca-c8c375a5ead7.json b/data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-SCE/6ee8537c-90e8-4455-83ca-c8c375a5ead7.json deleted file mode 100644 index 9e3634f9a..000000000 --- 
a/data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-SCE/6ee8537c-90e8-4455-83ca-c8c375a5ead7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Summer_Rain-8B-SCE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Summer_Rain-8B-SCE", - "id": "DreadPoor/Summer_Rain-8B-SCE", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5459 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5846 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4477 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-TIES/6efbfb38-57e5-46c7-b765-f7d0356afb97.json b/data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-TIES/6efbfb38-57e5-46c7-b765-f7d0356afb97.json deleted file mode 100644 index 2e8e86b23..000000000 --- a/data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-TIES/6efbfb38-57e5-46c7-b765-f7d0356afb97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Summer_Rain-8B-TIES/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Summer_Rain-8B-TIES", - "id": "DreadPoor/Summer_Rain-8B-TIES", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5444 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5846 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4477 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Sun-8B-Model_Stock/f4d418d9-1089-452d-9c7f-4cc4712e6ac7.json b/data/hfopenllm_v2/DreadPoor/Sun-8B-Model_Stock/f4d418d9-1089-452d-9c7f-4cc4712e6ac7.json deleted file mode 100644 index 0c4fbb59f..000000000 --- a/data/hfopenllm_v2/DreadPoor/Sun-8B-Model_Stock/f4d418d9-1089-452d-9c7f-4cc4712e6ac7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Sun-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "Sun-8B-Model_Stock", - "id": "DreadPoor/Sun-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7758 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5264 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3835 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Sweetened_Condensed_Milk-8B-Model_Stock/1c9b325b-92b3-499a-a3ea-026269c63c88.json b/data/hfopenllm_v2/DreadPoor/Sweetened_Condensed_Milk-8B-Model_Stock/1c9b325b-92b3-499a-a3ea-026269c63c88.json deleted file mode 100644 index 28f4c0025..000000000 --- a/data/hfopenllm_v2/DreadPoor/Sweetened_Condensed_Milk-8B-Model_Stock/1c9b325b-92b3-499a-a3ea-026269c63c88.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Sweetened_Condensed_Milk-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sweetened_Condensed_Milk-8B-Model_Stock", - "id": "DreadPoor/Sweetened_Condensed_Milk-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7417 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4107 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3848 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/TEST02-Ignore/c546ccde-cef3-4de2-a49f-24517d76dde5.json b/data/hfopenllm_v2/DreadPoor/TEST02-Ignore/c546ccde-cef3-4de2-a49f-24517d76dde5.json deleted file mode 100644 index 875b97c44..000000000 --- a/data/hfopenllm_v2/DreadPoor/TEST02-Ignore/c546ccde-cef3-4de2-a49f-24517d76dde5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_TEST02-Ignore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TEST02-Ignore", - "id": "DreadPoor/TEST02-Ignore", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5602 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3468 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/TEST03-ignore/e85d3ccf-f48d-4e5c-b893-771a107773d4.json b/data/hfopenllm_v2/DreadPoor/TEST03-ignore/e85d3ccf-f48d-4e5c-b893-771a107773d4.json deleted file mode 100644 index f463020e8..000000000 --- a/data/hfopenllm_v2/DreadPoor/TEST03-ignore/e85d3ccf-f48d-4e5c-b893-771a107773d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_TEST03-ignore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TEST03-ignore", - "id": "DreadPoor/TEST03-ignore", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6967 - } - }, - { - "evaluation_name": "BBH", 
- "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5383 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1654 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/TEST06-ignore/b8d22ade-874e-4ff3-9fcd-dbe14220d48b.json b/data/hfopenllm_v2/DreadPoor/TEST06-ignore/b8d22ade-874e-4ff3-9fcd-dbe14220d48b.json deleted file mode 100644 index b39933b07..000000000 --- a/data/hfopenllm_v2/DreadPoor/TEST06-ignore/b8d22ade-874e-4ff3-9fcd-dbe14220d48b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_TEST06-ignore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TEST06-ignore", - "id": "DreadPoor/TEST06-ignore", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7323 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/TEST07-ignore/97e8e7e2-74a4-42a5-a0b1-250e47d3c3e6.json b/data/hfopenllm_v2/DreadPoor/TEST07-ignore/97e8e7e2-74a4-42a5-a0b1-250e47d3c3e6.json deleted file mode 100644 index 42a699d62..000000000 --- a/data/hfopenllm_v2/DreadPoor/TEST07-ignore/97e8e7e2-74a4-42a5-a0b1-250e47d3c3e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_TEST07-ignore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TEST07-ignore", - "id": "DreadPoor/TEST07-ignore", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5561 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1662 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4094 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/TEST08-ignore/b2d56bb6-a726-4e47-8bc6-c016a51aac5c.json b/data/hfopenllm_v2/DreadPoor/TEST08-ignore/b2d56bb6-a726-4e47-8bc6-c016a51aac5c.json deleted file mode 100644 index 9477b8a75..000000000 --- a/data/hfopenllm_v2/DreadPoor/TEST08-ignore/b2d56bb6-a726-4e47-8bc6-c016a51aac5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_TEST08-ignore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TEST08-ignore", - "id": "DreadPoor/TEST08-ignore", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7467 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5454 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Trinas_Nectar-8B-model_stock/3366f6d8-41bc-4c2c-a72c-bc0fd7dc8dd2.json b/data/hfopenllm_v2/DreadPoor/Trinas_Nectar-8B-model_stock/3366f6d8-41bc-4c2c-a72c-bc0fd7dc8dd2.json deleted file mode 100644 index bf826442f..000000000 --- a/data/hfopenllm_v2/DreadPoor/Trinas_Nectar-8B-model_stock/3366f6d8-41bc-4c2c-a72c-bc0fd7dc8dd2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Trinas_Nectar-8B-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Trinas_Nectar-8B-model_stock", - "id": "DreadPoor/Trinas_Nectar-8B-model_stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7259 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1526 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/UNTESTED-VENN_1.2-8B-Model_Stock/7ba52efb-3890-4691-8740-9f051f1f645e.json b/data/hfopenllm_v2/DreadPoor/UNTESTED-VENN_1.2-8B-Model_Stock/7ba52efb-3890-4691-8740-9f051f1f645e.json deleted file mode 100644 index 44c3145e9..000000000 --- a/data/hfopenllm_v2/DreadPoor/UNTESTED-VENN_1.2-8B-Model_Stock/7ba52efb-3890-4691-8740-9f051f1f645e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_UNTESTED-VENN_1.2-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "UNTESTED-VENN_1.2-8B-Model_Stock", - "id": "DreadPoor/UNTESTED-VENN_1.2-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4718 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5475 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/VENN_1.2-8B-Model_Stock/7b192b49-057e-418a-b47d-44b0ec82a6b6.json b/data/hfopenllm_v2/DreadPoor/VENN_1.2-8B-Model_Stock/7b192b49-057e-418a-b47d-44b0ec82a6b6.json deleted file mode 100644 index 77da6fb17..000000000 --- a/data/hfopenllm_v2/DreadPoor/VENN_1.2-8B-Model_Stock/7b192b49-057e-418a-b47d-44b0ec82a6b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_VENN_1.2-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VENN_1.2-8B-Model_Stock", - "id": "DreadPoor/VENN_1.2-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7226 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5459 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.42 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3721 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/WIP-Acacia-8B-Model_Stock/f2120d53-bef6-44d6-84a6-a6f8e3537188.json b/data/hfopenllm_v2/DreadPoor/WIP-Acacia-8B-Model_Stock/f2120d53-bef6-44d6-84a6-a6f8e3537188.json deleted file mode 100644 index aadd0095c..000000000 --- a/data/hfopenllm_v2/DreadPoor/WIP-Acacia-8B-Model_Stock/f2120d53-bef6-44d6-84a6-a6f8e3537188.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_WIP-Acacia-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WIP-Acacia-8B-Model_Stock", - "id": "DreadPoor/WIP-Acacia-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6246 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5195 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/WIP_Damascus-8B-TIES/f5408aa9-85c8-46e5-b225-0480b2e18e97.json b/data/hfopenllm_v2/DreadPoor/WIP_Damascus-8B-TIES/f5408aa9-85c8-46e5-b225-0480b2e18e97.json deleted file mode 100644 index e05a5b321..000000000 --- a/data/hfopenllm_v2/DreadPoor/WIP_Damascus-8B-TIES/f5408aa9-85c8-46e5-b225-0480b2e18e97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_WIP_Damascus-8B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WIP_Damascus-8B-TIES", - "id": "DreadPoor/WIP_Damascus-8B-TIES", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4776 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1654 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - } - ] -} \ No newline at 
end of file diff --git a/data/hfopenllm_v2/DreadPoor/Wannabe-8B-Model_Stock/c1918f55-286c-4b29-ac53-2ee8f9d36d9e.json b/data/hfopenllm_v2/DreadPoor/Wannabe-8B-Model_Stock/c1918f55-286c-4b29-ac53-2ee8f9d36d9e.json deleted file mode 100644 index f551e73cc..000000000 --- a/data/hfopenllm_v2/DreadPoor/Wannabe-8B-Model_Stock/c1918f55-286c-4b29-ac53-2ee8f9d36d9e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Wannabe-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Wannabe-8B-Model_Stock", - "id": "DreadPoor/Wannabe-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7205 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/What_A_Thrill-8B-Model_Stock/52659d37-67f8-45b8-88e4-11917dc90488.json b/data/hfopenllm_v2/DreadPoor/What_A_Thrill-8B-Model_Stock/52659d37-67f8-45b8-88e4-11917dc90488.json 
deleted file mode 100644 index 00ab3ed08..000000000 --- a/data/hfopenllm_v2/DreadPoor/What_A_Thrill-8B-Model_Stock/52659d37-67f8-45b8-88e4-11917dc90488.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_What_A_Thrill-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "What_A_Thrill-8B-Model_Stock", - "id": "DreadPoor/What_A_Thrill-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5311 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Winter-8B-SCE/556ae77c-effe-44ab-ac4a-1ad7cbd7c363.json b/data/hfopenllm_v2/DreadPoor/Winter-8B-SCE/556ae77c-effe-44ab-ac4a-1ad7cbd7c363.json deleted file mode 100644 index b9a6a545d..000000000 --- a/data/hfopenllm_v2/DreadPoor/Winter-8B-SCE/556ae77c-effe-44ab-ac4a-1ad7cbd7c363.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/DreadPoor_Winter-8B-SCE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Winter-8B-SCE", - "id": "DreadPoor/Winter-8B-SCE", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7536 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3839 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Winter_Dawn-8B-TIES/048fc971-3baf-4740-a132-2f9476d01b7a.json b/data/hfopenllm_v2/DreadPoor/Winter_Dawn-8B-TIES/048fc971-3baf-4740-a132-2f9476d01b7a.json deleted file mode 100644 index f48ac4b48..000000000 --- a/data/hfopenllm_v2/DreadPoor/Winter_Dawn-8B-TIES/048fc971-3baf-4740-a132-2f9476d01b7a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Winter_Dawn-8B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Winter_Dawn-8B-TIES", - "id": "DreadPoor/Winter_Dawn-8B-TIES", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5309 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Winter_Dusk-8B-TIES/abd28d25-01e0-474d-be35-08d816d281f5.json b/data/hfopenllm_v2/DreadPoor/Winter_Dusk-8B-TIES/abd28d25-01e0-474d-be35-08d816d281f5.json deleted file mode 100644 index 1470af900..000000000 --- a/data/hfopenllm_v2/DreadPoor/Winter_Dusk-8B-TIES/abd28d25-01e0-474d-be35-08d816d281f5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Winter_Dusk-8B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Winter_Dusk-8B-TIES", - "id": "DreadPoor/Winter_Dusk-8B-TIES", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7153 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4952 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3478 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Winter_Night-8B-Model_Stock/17f49724-6553-4baa-b354-45ffd0f2c844.json b/data/hfopenllm_v2/DreadPoor/Winter_Night-8B-Model_Stock/17f49724-6553-4baa-b354-45ffd0f2c844.json deleted file mode 100644 index 2969a6c18..000000000 --- a/data/hfopenllm_v2/DreadPoor/Winter_Night-8B-Model_Stock/17f49724-6553-4baa-b354-45ffd0f2c844.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Winter_Night-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Winter_Night-8B-Model_Stock", - "id": "DreadPoor/Winter_Night-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Yafune-8B-Model_Stock/3e60d982-d7d5-432b-962e-b7734cc90534.json b/data/hfopenllm_v2/DreadPoor/Yafune-8B-Model_Stock/3e60d982-d7d5-432b-962e-b7734cc90534.json deleted file mode 100644 index 25b9ce545..000000000 --- a/data/hfopenllm_v2/DreadPoor/Yafune-8B-Model_Stock/3e60d982-d7d5-432b-962e-b7734cc90534.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Yafune-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yafune-8B-Model_Stock", - "id": "DreadPoor/Yafune-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.7533 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5467 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1662 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Yearn_V3-8B-Model_Stock/79a0fdf3-b432-4598-be62-f9eb57fa5a43.json b/data/hfopenllm_v2/DreadPoor/Yearn_V3-8B-Model_Stock/79a0fdf3-b432-4598-be62-f9eb57fa5a43.json deleted file mode 100644 index 74619217a..000000000 --- a/data/hfopenllm_v2/DreadPoor/Yearn_V3-8B-Model_Stock/79a0fdf3-b432-4598-be62-f9eb57fa5a43.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Yearn_V3-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yearn_V3-8B-Model_Stock", - "id": "DreadPoor/Yearn_V3-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5322 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1896 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3802 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/ZEUS-8B-V17-Abliterated_ALT/662566e0-2af3-40d6-90de-9b361bcae355.json b/data/hfopenllm_v2/DreadPoor/ZEUS-8B-V17-Abliterated_ALT/662566e0-2af3-40d6-90de-9b361bcae355.json deleted file mode 100644 index 02d4db078..000000000 --- a/data/hfopenllm_v2/DreadPoor/ZEUS-8B-V17-Abliterated_ALT/662566e0-2af3-40d6-90de-9b361bcae355.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_ZEUS-8B-V17-Abliterated_ALT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V17-Abliterated_ALT", - "id": "DreadPoor/ZEUS-8B-V17-Abliterated_ALT", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5511 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5231 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1903 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4149 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Zelus-8B-Model_Stock/d81c0035-a0b1-426c-9080-8ccbf745642b.json b/data/hfopenllm_v2/DreadPoor/Zelus-8B-Model_Stock/d81c0035-a0b1-426c-9080-8ccbf745642b.json deleted file mode 100644 index 03e62bfa1..000000000 --- a/data/hfopenllm_v2/DreadPoor/Zelus-8B-Model_Stock/d81c0035-a0b1-426c-9080-8ccbf745642b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Zelus-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zelus-8B-Model_Stock", - "id": "DreadPoor/Zelus-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7788 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5307 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH 
Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4214 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3841 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Zelus_V2-8B-Model_Stock/100bc243-158c-4e5c-918b-1439bf26fee8.json b/data/hfopenllm_v2/DreadPoor/Zelus_V2-8B-Model_Stock/100bc243-158c-4e5c-918b-1439bf26fee8.json deleted file mode 100644 index 059e73847..000000000 --- a/data/hfopenllm_v2/DreadPoor/Zelus_V2-8B-Model_Stock/100bc243-158c-4e5c-918b-1439bf26fee8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Zelus_V2-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zelus_V2-8B-Model_Stock", - "id": "DreadPoor/Zelus_V2-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7898 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5345 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3961 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3833 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/felix_dies-mistral-7B-model_stock/45e32080-1464-40e0-a232-310fdda967eb.json b/data/hfopenllm_v2/DreadPoor/felix_dies-mistral-7B-model_stock/45e32080-1464-40e0-a232-310fdda967eb.json deleted file mode 100644 index c0877447b..000000000 --- a/data/hfopenllm_v2/DreadPoor/felix_dies-mistral-7B-model_stock/45e32080-1464-40e0-a232-310fdda967eb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_felix_dies-mistral-7B-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "felix_dies-mistral-7B-model_stock", - "id": "DreadPoor/felix_dies-mistral-7B-model_stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3008 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4901 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4518 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/hakuchido-8B-MODEL_STOCK/e89b279f-d548-4aa8-b5e5-0bffdd98b840.json b/data/hfopenllm_v2/DreadPoor/hakuchido-8B-MODEL_STOCK/e89b279f-d548-4aa8-b5e5-0bffdd98b840.json deleted file mode 100644 index 81722a6a0..000000000 --- a/data/hfopenllm_v2/DreadPoor/hakuchido-8B-MODEL_STOCK/e89b279f-d548-4aa8-b5e5-0bffdd98b840.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_hakuchido-8B-MODEL_STOCK/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "hakuchido-8B-MODEL_STOCK", - "id": "DreadPoor/hakuchido-8B-MODEL_STOCK", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4175 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/ichor-8B-Model_Stock/777a53f9-891c-4f9e-99a8-bb1988f61f19.json b/data/hfopenllm_v2/DreadPoor/ichor-8B-Model_Stock/777a53f9-891c-4f9e-99a8-bb1988f61f19.json deleted file mode 100644 index c35d0a556..000000000 --- a/data/hfopenllm_v2/DreadPoor/ichor-8B-Model_Stock/777a53f9-891c-4f9e-99a8-bb1988f61f19.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_ichor-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ichor-8B-Model_Stock", - "id": "DreadPoor/ichor-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5386 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.4212 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3151 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/ichor_1.1-8B-Model_Stock/f15846b1-8eaa-411b-88f7-25064161af4e.json b/data/hfopenllm_v2/DreadPoor/ichor_1.1-8B-Model_Stock/f15846b1-8eaa-411b-88f7-25064161af4e.json deleted file mode 100644 index 07bce712f..000000000 --- a/data/hfopenllm_v2/DreadPoor/ichor_1.1-8B-Model_Stock/f15846b1-8eaa-411b-88f7-25064161af4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_ichor_1.1-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ichor_1.1-8B-Model_Stock", - "id": "DreadPoor/ichor_1.1-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8096 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5281 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3856 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/inexpertus-8B-Model_Stock/e803fc85-fb98-4db8-aab0-a63100dcd5fc.json b/data/hfopenllm_v2/DreadPoor/inexpertus-8B-Model_Stock/e803fc85-fb98-4db8-aab0-a63100dcd5fc.json deleted file mode 100644 index dd5b49bf4..000000000 --- a/data/hfopenllm_v2/DreadPoor/inexpertus-8B-Model_Stock/e803fc85-fb98-4db8-aab0-a63100dcd5fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_inexpertus-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "inexpertus-8B-Model_Stock", - "id": "DreadPoor/inexpertus-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.528 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - 
} - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/inexpertus_1.1-8B-LINEAR/50620749-5ecf-41eb-a131-611675560e07.json b/data/hfopenllm_v2/DreadPoor/inexpertus_1.1-8B-LINEAR/50620749-5ecf-41eb-a131-611675560e07.json deleted file mode 100644 index 0f7b2d86a..000000000 --- a/data/hfopenllm_v2/DreadPoor/inexpertus_1.1-8B-LINEAR/50620749-5ecf-41eb-a131-611675560e07.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_inexpertus_1.1-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "inexpertus_1.1-8B-LINEAR", - "id": "DreadPoor/inexpertus_1.1-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3827 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/inexpertus_1.2-8B-LINEAR/2d40a551-6440-4d71-87e4-639d486c1c5e.json 
b/data/hfopenllm_v2/DreadPoor/inexpertus_1.2-8B-LINEAR/2d40a551-6440-4d71-87e4-639d486c1c5e.json deleted file mode 100644 index 9628c12f8..000000000 --- a/data/hfopenllm_v2/DreadPoor/inexpertus_1.2-8B-LINEAR/2d40a551-6440-4d71-87e4-639d486c1c5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_inexpertus_1.2-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "inexpertus_1.2-8B-LINEAR", - "id": "DreadPoor/inexpertus_1.2-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4133 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3788 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/mergekit-nuslerp-nqzkedi/22235942-2e3e-4ef4-b7a0-5800f507571a.json b/data/hfopenllm_v2/DreadPoor/mergekit-nuslerp-nqzkedi/22235942-2e3e-4ef4-b7a0-5800f507571a.json deleted file mode 100644 index 0f3089a37..000000000 --- 
a/data/hfopenllm_v2/DreadPoor/mergekit-nuslerp-nqzkedi/22235942-2e3e-4ef4-b7a0-5800f507571a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_mergekit-nuslerp-nqzkedi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-nuslerp-nqzkedi", - "id": "DreadPoor/mergekit-nuslerp-nqzkedi", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7765 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5362 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1881 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3919 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/remember_to_breathe-8b-Model-Stock/ac06867d-3a34-42f6-9e2e-226cf86748f6.json b/data/hfopenllm_v2/DreadPoor/remember_to_breathe-8b-Model-Stock/ac06867d-3a34-42f6-9e2e-226cf86748f6.json deleted file mode 100644 index 08420f429..000000000 --- a/data/hfopenllm_v2/DreadPoor/remember_to_breathe-8b-Model-Stock/ac06867d-3a34-42f6-9e2e-226cf86748f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/DreadPoor_remember_to_breathe-8b-Model-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "remember_to_breathe-8b-Model-Stock", - "id": "DreadPoor/remember_to_breathe-8b-Model-Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7104 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1488 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/test/394f1fc8-dc2c-4ff9-9ad0-7b3a8a8ddeb3.json b/data/hfopenllm_v2/DreadPoor/test/394f1fc8-dc2c-4ff9-9ad0-7b3a8a8ddeb3.json deleted file mode 100644 index 4d6f24d90..000000000 --- a/data/hfopenllm_v2/DreadPoor/test/394f1fc8-dc2c-4ff9-9ad0-7b3a8a8ddeb3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test", - "id": "DreadPoor/test", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4937 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5372 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/test_ALT/03e52d4f-78d7-453c-9685-844dd1636904.json b/data/hfopenllm_v2/DreadPoor/test_ALT/03e52d4f-78d7-453c-9685-844dd1636904.json deleted file mode 100644 index 14344c072..000000000 --- a/data/hfopenllm_v2/DreadPoor/test_ALT/03e52d4f-78d7-453c-9685-844dd1636904.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_test_ALT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test_ALT", - "id": "DreadPoor/test_ALT", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4997 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3492 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/tests_pending-do_not_use_yet/3ce136d5-be81-4b8c-a7dc-4e1346935d35.json b/data/hfopenllm_v2/DreadPoor/tests_pending-do_not_use_yet/3ce136d5-be81-4b8c-a7dc-4e1346935d35.json deleted file mode 100644 index e83a16a2c..000000000 --- a/data/hfopenllm_v2/DreadPoor/tests_pending-do_not_use_yet/3ce136d5-be81-4b8c-a7dc-4e1346935d35.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_tests_pending-do_not_use_yet/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tests_pending-do_not_use_yet", - "id": "DreadPoor/tests_pending-do_not_use_yet", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7691 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4005 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3827 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ECE-ILAB-PRYMMAL/ILAB-Merging-3B-V2/fb35accf-0c5d-4f72-8d73-ba366a41a76d.json b/data/hfopenllm_v2/ECE-ILAB-PRYMMAL/ILAB-Merging-3B-V2/fb35accf-0c5d-4f72-8d73-ba366a41a76d.json deleted file mode 100644 index 2a680a23b..000000000 --- a/data/hfopenllm_v2/ECE-ILAB-PRYMMAL/ILAB-Merging-3B-V2/fb35accf-0c5d-4f72-8d73-ba366a41a76d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ECE-ILAB-PRYMMAL_ILAB-Merging-3B-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ILAB-Merging-3B-V2", - "id": "ECE-ILAB-PRYMMAL/ILAB-Merging-3B-V2", - "developer": "ECE-ILAB-PRYMMAL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5402 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3861 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-14B-v0.2/75e5ca5d-cce1-4463-b398-553399ce6833.json b/data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-14B-v0.2/75e5ca5d-cce1-4463-b398-553399ce6833.json deleted file mode 100644 index 12b4347c9..000000000 --- a/data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-14B-v0.2/75e5ca5d-cce1-4463-b398-553399ce6833.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EVA-UNIT-01_EVA-Qwen2.5-14B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EVA-Qwen2.5-14B-v0.2", - "id": "EVA-UNIT-01/EVA-Qwen2.5-14B-v0.2", - "developer": "EVA-UNIT-01", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4038 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5135 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-72B-v0.2/c426bae7-b98d-4343-b419-ac8206196a95.json b/data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-72B-v0.2/c426bae7-b98d-4343-b419-ac8206196a95.json deleted file mode 100644 index 557addc60..000000000 --- a/data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-72B-v0.2/c426bae7-b98d-4343-b419-ac8206196a95.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EVA-UNIT-01_EVA-Qwen2.5-72B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EVA-Qwen2.5-72B-v0.2", - "id": "EVA-UNIT-01/EVA-Qwen2.5-72B-v0.2", - "developer": "EVA-UNIT-01", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7088 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.472 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5813 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Edgerunners/meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16/b17de9f2-6f94-49f6-b908-fa983e8f8f9b.json b/data/hfopenllm_v2/Edgerunners/meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16/b17de9f2-6f94-49f6-b908-fa983e8f8f9b.json deleted file mode 100644 index 548eedb0d..000000000 --- a/data/hfopenllm_v2/Edgerunners/meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16/b17de9f2-6f94-49f6-b908-fa983e8f8f9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Edgerunners_meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16", - "id": "Edgerunners/meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16", - "developer": "Edgerunners", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7147 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3636 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/gpt-j-6b/58ba7ca1-8cca-4668-836b-824491d9cf01.json b/data/hfopenllm_v2/EleutherAI/gpt-j-6b/58ba7ca1-8cca-4668-836b-824491d9cf01.json deleted file mode 100644 index 3f4b4bd5e..000000000 --- a/data/hfopenllm_v2/EleutherAI/gpt-j-6b/58ba7ca1-8cca-4668-836b-824491d9cf01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_gpt-j-6b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-j-6b", - "id": "EleutherAI/gpt-j-6b", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTJForCausalLM", - "params_billions": 6.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2522 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3191 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1241 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/gpt-neo-1.3B/23da100a-13b9-42a7-ba79-234be551d0e4.json b/data/hfopenllm_v2/EleutherAI/gpt-neo-1.3B/23da100a-13b9-42a7-ba79-234be551d0e4.json deleted file mode 100644 index 14e41e155..000000000 --- a/data/hfopenllm_v2/EleutherAI/gpt-neo-1.3B/23da100a-13b9-42a7-ba79-234be551d0e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_gpt-neo-1.3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-neo-1.3B", - "id": "EleutherAI/gpt-neo-1.3B", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTNeoForCausalLM", - "params_billions": 1.366 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2079 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3039 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/gpt-neo-125m/2d0c12b9-cff8-4366-a3ce-7772e4c098c9.json b/data/hfopenllm_v2/EleutherAI/gpt-neo-125m/2d0c12b9-cff8-4366-a3ce-7772e4c098c9.json deleted file mode 100644 index 58ad6f3da..000000000 --- a/data/hfopenllm_v2/EleutherAI/gpt-neo-125m/2d0c12b9-cff8-4366-a3ce-7772e4c098c9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_gpt-neo-125m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-neo-125m", - "id": "EleutherAI/gpt-neo-125m", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTNeoForCausalLM", - "params_billions": 0.15 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3115 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1026 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/gpt-neo-2.7B/4b87eea2-169c-411e-9d15-caf6b7826590.json b/data/hfopenllm_v2/EleutherAI/gpt-neo-2.7B/4b87eea2-169c-411e-9d15-caf6b7826590.json deleted file mode 100644 index 409e0b82a..000000000 --- a/data/hfopenllm_v2/EleutherAI/gpt-neo-2.7B/4b87eea2-169c-411e-9d15-caf6b7826590.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_gpt-neo-2.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-neo-2.7B", - "id": "EleutherAI/gpt-neo-2.7B", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTNeoForCausalLM", - "params_billions": 2.718 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.259 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/gpt-neox-20b/62a3cce2-4ff5-4dc9-beab-a06001fd82d9.json b/data/hfopenllm_v2/EleutherAI/gpt-neox-20b/62a3cce2-4ff5-4dc9-beab-a06001fd82d9.json deleted file mode 100644 index 8d26f484b..000000000 --- a/data/hfopenllm_v2/EleutherAI/gpt-neox-20b/62a3cce2-4ff5-4dc9-beab-a06001fd82d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_gpt-neox-20b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-neox-20b", - "id": "EleutherAI/gpt-neox-20b", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 20.739 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2587 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3165 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1155 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/pythia-1.4b/0e5961e1-af27-4eee-8b9b-c82ee4ab61b1.json b/data/hfopenllm_v2/EleutherAI/pythia-1.4b/0e5961e1-af27-4eee-8b9b-c82ee4ab61b1.json deleted file mode 100644 index 0e25cd321..000000000 --- a/data/hfopenllm_v2/EleutherAI/pythia-1.4b/0e5961e1-af27-4eee-8b9b-c82ee4ab61b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-1.4b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-1.4b", - "id": "EleutherAI/pythia-1.4b", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 1.515 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/pythia-12b/b62352d4-e3b0-4b4d-8d68-e2d973d820c1.json b/data/hfopenllm_v2/EleutherAI/pythia-12b/b62352d4-e3b0-4b4d-8d68-e2d973d820c1.json deleted file mode 100644 index 
c70b94beb..000000000 --- a/data/hfopenllm_v2/EleutherAI/pythia-12b/b62352d4-e3b0-4b4d-8d68-e2d973d820c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-12b", - "id": "EleutherAI/pythia-12b", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 12.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2471 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/pythia-160m/7fadc486-767e-45ef-979d-74ecb858cb99.json b/data/hfopenllm_v2/EleutherAI/pythia-160m/7fadc486-767e-45ef-979d-74ecb858cb99.json deleted file mode 100644 index fc912f961..000000000 --- a/data/hfopenllm_v2/EleutherAI/pythia-160m/7fadc486-767e-45ef-979d-74ecb858cb99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-160m/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-160m", - "id": "EleutherAI/pythia-160m", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 0.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1816 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/pythia-1b/d0628e6f-a6f3-42eb-b9fc-e880ae8c0688.json b/data/hfopenllm_v2/EleutherAI/pythia-1b/d0628e6f-a6f3-42eb-b9fc-e880ae8c0688.json deleted file mode 100644 index daea284b8..000000000 --- a/data/hfopenllm_v2/EleutherAI/pythia-1b/d0628e6f-a6f3-42eb-b9fc-e880ae8c0688.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-1b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-1b", - "id": "EleutherAI/pythia-1b", - "developer": 
"EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 1.079 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3552 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1136 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/pythia-2.8b/0999a066-1151-4445-b130-00d8fe4a516e.json b/data/hfopenllm_v2/EleutherAI/pythia-2.8b/0999a066-1151-4445-b130-00d8fe4a516e.json deleted file mode 100644 index 7e836076c..000000000 --- a/data/hfopenllm_v2/EleutherAI/pythia-2.8b/0999a066-1151-4445-b130-00d8fe4a516e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-2.8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-2.8b", - "id": "EleutherAI/pythia-2.8b", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 2.909 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2173 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/pythia-410m/1efc09d8-6a5c-4d48-b76e-2e04ef97b676.json b/data/hfopenllm_v2/EleutherAI/pythia-410m/1efc09d8-6a5c-4d48-b76e-2e04ef97b676.json deleted file mode 100644 index 070f7c573..000000000 --- a/data/hfopenllm_v2/EleutherAI/pythia-410m/1efc09d8-6a5c-4d48-b76e-2e04ef97b676.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-410m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-410m", - "id": "EleutherAI/pythia-410m", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 0.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2195 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/pythia-6.9b/1a59412f-fe78-4ecf-8951-8f2996dd374f.json b/data/hfopenllm_v2/EleutherAI/pythia-6.9b/1a59412f-fe78-4ecf-8951-8f2996dd374f.json deleted file mode 100644 index 6f0eb37e2..000000000 --- a/data/hfopenllm_v2/EleutherAI/pythia-6.9b/1a59412f-fe78-4ecf-8951-8f2996dd374f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-6.9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-6.9b", - "id": "EleutherAI/pythia-6.9b", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 6.9 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3232 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-French-Llama-3-8B-v0.4/b5403311-2069-488d-af98-27da14496c15.json b/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-French-Llama-3-8B-v0.4/b5403311-2069-488d-af98-27da14496c15.json deleted file mode 100644 index 43c0cbc2f..000000000 --- a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-French-Llama-3-8B-v0.4/b5403311-2069-488d-af98-27da14496c15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Enno-Ai_EnnoAi-Pro-French-Llama-3-8B-v0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EnnoAi-Pro-French-Llama-3-8B-v0.4", - "id": "Enno-Ai/EnnoAi-Pro-French-Llama-3-8B-v0.4", - "developer": "Enno-Ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4189 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2635 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B-v0.3/6c10c176-b2b6-4216-91c0-1444944612f7.json b/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B-v0.3/6c10c176-b2b6-4216-91c0-1444944612f7.json deleted file mode 100644 index 3c0a9c0a6..000000000 --- a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B-v0.3/6c10c176-b2b6-4216-91c0-1444944612f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Enno-Ai_EnnoAi-Pro-Llama-3-8B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EnnoAi-Pro-Llama-3-8B-v0.3", - "id": "Enno-Ai/EnnoAi-Pro-Llama-3-8B-v0.3", - "developer": "Enno-Ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5083 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B/80ebd92e-d9b6-46ce-b77e-973c3f3f6051.json b/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B/80ebd92e-d9b6-46ce-b77e-973c3f3f6051.json deleted file mode 100644 index ae3f57ad1..000000000 --- a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B/80ebd92e-d9b6-46ce-b77e-973c3f3f6051.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Enno-Ai_EnnoAi-Pro-Llama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EnnoAi-Pro-Llama-3-8B", - "id": "Enno-Ai/EnnoAi-Pro-Llama-3-8B", - "developer": "Enno-Ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4152 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2151 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3.1-8B-v0.9/0418e36f-17ea-46a2-bfeb-91cc0ff719bf.json b/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3.1-8B-v0.9/0418e36f-17ea-46a2-bfeb-91cc0ff719bf.json deleted file mode 100644 index 145d529e4..000000000 --- a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3.1-8B-v0.9/0418e36f-17ea-46a2-bfeb-91cc0ff719bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Enno-Ai_EnnoAi-Pro-Llama-3.1-8B-v0.9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EnnoAi-Pro-Llama-3.1-8B-v0.9", - "id": "Enno-Ai/EnnoAi-Pro-Llama-3.1-8B-v0.9", - "developer": "Enno-Ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4689 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3832 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EnnoAi/EnnoAi-7B-French-Instruct-202502/4f5ba3fc-694a-45b1-ae9d-2c7d33e41519.json b/data/hfopenllm_v2/EnnoAi/EnnoAi-7B-French-Instruct-202502/4f5ba3fc-694a-45b1-ae9d-2c7d33e41519.json deleted file mode 100644 index 28ac26d3c..000000000 --- a/data/hfopenllm_v2/EnnoAi/EnnoAi-7B-French-Instruct-202502/4f5ba3fc-694a-45b1-ae9d-2c7d33e41519.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EnnoAi_EnnoAi-7B-French-Instruct-202502/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EnnoAi-7B-French-Instruct-202502", - "id": "EnnoAi/EnnoAi-7B-French-Instruct-202502", - "developer": "EnnoAi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4013 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EnnoAi/EnnoAi-Pro-Llama-3.1-8B-v1.0/8b0d1556-bbd5-49e3-b881-32224bc1aa9a.json b/data/hfopenllm_v2/EnnoAi/EnnoAi-Pro-Llama-3.1-8B-v1.0/8b0d1556-bbd5-49e3-b881-32224bc1aa9a.json deleted file mode 100644 index 610b4bd7f..000000000 --- a/data/hfopenllm_v2/EnnoAi/EnnoAi-Pro-Llama-3.1-8B-v1.0/8b0d1556-bbd5-49e3-b881-32224bc1aa9a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EnnoAi_EnnoAi-Pro-Llama-3.1-8B-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EnnoAi-Pro-Llama-3.1-8B-v1.0", - "id": "EnnoAi/EnnoAi-Pro-Llama-3.1-8B-v1.0", - "developer": "EnnoAi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3832 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Epiculous/Azure_Dusk-v0.2/524e634f-280c-4f3a-9f1f-bdda19fad740.json b/data/hfopenllm_v2/Epiculous/Azure_Dusk-v0.2/524e634f-280c-4f3a-9f1f-bdda19fad740.json deleted file mode 100644 index db3809f7b..000000000 --- a/data/hfopenllm_v2/Epiculous/Azure_Dusk-v0.2/524e634f-280c-4f3a-9f1f-bdda19fad740.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Epiculous_Azure_Dusk-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Azure_Dusk-v0.2", - "id": "Epiculous/Azure_Dusk-v0.2", - "developer": "Epiculous", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3467 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3835 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3034 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Epiculous/Crimson_Dawn-v0.2/cb82e92b-f207-4fbd-9bfe-43184769cdbd.json b/data/hfopenllm_v2/Epiculous/Crimson_Dawn-v0.2/cb82e92b-f207-4fbd-9bfe-43184769cdbd.json deleted file mode 100644 index 0f0cfc70f..000000000 --- a/data/hfopenllm_v2/Epiculous/Crimson_Dawn-v0.2/cb82e92b-f207-4fbd-9bfe-43184769cdbd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Epiculous_Crimson_Dawn-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Crimson_Dawn-v0.2", - "id": "Epiculous/Crimson_Dawn-v0.2", - "developer": "Epiculous", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3103 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4482 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4152 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2721 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Epiculous/NovaSpark/0b674103-4e55-41f4-accb-b7be73671801.json 
b/data/hfopenllm_v2/Epiculous/NovaSpark/0b674103-4e55-41f4-accb-b7be73671801.json deleted file mode 100644 index 74391a8d3..000000000 --- a/data/hfopenllm_v2/Epiculous/NovaSpark/0b674103-4e55-41f4-accb-b7be73671801.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Epiculous_NovaSpark/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NovaSpark", - "id": "Epiculous/NovaSpark", - "developer": "Epiculous", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Epiculous/Violet_Twilight-v0.2/fa0290e0-723f-4502-90b6-c77007fffc1f.json b/data/hfopenllm_v2/Epiculous/Violet_Twilight-v0.2/fa0290e0-723f-4502-90b6-c77007fffc1f.json deleted file mode 100644 index c08d3f9d1..000000000 --- a/data/hfopenllm_v2/Epiculous/Violet_Twilight-v0.2/fa0290e0-723f-4502-90b6-c77007fffc1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/Epiculous_Violet_Twilight-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Violet_Twilight-v0.2", - "id": "Epiculous/Violet_Twilight-v0.2", - "developer": "Epiculous", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4615 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4299 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Alpaca-Llama3.1-8B/c3827ecd-d02a-4464-a098-110f4fb54516.json b/data/hfopenllm_v2/EpistemeAI/Alpaca-Llama3.1-8B/c3827ecd-d02a-4464-a098-110f4fb54516.json deleted file mode 100644 index 5e88a9d80..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Alpaca-Llama3.1-8B/c3827ecd-d02a-4464-a098-110f4fb54516.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Alpaca-Llama3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Alpaca-Llama3.1-8B", - "id": "EpistemeAI/Alpaca-Llama3.1-8B", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4755 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3403 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3246 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it-Philos/af9700fe-20c0-4b7c-9f3a-c4d78fab7911.json b/data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it-Philos/af9700fe-20c0-4b7c-9f3a-c4d78fab7911.json deleted file mode 100644 index 109e4335c..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it-Philos/af9700fe-20c0-4b7c-9f3a-c4d78fab7911.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Athena-gemma-2-2b-it-Philos/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Athena-gemma-2-2b-it-Philos", - "id": "EpistemeAI/Athena-gemma-2-2b-it-Philos", - "developer": 
"EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3795 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2248 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it/959a4e4d-211c-4e45-94f1-f8f877e0b36f.json b/data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it/959a4e4d-211c-4e45-94f1-f8f877e0b36f.json deleted file mode 100644 index 6c1214fb7..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it/959a4e4d-211c-4e45-94f1-f8f877e0b36f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Athena-gemma-2-2b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Athena-gemma-2-2b-it", - "id": "EpistemeAI/Athena-gemma-2-2b-it", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3134 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2422 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Athene-codegemma-2-7b-it-alpaca-v1.3/96a8b3c0-d6bc-41fe-8967-0d798669aa8e.json b/data/hfopenllm_v2/EpistemeAI/Athene-codegemma-2-7b-it-alpaca-v1.3/96a8b3c0-d6bc-41fe-8967-0d798669aa8e.json deleted file mode 100644 index 18bd5723e..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Athene-codegemma-2-7b-it-alpaca-v1.3/96a8b3c0-d6bc-41fe-8967-0d798669aa8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Athene-codegemma-2-7b-it-alpaca-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Athene-codegemma-2-7b-it-alpaca-v1.3", - "id": "EpistemeAI/Athene-codegemma-2-7b-it-alpaca-v1.3", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4503 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/DeepPhi-3.5-mini-instruct/ed5d2ca8-d551-493d-8877-348204ef91cc.json b/data/hfopenllm_v2/EpistemeAI/DeepPhi-3.5-mini-instruct/ed5d2ca8-d551-493d-8877-348204ef91cc.json deleted file mode 100644 index 378a472d1..000000000 --- a/data/hfopenllm_v2/EpistemeAI/DeepPhi-3.5-mini-instruct/ed5d2ca8-d551-493d-8877-348204ef91cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_DeepPhi-3.5-mini-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepPhi-3.5-mini-instruct", - "id": "EpistemeAI/DeepPhi-3.5-mini-instruct", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1326 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2882 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2332 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/DeepThinkers-Phi4/04e20a14-8346-4801-8515-189861c857cb.json b/data/hfopenllm_v2/EpistemeAI/DeepThinkers-Phi4/04e20a14-8346-4801-8515-189861c857cb.json deleted file mode 100644 index f41ea35a4..000000000 --- a/data/hfopenllm_v2/EpistemeAI/DeepThinkers-Phi4/04e20a14-8346-4801-8515-189861c857cb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_DeepThinkers-Phi4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepThinkers-Phi4", - "id": "EpistemeAI/DeepThinkers-Phi4", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5258 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/FineLlama3.1-8B-Instruct/eec2da56-ba0a-418f-afe1-8a46882b9839.json b/data/hfopenllm_v2/EpistemeAI/FineLlama3.1-8B-Instruct/eec2da56-ba0a-418f-afe1-8a46882b9839.json deleted file mode 100644 index 7a0548790..000000000 --- a/data/hfopenllm_v2/EpistemeAI/FineLlama3.1-8B-Instruct/eec2da56-ba0a-418f-afe1-8a46882b9839.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_FineLlama3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineLlama3.1-8B-Instruct", - "id": "EpistemeAI/FineLlama3.1-8B-Instruct", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "4bit", - "architecture": "?", - "params_billions": 14.483 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.08 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4557 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-12B-v1.13a-philosophers/321cf68b-9220-4ada-89da-061341a20a9d.json b/data/hfopenllm_v2/EpistemeAI/Fireball-12B-v1.13a-philosophers/321cf68b-9220-4ada-89da-061341a20a9d.json deleted file mode 100644 index 2dc46794a..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-12B-v1.13a-philosophers/321cf68b-9220-4ada-89da-061341a20a9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-12B-v1.13a-philosophers/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-12B-v1.13a-philosophers", - "id": "EpistemeAI/Fireball-12B-v1.13a-philosophers", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5103 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-12B/86fda025-2345-4a40-9094-223b96b21f13.json b/data/hfopenllm_v2/EpistemeAI/Fireball-12B/86fda025-2345-4a40-9094-223b96b21f13.json deleted file mode 100644 index 0d7c7fd33..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-12B/86fda025-2345-4a40-9094-223b96b21f13.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-12B", - "id": "EpistemeAI/Fireball-12B", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1834 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3344 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200/3c734233-9868-4ba6-83c0-2b63f2ce8980.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200/3c734233-9868-4ba6-83c0-2b63f2ce8980.json deleted file mode 100644 index 185221b63..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200/3c734233-9868-4ba6-83c0-2b63f2ce8980.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200", - "id": "EpistemeAI/Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4577 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4838 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3945 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3583 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta/7f5eca48-0ab9-4ef2-85c2-a7f1fe713afe.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta/7f5eca48-0ab9-4ef2-85c2-a7f1fe713afe.json deleted file mode 100644 index a5c0fc181..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta/7f5eca48-0ab9-4ef2-85c2-a7f1fe713afe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta", - "id": "EpistemeAI/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7274 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1526 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2/f5e0e809-08b8-43dd-a44d-875f365610c3.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2/f5e0e809-08b8-43dd-a44d-875f365610c3.json deleted file mode 100644 index 5554b76ed..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2/f5e0e809-08b8-43dd-a44d-875f365610c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2", - "id": "EpistemeAI/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4932 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto/8d267135-a7e6-4ec5-ae09-66478804bb66.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto/8d267135-a7e6-4ec5-ae09-66478804bb66.json deleted file mode 100644 index e11cd048a..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto/8d267135-a7e6-4ec5-ae09-66478804bb66.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4824 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4066 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/4940ed0e-2c1e-4408-9806-49ceed30a69e.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/4940ed0e-2c1e-4408-9806-49ceed30a69e.json deleted file mode 100644 index afd2c84cc..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/4940ed0e-2c1e-4408-9806-49ceed30a69e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.348 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/5f6f7b7c-ef6a-4468-aae5-d7dfc25c5659.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/5f6f7b7c-ef6a-4468-aae5-d7dfc25c5659.json deleted file mode 100644 index df9550996..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/5f6f7b7c-ef6a-4468-aae5-d7dfc25c5659.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { 
- "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds/5244ee3c-7d65-434a-acfe-cdb277ff5264.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds/5244ee3c-7d65-434a-acfe-cdb277ff5264.json deleted file mode 100644 index 1b365ab87..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds/5244ee3c-7d65-434a-acfe-cdb277ff5264.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6691 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4668 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - 
{ - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code/eba4644f-d455-4a23-a16f-8ecb038ffe7f.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code/eba4644f-d455-4a23-a16f-8ecb038ffe7f.json deleted file mode 100644 index 647383065..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code/eba4644f-d455-4a23-a16f-8ecb038ffe7f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5975 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4904 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K/fb270319-7010-4946-b60c-409aebe41aaa.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K/fb270319-7010-4946-b60c-409aebe41aaa.json deleted file mode 100644 index 19697de71..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K/fb270319-7010-4946-b60c-409aebe41aaa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4897 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT/d57bd77a-11cc-497c-b0bb-31c1ffa63dc2.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT/d57bd77a-11cc-497c-b0bb-31c1ffa63dc2.json deleted file mode 100644 index b052f7972..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT/d57bd77a-11cc-497c-b0bb-31c1ffa63dc2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4578 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4761 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1382 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3471 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto/0220984e-fe8c-4e72-bc3e-92b949ffe769.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto/0220984e-fe8c-4e72-bc3e-92b949ffe769.json deleted file mode 100644 index f8d75adfa..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto/0220984e-fe8c-4e72-bc3e-92b949ffe769.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7205 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4818 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - 
{ - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3548 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Math/16482634-ec03-463a-9deb-2230ee955800.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Math/16482634-ec03-463a-9deb-2230ee955800.json deleted file mode 100644 index af0d2a1e2..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Math/16482634-ec03-463a-9deb-2230ee955800.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Math", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Math", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4623 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO/4c1db32d-96fc-4a66-b083-530a3e75ad6d.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO/4c1db32d-96fc-4a66-b083-530a3e75ad6d.json deleted file mode 100644 index a7e78720e..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO/4c1db32d-96fc-4a66-b083-530a3e75ad6d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO", - "id": "EpistemeAI/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4801 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1254 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3521 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Mistral-Nemo-Base-2407-v1-DPO2/c0c5c846-395a-47ac-9e8e-e598939f317d.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Mistral-Nemo-Base-2407-v1-DPO2/c0c5c846-395a-47ac-9e8e-e598939f317d.json deleted file mode 100644 index d0ad3364c..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Mistral-Nemo-Base-2407-v1-DPO2/c0c5c846-395a-47ac-9e8e-e598939f317d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Mistral-Nemo-Base-2407-v1-DPO2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Mistral-Nemo-Base-2407-v1-DPO2", - "id": "EpistemeAI/Fireball-Mistral-Nemo-Base-2407-v1-DPO2", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1861 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.404 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B-Medical-COT/6b3f6b59-a8eb-48c2-acbc-92e8f34b2dd6.json b/data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B-Medical-COT/6b3f6b59-a8eb-48c2-acbc-92e8f34b2dd6.json deleted file mode 100644 index 4e38880fc..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B-Medical-COT/6b3f6b59-a8eb-48c2-acbc-92e8f34b2dd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-R1-Llama-3.1-8B-Medical-COT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-R1-Llama-3.1-8B-Medical-COT", - "id": "EpistemeAI/Fireball-R1-Llama-3.1-8B-Medical-COT", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3216 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3114 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B/d017e3bf-2abe-4b84-810e-e0eaf973adc3.json b/data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B/d017e3bf-2abe-4b84-810e-e0eaf973adc3.json deleted file mode 100644 index 694f4c5d2..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B/d017e3bf-2abe-4b84-810e-e0eaf973adc3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-R1-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-R1-Llama-3.1-8B", - "id": "EpistemeAI/Fireball-R1-Llama-3.1-8B", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4427 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-R1.1-Llama-3.1-8B/62a3ecb8-f6d1-429c-807f-5545b2a5897f.json b/data/hfopenllm_v2/EpistemeAI/Fireball-R1.1-Llama-3.1-8B/62a3ecb8-f6d1-429c-807f-5545b2a5897f.json deleted file mode 100644 index 81219a0cf..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-R1.1-Llama-3.1-8B/62a3ecb8-f6d1-429c-807f-5545b2a5897f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-R1.1-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-R1.1-Llama-3.1-8B", - "id": "EpistemeAI/Fireball-R1.1-Llama-3.1-8B", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3326 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1382 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3419 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1115 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/EpistemeAI/Llama-3.2-3B-Agent007-Coder/748557ce-1a49-4b3a-9c38-9007dc04aafb.json b/data/hfopenllm_v2/EpistemeAI/Llama-3.2-3B-Agent007-Coder/748557ce-1a49-4b3a-9c38-9007dc04aafb.json deleted file mode 100644 index c5b57ce3d..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Llama-3.2-3B-Agent007-Coder/748557ce-1a49-4b3a-9c38-9007dc04aafb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Llama-3.2-3B-Agent007-Coder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Agent007-Coder", - "id": "EpistemeAI/Llama-3.2-3B-Agent007-Coder", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4304 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3668 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Mistral-Nemo-Instruct-12B-Philosophy-Math/95d43d01-a75e-4af4-a2cc-b60f832071d3.json 
b/data/hfopenllm_v2/EpistemeAI/Mistral-Nemo-Instruct-12B-Philosophy-Math/95d43d01-a75e-4af4-a2cc-b60f832071d3.json deleted file mode 100644 index 8cad61e00..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Mistral-Nemo-Instruct-12B-Philosophy-Math/95d43d01-a75e-4af4-a2cc-b60f832071d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Mistral-Nemo-Instruct-12B-Philosophy-Math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Instruct-12B-Philosophy-Math", - "id": "EpistemeAI/Mistral-Nemo-Instruct-12B-Philosophy-Math", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4292 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3296 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/OpenReasoner-Llama-3.2-3B-rs1.0/4dc7c889-7839-4047-b48c-33be5b688e72.json b/data/hfopenllm_v2/EpistemeAI/OpenReasoner-Llama-3.2-3B-rs1.0/4dc7c889-7839-4047-b48c-33be5b688e72.json deleted file 
mode 100644 index 62e0df3f5..000000000 --- a/data/hfopenllm_v2/EpistemeAI/OpenReasoner-Llama-3.2-3B-rs1.0/4dc7c889-7839-4047-b48c-33be5b688e72.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_OpenReasoner-Llama-3.2-3B-rs1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenReasoner-Llama-3.2-3B-rs1.0", - "id": "EpistemeAI/OpenReasoner-Llama-3.2-3B-rs1.0", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7274 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy/751851c8-9a7f-4135-a106-eab4efbd0734.json b/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy/751851c8-9a7f-4135-a106-eab4efbd0734.json deleted file mode 100644 index afaf74d49..000000000 --- 
a/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy/751851c8-9a7f-4135-a106-eab4efbd0734.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy", - "id": "EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7101 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4628 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic/2930e30c-9f2e-4248-ae3b-ed7ffbd12f8c.json 
b/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic/2930e30c-9f2e-4248-ae3b-ed7ffbd12f8c.json deleted file mode 100644 index 9a4c900d1..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic/2930e30c-9f2e-4248-ae3b-ed7ffbd12f8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic", - "id": "EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7122 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4566 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.335 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent/c1acc460-aeb8-4a99-8ca5-376ab60fb74a.json b/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent/c1acc460-aeb8-4a99-8ca5-376ab60fb74a.json deleted file mode 100644 index f3d4c197b..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent/c1acc460-aeb8-4a99-8ca5-376ab60fb74a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent", - "id": "EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6915 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO/33b8b64f-7da5-45aa-bf80-7145ef704229.json b/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO/33b8b64f-7da5-45aa-bf80-7145ef704229.json deleted file mode 100644 index 40ecf34e1..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO/33b8b64f-7da5-45aa-bf80-7145ef704229.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO", - "id": "EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4553 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4804 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3931 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3598 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT/2662d257-49e2-430d-b44f-b0b347c61271.json b/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT/2662d257-49e2-430d-b44f-b0b347c61271.json deleted file mode 100644 index c9e3844b1..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT/2662d257-49e2-430d-b44f-b0b347c61271.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Reasoning-Llama-3.1-CoT-RE1-NMT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-Llama-3.1-CoT-RE1-NMT", - "id": "EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4829 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.2/870b639b-ee7a-4b13-872b-52657539c836.json 
b/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.2/870b639b-ee7a-4b13-872b-52657539c836.json deleted file mode 100644 index 2e9d69e3c..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.2/870b639b-ee7a-4b13-872b-52657539c836.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Reasoning-Llama-3.2-1B-Instruct-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-Llama-3.2-1B-Instruct-v1.2", - "id": "EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.2", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3324 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.3/6ff20678-a335-4fa8-8126-9f96ce247f34.json b/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.3/6ff20678-a335-4fa8-8126-9f96ce247f34.json deleted file mode 100644 index 
a23d5fb0b..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.3/6ff20678-a335-4fa8-8126-9f96ce247f34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Reasoning-Llama-3.2-1B-Instruct-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-Llama-3.2-1B-Instruct-v1.3", - "id": "EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.3", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO/19c4ea89-896a-4577-a386-c2470eaf743f.json b/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO/19c4ea89-896a-4577-a386-c2470eaf743f.json deleted file mode 100644 index 3b21e4f59..000000000 --- 
a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO/19c4ea89-896a-4577-a386-c2470eaf743f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO", - "id": "EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4518 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1/22eb2479-16ff-4a56-b9e4-e8835da7ca0e.json b/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1/22eb2479-16ff-4a56-b9e4-e8835da7ca0e.json deleted file mode 100644 index ceb6ab2db..000000000 --- 
a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1/22eb2479-16ff-4a56-b9e4-e8835da7ca0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Reasoning-Llama-3.2-3B-Math-Instruct-RE1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-Llama-3.2-3B-Math-Instruct-RE1", - "id": "EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2789 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math/aca3f1fd-9c46-47f6-81c6-dc56a702c1de.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math/aca3f1fd-9c46-47f6-81c6-dc56a702c1de.json deleted file mode 100644 index 3c0a3ed67..000000000 --- 
a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math/aca3f1fd-9c46-47f6-81c6-dc56a702c1de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math", - "id": "EpistemeAI/ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5903 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4364 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2823 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-0/071ca686-5950-4af4-80f2-969b1008e370.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-0/071ca686-5950-4af4-80f2-969b1008e370.json deleted file mode 100644 index da61c8f6c..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-0/071ca686-5950-4af4-80f2-969b1008e370.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-0", - "id": "EpistemeAI/ReasoningCore-3B-0", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7341 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-Instruct-r01-Reflect/78977c34-33f8-4037-86e0-dfce1d01c3f8.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-Instruct-r01-Reflect/78977c34-33f8-4037-86e0-dfce1d01c3f8.json deleted file mode 100644 index 99bcceb57..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-Instruct-r01-Reflect/78977c34-33f8-4037-86e0-dfce1d01c3f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-Instruct-r01-Reflect/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-Instruct-r01-Reflect", - "id": "EpistemeAI/ReasoningCore-3B-Instruct-r01-Reflect", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7335 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-R01/480e4294-c8d9-4088-9b8c-7a239d57f683.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-R01/480e4294-c8d9-4088-9b8c-7a239d57f683.json deleted file mode 100644 index 78ed2cc8e..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-R01/480e4294-c8d9-4088-9b8c-7a239d57f683.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-R01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "ReasoningCore-3B-R01", - "id": "EpistemeAI/ReasoningCore-3B-R01", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2976 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2591 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2/be9b21e8-90ce-451a-bcaf-2ebc7c72bc34.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2/be9b21e8-90ce-451a-bcaf-2ebc7c72bc34.json deleted file mode 100644 index a387e5176..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2/be9b21e8-90ce-451a-bcaf-2ebc7c72bc34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-RE1-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-RE1-V2", - "id": "EpistemeAI/ReasoningCore-3B-RE1-V2", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2A/b0054dd8-e62c-4d0c-9b18-090851c3a7e2.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2A/b0054dd8-e62c-4d0c-9b18-090851c3a7e2.json deleted file mode 100644 index 6f4873443..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2A/b0054dd8-e62c-4d0c-9b18-090851c3a7e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-RE1-V2A/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-RE1-V2A", - "id": "EpistemeAI/ReasoningCore-3B-RE1-V2A", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5733 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2736 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2B/985e479b-658a-4548-9b5e-c9c04b8838c1.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2B/985e479b-658a-4548-9b5e-c9c04b8838c1.json deleted file mode 100644 index 21879ed27..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2B/985e479b-658a-4548-9b5e-c9c04b8838c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-RE1-V2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-RE1-V2B", - "id": "EpistemeAI/ReasoningCore-3B-RE1-V2B", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.5051 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4168 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2673 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2C/d0ef8af4-156d-456d-9e33-b2cdb3f8c04e.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2C/d0ef8af4-156d-456d-9e33-b2cdb3f8c04e.json deleted file mode 100644 index abce8ee39..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2C/d0ef8af4-156d-456d-9e33-b2cdb3f8c04e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-RE1-V2C/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-RE1-V2C", - "id": "EpistemeAI/ReasoningCore-3B-RE1-V2C", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5057 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2691 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1-V1/5050c787-2f95-4a17-a4b0-c094860627b5.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1-V1/5050c787-2f95-4a17-a4b0-c094860627b5.json deleted file mode 100644 index e1a337bdd..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1-V1/5050c787-2f95-4a17-a4b0-c094860627b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-T1-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-T1-V1", - "id": "EpistemeAI/ReasoningCore-3B-T1-V1", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.312 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1_1/bb5c8274-4324-47f2-94c5-d0c831ce0de7.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1_1/bb5c8274-4324-47f2-94c5-d0c831ce0de7.json deleted file mode 100644 index a3f8f7216..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1_1/bb5c8274-4324-47f2-94c5-d0c831ce0de7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-T1_1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-T1_1", - "id": "EpistemeAI/ReasoningCore-3B-T1_1", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 
5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Athene-codegemma-2-7b-it-alpaca-v1.2/8113a26a-5941-4f3d-872a-bdde5456ad97.json b/data/hfopenllm_v2/EpistemeAI2/Athene-codegemma-2-7b-it-alpaca-v1.2/8113a26a-5941-4f3d-872a-bdde5456ad97.json deleted file mode 100644 index 18f691dc4..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Athene-codegemma-2-7b-it-alpaca-v1.2/8113a26a-5941-4f3d-872a-bdde5456ad97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Athene-codegemma-2-7b-it-alpaca-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Athene-codegemma-2-7b-it-alpaca-v1.2", - "id": "EpistemeAI2/Athene-codegemma-2-7b-it-alpaca-v1.2", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4175 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2297 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-12B-v1.2/5b60047b-2e85-4a47-a31f-4c07f4bd2c30.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-12B-v1.2/5b60047b-2e85-4a47-a31f-4c07f4bd2c30.json deleted file mode 100644 index 8a14100ea..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-12B-v1.2/5b60047b-2e85-4a47-a31f-4c07f4bd2c30.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-12B-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-12B-v1.2", - "id": "EpistemeAI2/Fireball-12B-v1.2", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1355 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5019 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3337 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1-8B-Philos/88d79858-3a35-43eb-8da6-95b80b5deef6.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1-8B-Philos/88d79858-3a35-43eb-8da6-95b80b5deef6.json deleted file mode 100644 index 1fef1a4b8..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1-8B-Philos/88d79858-3a35-43eb-8da6-95b80b5deef6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1-8B-Philos/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1-8B-Philos", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1-8B-Philos", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, 
- { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.01-8B-Philos/63266a49-01ea-40f1-83ef-778f391aff2b.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.01-8B-Philos/63266a49-01ea-40f1-83ef-778f391aff2b.json deleted file mode 100644 index 81024bca2..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.01-8B-Philos/63266a49-01ea-40f1-83ef-778f391aff2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1.01-8B-Philos/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.01-8B-Philos", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.01-8B-Philos", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4956 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.03-8B-Philos/f0da069a-833f-489a-a923-c79542a3a9a6.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.03-8B-Philos/f0da069a-833f-489a-a923-c79542a3a9a6.json deleted file mode 100644 index b979e6ad4..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.03-8B-Philos/f0da069a-833f-489a-a923-c79542a3a9a6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1.03-8B-Philos/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.03-8B-Philos", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.03-8B-Philos", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4951 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.04-8B-Philos/205b9da8-d561-41ec-946e-1d2f9a43e437.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.04-8B-Philos/205b9da8-d561-41ec-946e-1d2f9a43e437.json deleted file mode 100644 index d4d82fcc6..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.04-8B-Philos/205b9da8-d561-41ec-946e-1d2f9a43e437.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1.04-8B-Philos/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.04-8B-Philos", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.04-8B-Philos", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3403 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo/2ea4da56-4b95-4222-a4e2-f57c73e0ee4e.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo/2ea4da56-4b95-4222-a4e2-f57c73e0ee4e.json deleted file mode 100644 index e5118311f..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo/2ea4da56-4b95-4222-a4e2-f57c73e0ee4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4866 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4881 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3932 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math/c086f693-cef1-4212-9c17-669b210f4caa.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math/c086f693-cef1-4212-9c17-669b210f4caa.json deleted file mode 100644 index a1b7ae3fb..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math/c086f693-cef1-4212-9c17-669b210f4caa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1.07-8B-Philos-Math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.07-8B-Philos-Math", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5079 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4063 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3531 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection/290995f2-9982-4f29-ac74-dc646905206c.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection/290995f2-9982-4f29-ac74-dc646905206c.json deleted file mode 100644 index 83634a0a2..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection/290995f2-9982-4f29-ac74-dc646905206c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3952 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1/c60e65e6-d771-4c53-80d0-c1e09aa39377.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1/c60e65e6-d771-4c53-80d0-c1e09aa39377.json deleted file mode 100644 index caf84f694..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1/c60e65e6-d771-4c53-80d0-c1e09aa39377.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4828 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3523 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Llama-3.1-8B-Philos-Reflection/fcff202d-3b4f-4ba9-b3f6-1122d8abcac1.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Llama-3.1-8B-Philos-Reflection/fcff202d-3b4f-4ba9-b3f6-1122d8abcac1.json deleted file mode 100644 index cbd4f8177..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Llama-3.1-8B-Philos-Reflection/fcff202d-3b4f-4ba9-b3f6-1122d8abcac1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Llama-3.1-8B-Philos-Reflection/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Llama-3.1-8B-Philos-Reflection", - "id": "EpistemeAI2/Fireball-Llama-3.1-8B-Philos-Reflection", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4898 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3957 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/EpistemeAI2/Fireball-MathMistral-Nemo-Base-2407-v2dpo/5f0fa37a-e829-402b-b2ab-c68ffa248b6e.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-MathMistral-Nemo-Base-2407-v2dpo/5f0fa37a-e829-402b-b2ab-c68ffa248b6e.json deleted file mode 100644 index a3f2cd9b8..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-MathMistral-Nemo-Base-2407-v2dpo/5f0fa37a-e829-402b-b2ab-c68ffa248b6e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-MathMistral-Nemo-Base-2407-v2dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-MathMistral-Nemo-Base-2407-v2dpo", - "id": "EpistemeAI2/Fireball-MathMistral-Nemo-Base-2407-v2dpo", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.58 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3097 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math/a0b4a345-3530-4da2-8403-87259bbd1405.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math/a0b4a345-3530-4da2-8403-87259bbd1405.json deleted file mode 100644 index fe44332dd..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math/a0b4a345-3530-4da2-8403-87259bbd1405.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math", - "id": "EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342 - } - } - ] -} \ No newline at end 
of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT/3548f0ea-f3ab-4a0e-9c77-5ae62014ed44.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT/3548f0ea-f3ab-4a0e-9c77-5ae62014ed44.json deleted file mode 100644 index 0ddafa271..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT/3548f0ea-f3ab-4a0e-9c77-5ae62014ed44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT", - "id": "EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4633 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4791 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3774 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Phi-3-medium-4k-inst-Philos/707270e3-334b-4eba-84c0-2795ae53d79a.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Phi-3-medium-4k-inst-Philos/707270e3-334b-4eba-84c0-2795ae53d79a.json deleted file mode 100644 index 6b5e01456..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Phi-3-medium-4k-inst-Philos/707270e3-334b-4eba-84c0-2795ae53d79a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Phi-3-medium-4k-inst-Philos/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Phi-3-medium-4k-inst-Philos", - "id": "EpistemeAI2/Fireball-Phi-3-medium-4k-inst-Philos", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5313 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Eric111/CatunaMayo-DPO/c827bee3-a181-42bc-9387-ca132d59c8ba.json 
b/data/hfopenllm_v2/Eric111/CatunaMayo-DPO/c827bee3-a181-42bc-9387-ca132d59c8ba.json deleted file mode 100644 index f742caff8..000000000 --- a/data/hfopenllm_v2/Eric111/CatunaMayo-DPO/c827bee3-a181-42bc-9387-ca132d59c8ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Eric111_CatunaMayo-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CatunaMayo-DPO", - "id": "Eric111/CatunaMayo-DPO", - "developer": "Eric111", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4215 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.317 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Eric111/CatunaMayo/d3e8949b-f6f8-459f-891b-f4900ff806cd.json b/data/hfopenllm_v2/Eric111/CatunaMayo/d3e8949b-f6f8-459f-891b-f4900ff806cd.json deleted file mode 100644 index 5cbeab7ab..000000000 --- a/data/hfopenllm_v2/Eric111/CatunaMayo/d3e8949b-f6f8-459f-891b-f4900ff806cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/Eric111_CatunaMayo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CatunaMayo", - "id": "Eric111/CatunaMayo", - "developer": "Eric111", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4074 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5244 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.454 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3178 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties-v2/35d5f5e3-74eb-4eea-9f78-b7b8969830a2.json b/data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties-v2/35d5f5e3-74eb-4eea-9f78-b7b8969830a2.json deleted file mode 100644 index fb25872ce..000000000 --- a/data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties-v2/35d5f5e3-74eb-4eea-9f78-b7b8969830a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Chocolatine-3B-Instruct-DPO-Revised-Ties-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-3B-Instruct-DPO-Revised-Ties-v2", - "id": "Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties-v2", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties/4cf4479a-622a-4bc2-86f2-aa526216f24c.json b/data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties/4cf4479a-622a-4bc2-86f2-aa526216f24c.json deleted file mode 100644 index 34efe21cd..000000000 --- a/data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties/4cf4479a-622a-4bc2-86f2-aa526216f24c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Chocolatine-3B-Instruct-DPO-Revised-Ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Chocolatine-3B-Instruct-DPO-Revised-Ties", - "id": "Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b-Ties/6ed27890-3e61-4c7d-8c94-a78c0b34ba32.json b/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b-Ties/6ed27890-3e61-4c7d-8c94-a78c0b34ba32.json deleted file mode 100644 index f9d679b28..000000000 --- a/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b-Ties/6ed27890-3e61-4c7d-8c94-a78c0b34ba32.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Herplete-LLM-Llama-3.1-8b-Ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Herplete-LLM-Llama-3.1-8b-Ties", - "id": "Etherll/Herplete-LLM-Llama-3.1-8b-Ties", - "developer": "Etherll", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6164 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5338 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/87b5e360-7867-4edd-b45e-e7bb92a91b69.json b/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/87b5e360-7867-4edd-b45e-e7bb92a91b69.json deleted file mode 100644 index c264e2240..000000000 --- a/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/87b5e360-7867-4edd-b45e-e7bb92a91b69.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Herplete-LLM-Llama-3.1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Herplete-LLM-Llama-3.1-8b", - "id": "Etherll/Herplete-LLM-Llama-3.1-8b", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5013 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/d93116b8-28ff-41ea-8273-56f7ae11cf18.json b/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/d93116b8-28ff-41ea-8273-56f7ae11cf18.json deleted file mode 100644 index e86363c53..000000000 --- a/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/d93116b8-28ff-41ea-8273-56f7ae11cf18.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Herplete-LLM-Llama-3.1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Herplete-LLM-Llama-3.1-8b", - "id": "Etherll/Herplete-LLM-Llama-3.1-8b", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.6106 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5347 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Qwen2.5-7B-della-test/ba8c2c17-64f6-4cdb-b3b9-8977ce1bdbe2.json b/data/hfopenllm_v2/Etherll/Qwen2.5-7B-della-test/ba8c2c17-64f6-4cdb-b3b9-8977ce1bdbe2.json deleted file mode 100644 index dffbcaa3e..000000000 --- a/data/hfopenllm_v2/Etherll/Qwen2.5-7B-della-test/ba8c2c17-64f6-4cdb-b3b9-8977ce1bdbe2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Qwen2.5-7B-della-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-della-test", - "id": "Etherll/Qwen2.5-7B-della-test", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7625 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5447 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4894 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4047 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Qwen2.5-Coder-7B-Instruct-Ties/5e5602cc-b4de-4247-aa6d-940817fc849b.json b/data/hfopenllm_v2/Etherll/Qwen2.5-Coder-7B-Instruct-Ties/5e5602cc-b4de-4247-aa6d-940817fc849b.json deleted file mode 100644 index 89117f4d7..000000000 --- a/data/hfopenllm_v2/Etherll/Qwen2.5-Coder-7B-Instruct-Ties/5e5602cc-b4de-4247-aa6d-940817fc849b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Qwen2.5-Coder-7B-Instruct-Ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-7B-Instruct-Ties", - "id": "Etherll/Qwen2.5-Coder-7B-Instruct-Ties", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5005 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4895 - } - 
}, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2915 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3503 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Replete-LLM-V3-Llama-3.1-8b/cc5f27f5-36d8-49bb-9c9d-7879598bfe71.json b/data/hfopenllm_v2/Etherll/Replete-LLM-V3-Llama-3.1-8b/cc5f27f5-36d8-49bb-9c9d-7879598bfe71.json deleted file mode 100644 index 66e068f9c..000000000 --- a/data/hfopenllm_v2/Etherll/Replete-LLM-V3-Llama-3.1-8b/cc5f27f5-36d8-49bb-9c9d-7879598bfe71.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Replete-LLM-V3-Llama-3.1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-LLM-V3-Llama-3.1-8b", - "id": "Etherll/Replete-LLM-V3-Llama-3.1-8b", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4543 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.347 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/SuperHermes/aec03bd9-808a-4c3f-bbde-40bcac5775fb.json b/data/hfopenllm_v2/Etherll/SuperHermes/aec03bd9-808a-4c3f-bbde-40bcac5775fb.json deleted file mode 100644 index 688d993d6..000000000 --- a/data/hfopenllm_v2/Etherll/SuperHermes/aec03bd9-808a-4c3f-bbde-40bcac5775fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_SuperHermes/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SuperHermes", - "id": "Etherll/SuperHermes", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5459 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1654 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3949 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Eurdem/Defne-llama3.1-8B/b4ae6f0b-8a6b-4c60-8eb2-3e202877bcf5.json b/data/hfopenllm_v2/Eurdem/Defne-llama3.1-8B/b4ae6f0b-8a6b-4c60-8eb2-3e202877bcf5.json deleted file mode 100644 index 39493bbf0..000000000 --- a/data/hfopenllm_v2/Eurdem/Defne-llama3.1-8B/b4ae6f0b-8a6b-4c60-8eb2-3e202877bcf5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Eurdem_Defne-llama3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Defne-llama3.1-8B", - "id": "Eurdem/Defne-llama3.1-8B", - "developer": "Eurdem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5036 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5321 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4331 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FINGU-AI/Chocolatine-Fusion-14B/c68deb4d-73a8-40ab-b4e5-1773b7ec4ed8.json b/data/hfopenllm_v2/FINGU-AI/Chocolatine-Fusion-14B/c68deb4d-73a8-40ab-b4e5-1773b7ec4ed8.json deleted file mode 100644 index 35a5b2bfe..000000000 --- a/data/hfopenllm_v2/FINGU-AI/Chocolatine-Fusion-14B/c68deb4d-73a8-40ab-b4e5-1773b7ec4ed8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FINGU-AI_Chocolatine-Fusion-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-Fusion-14B", - "id": "FINGU-AI/Chocolatine-Fusion-14B", - "developer": "FINGU-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 8.367 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6413 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FINGU-AI/L3-8B/a93c5674-599b-429c-a322-3c6bc7248f45.json b/data/hfopenllm_v2/FINGU-AI/L3-8B/a93c5674-599b-429c-a322-3c6bc7248f45.json deleted file mode 100644 index c9e5228a0..000000000 --- a/data/hfopenllm_v2/FINGU-AI/L3-8B/a93c5674-599b-429c-a322-3c6bc7248f45.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FINGU-AI_L3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B", - "id": "FINGU-AI/L3-8B", - "developer": "FINGU-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7517 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4986 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2545 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3828 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3639 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FINGU-AI/Phi-4-RRStock/5e6374a6-56bd-4bd9-b04b-30ec9cf234bc.json b/data/hfopenllm_v2/FINGU-AI/Phi-4-RRStock/5e6374a6-56bd-4bd9-b04b-30ec9cf234bc.json deleted file mode 100644 index e93edbe24..000000000 --- a/data/hfopenllm_v2/FINGU-AI/Phi-4-RRStock/5e6374a6-56bd-4bd9-b04b-30ec9cf234bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FINGU-AI_Phi-4-RRStock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-RRStock", - "id": "FINGU-AI/Phi-4-RRStock", - "developer": "FINGU-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.652 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2855 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6443 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4883 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FINGU-AI/Q-Small-3B/c3d2fc86-a5c4-4e92-bcf9-26096ca32ad4.json 
b/data/hfopenllm_v2/FINGU-AI/Q-Small-3B/c3d2fc86-a5c4-4e92-bcf9-26096ca32ad4.json deleted file mode 100644 index 61898827c..000000000 --- a/data/hfopenllm_v2/FINGU-AI/Q-Small-3B/c3d2fc86-a5c4-4e92-bcf9-26096ca32ad4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FINGU-AI_Q-Small-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q-Small-3B", - "id": "FINGU-AI/Q-Small-3B", - "developer": "FINGU-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4005 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.279 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FINGU-AI/QwQ-Buddy-32B-Alpha/1b49cb06-3ee1-4945-aaed-12c868d9e45e.json b/data/hfopenllm_v2/FINGU-AI/QwQ-Buddy-32B-Alpha/1b49cb06-3ee1-4945-aaed-12c868d9e45e.json deleted file mode 100644 index a93048eac..000000000 --- a/data/hfopenllm_v2/FINGU-AI/QwQ-Buddy-32B-Alpha/1b49cb06-3ee1-4945-aaed-12c868d9e45e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/FINGU-AI_QwQ-Buddy-32B-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-Buddy-32B-Alpha", - "id": "FINGU-AI/QwQ-Buddy-32B-Alpha", - "developer": "FINGU-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 19.662 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3446 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6424 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5294 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FINGU-AI/RomboUltima-32B/65853bb5-ff3e-4880-8c32-ce9aabcadd7b.json b/data/hfopenllm_v2/FINGU-AI/RomboUltima-32B/65853bb5-ff3e-4880-8c32-ce9aabcadd7b.json deleted file mode 100644 index 05c3a1ad9..000000000 --- a/data/hfopenllm_v2/FINGU-AI/RomboUltima-32B/65853bb5-ff3e-4880-8c32-ce9aabcadd7b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FINGU-AI_RomboUltima-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RomboUltima-32B", - "id": "FINGU-AI/RomboUltima-32B", - "developer": "FINGU-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 17.645 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6672 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6938 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4836 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5789 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FINGU-AI/Ultimos-32B/7fecc176-debf-4bf7-b3f3-479d05678a1e.json b/data/hfopenllm_v2/FINGU-AI/Ultimos-32B/7fecc176-debf-4bf7-b3f3-479d05678a1e.json deleted file mode 100644 index d9153beab..000000000 --- a/data/hfopenllm_v2/FINGU-AI/Ultimos-32B/7fecc176-debf-4bf7-b3f3-479d05678a1e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FINGU-AI_Ultimos-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ultimos-32B", - "id": "FINGU-AI/Ultimos-32B", - "developer": "FINGU-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - 
"params_billions": 9.604 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1592 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2906 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3286 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FallenMerick/Chewy-Lemon-Cookie-11B/3c965626-a264-40db-93e1-cd7659d0662e.json b/data/hfopenllm_v2/FallenMerick/Chewy-Lemon-Cookie-11B/3c965626-a264-40db-93e1-cd7659d0662e.json deleted file mode 100644 index 21887b781..000000000 --- a/data/hfopenllm_v2/FallenMerick/Chewy-Lemon-Cookie-11B/3c965626-a264-40db-93e1-cd7659d0662e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FallenMerick_Chewy-Lemon-Cookie-11B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chewy-Lemon-Cookie-11B", - "id": "FallenMerick/Chewy-Lemon-Cookie-11B", - "developer": "FallenMerick", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4875 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5251 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4546 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3267 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Felladrin/Llama-160M-Chat-v1/50fa6f0c-d689-4380-b619-253209b5badc.json b/data/hfopenllm_v2/Felladrin/Llama-160M-Chat-v1/50fa6f0c-d689-4380-b619-253209b5badc.json deleted file mode 100644 index bb5ec6c1c..000000000 --- a/data/hfopenllm_v2/Felladrin/Llama-160M-Chat-v1/50fa6f0c-d689-4380-b619-253209b5badc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Felladrin_Llama-160M-Chat-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-160M-Chat-v1", - "id": "Felladrin/Llama-160M-Chat-v1", - "developer": "Felladrin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.162 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1575 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3036 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1136 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Felladrin/Minueza-32M-UltraChat/adb25c88-6113-4307-bbf0-d377f757bc18.json b/data/hfopenllm_v2/Felladrin/Minueza-32M-UltraChat/adb25c88-6113-4307-bbf0-d377f757bc18.json deleted file mode 100644 index 43e58afa9..000000000 --- a/data/hfopenllm_v2/Felladrin/Minueza-32M-UltraChat/adb25c88-6113-4307-bbf0-d377f757bc18.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Felladrin_Minueza-32M-UltraChat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minueza-32M-UltraChat", - "id": "Felladrin/Minueza-32M-UltraChat", - "developer": "Felladrin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 0.033 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1376 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2941 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/b9ac5e03-c878-4e46-a89c-1906f3b91dce.json b/data/hfopenllm_v2/FlofloB/100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/b9ac5e03-c878-4e46-a89c-1906f3b91dce.json deleted file mode 100644 index 1fd7e8ac3..000000000 --- a/data/hfopenllm_v2/FlofloB/100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/b9ac5e03-c878-4e46-a89c-1906f3b91dce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "id": "FlofloB/100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3083 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3323 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1498 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/d6a6badf-4472-44b5-af9e-4282e4406a8e.json b/data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/d6a6badf-4472-44b5-af9e-4282e4406a8e.json deleted file mode 100644 index 648617711..000000000 --- a/data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/d6a6badf-4472-44b5-af9e-4282e4406a8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", - "id": "FlofloB/10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 16.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/92e62d3a-3091-4538-b6da-ba705e11687a.json b/data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/92e62d3a-3091-4538-b6da-ba705e11687a.json deleted file mode 100644 index 7f912b4b9..000000000 --- a/data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/92e62d3a-3091-4538-b6da-ba705e11687a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "id": "FlofloB/10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2815 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/04f5fdc6-f1cd-4b2d-947a-86fee67b3b62.json b/data/hfopenllm_v2/FlofloB/40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/04f5fdc6-f1cd-4b2d-947a-86fee67b3b62.json deleted file mode 100644 index cbbb13eef..000000000 --- a/data/hfopenllm_v2/FlofloB/40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/04f5fdc6-f1cd-4b2d-947a-86fee67b3b62.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "id": "FlofloB/40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3016 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3325 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1485 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/5013ccfc-6bc5-4862-898c-1ca781f92572.json b/data/hfopenllm_v2/FlofloB/83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/5013ccfc-6bc5-4862-898c-1ca781f92572.json deleted file mode 100644 index b055b4a00..000000000 --- a/data/hfopenllm_v2/FlofloB/83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/5013ccfc-6bc5-4862-898c-1ca781f92572.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "id": "FlofloB/83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - 
{ - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1555 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb/38fff98c-72b1-453c-a2cf-cf077dd19d10.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb/38fff98c-72b1-453c-a2cf-cf077dd19d10.json deleted file mode 100644 index fe9ff6a34..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb/38fff98c-72b1-453c-a2cf-cf077dd19d10.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1000k_fineweb/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1000k_fineweb", - "id": "FlofloB/smollm2-135M_pretrained_1000k_fineweb", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1485 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2918 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3581 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed/42911928-ef64-474b-828a-02ce3383773e.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed/42911928-ef64-474b-828a-02ce3383773e.json deleted file mode 100644 index 1a53297cc..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed/42911928-ef64-474b-828a-02ce3383773e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed", - "id": "FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1554 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_selected/7989d7d3-c5e9-43c6-80a1-6de51533f9bf.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_selected/7989d7d3-c5e9-43c6-80a1-6de51533f9bf.json deleted file mode 100644 index 36ed45fb4..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_selected/7989d7d3-c5e9-43c6-80a1-6de51533f9bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1000k_fineweb_uncovai_selected/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1000k_fineweb_uncovai_selected", - "id": "FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_selected", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1468 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2932 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb/5b9acd52-7eb6-4099-98be-ecd6cae07835.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb/5b9acd52-7eb6-4099-98be-ecd6cae07835.json deleted file mode 100644 index 0446e5349..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb/5b9acd52-7eb6-4099-98be-ecd6cae07835.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1200k_fineweb/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1200k_fineweb", - "id": "FlofloB/smollm2-135M_pretrained_1200k_fineweb", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1581 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2941 - } - }, - { 
- "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3714 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1076 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed/666bef5a-2d62-4743-bff1-07365716ab19.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed/666bef5a-2d62-4743-bff1-07365716ab19.json deleted file mode 100644 index 197afa5d3..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed/666bef5a-2d62-4743-bff1-07365716ab19.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed", - "id": "FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1578 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_selected/85de411c-2308-4824-bd6e-3327eeb6fe3e.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_selected/85de411c-2308-4824-bd6e-3327eeb6fe3e.json deleted file mode 100644 index bc92a7eb5..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_selected/85de411c-2308-4824-bd6e-3327eeb6fe3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1200k_fineweb_uncovai_selected/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1200k_fineweb_uncovai_selected", - "id": "FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_selected", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.296 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": 
"MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3567 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb/df28c4c2-d6a4-4ab0-a1ac-faf00a93de99.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb/df28c4c2-d6a4-4ab0-a1ac-faf00a93de99.json deleted file mode 100644 index 86c5be30e..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb/df28c4c2-d6a4-4ab0-a1ac-faf00a93de99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1400k_fineweb/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1400k_fineweb", - "id": "FlofloB/smollm2-135M_pretrained_1400k_fineweb", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1764 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2922 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed/6fb37ad0-b41b-4ad7-91a2-79bbb835d445.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed/6fb37ad0-b41b-4ad7-91a2-79bbb835d445.json deleted file mode 100644 index 4fd364781..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed/6fb37ad0-b41b-4ad7-91a2-79bbb835d445.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed", - "id": "FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2992 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH 
Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_selected/c41df02e-5aff-4de6-a1c4-d45b5585e29d.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_selected/c41df02e-5aff-4de6-a1c4-d45b5585e29d.json deleted file mode 100644 index c93f56f5d..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_selected/c41df02e-5aff-4de6-a1c4-d45b5585e29d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1400k_fineweb_uncovai_selected/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1400k_fineweb_uncovai_selected", - "id": "FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_selected", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1538 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2917 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3741 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed/aa587b4a-9c19-4231-ba72-9b66446460f9.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed/aa587b4a-9c19-4231-ba72-9b66446460f9.json deleted file mode 100644 index c51687eca..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed/aa587b4a-9c19-4231-ba72-9b66446460f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed", - "id": "FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_selected/be14e75e-4fb1-41aa-b168-1ec23eb305e0.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_selected/be14e75e-4fb1-41aa-b168-1ec23eb305e0.json deleted file mode 100644 index 6785da5dc..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_selected/be14e75e-4fb1-41aa-b168-1ec23eb305e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_200k_fineweb_uncovai_selected/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_200k_fineweb_uncovai_selected", - "id": "FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_selected", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1345 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2927 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb/73be4a2b-28c9-4208-8107-3734fea25008.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb/73be4a2b-28c9-4208-8107-3734fea25008.json deleted file mode 100644 index ec2adafe9..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb/73be4a2b-28c9-4208-8107-3734fea25008.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_400k_fineweb/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_400k_fineweb", - "id": "FlofloB/smollm2-135M_pretrained_400k_fineweb", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1511 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2972 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed/0bf2fa4e-3bcb-46ff-a068-f4c796123c6d.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed/0bf2fa4e-3bcb-46ff-a068-f4c796123c6d.json deleted file mode 100644 index f405d336a..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed/0bf2fa4e-3bcb-46ff-a068-f4c796123c6d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed", - "id": "FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3049 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1138 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_selected/9f8fc05a-8658-4ed3-994a-965e6882d242.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_selected/9f8fc05a-8658-4ed3-994a-965e6882d242.json deleted file mode 100644 index 2de316a22..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_selected/9f8fc05a-8658-4ed3-994a-965e6882d242.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_400k_fineweb_uncovai_selected/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_400k_fineweb_uncovai_selected", - "id": "FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_selected", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1584 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2925 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb/ced11f6e-490d-42e9-8f3e-00e22cfc2910.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb/ced11f6e-490d-42e9-8f3e-00e22cfc2910.json deleted file mode 100644 index 13f664341..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb/ced11f6e-490d-42e9-8f3e-00e22cfc2910.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_600k_fineweb/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_600k_fineweb", - "id": "FlofloB/smollm2-135M_pretrained_600k_fineweb", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed/70ba788b-fe8c-4667-a859-0fb122de22b9.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed/70ba788b-fe8c-4667-a859-0fb122de22b9.json deleted file mode 100644 index 3978e6a37..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed/70ba788b-fe8c-4667-a859-0fb122de22b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed", - "id": "FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1641 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_selected/e93f2d5f-7ffc-44b8-b2dc-d07b73de44ab.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_selected/e93f2d5f-7ffc-44b8-b2dc-d07b73de44ab.json deleted file mode 100644 index 4fb6c3205..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_selected/e93f2d5f-7ffc-44b8-b2dc-d07b73de44ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_600k_fineweb_uncovai_selected/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_600k_fineweb_uncovai_selected", - "id": "FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_selected", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1162 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb/15cacfe0-bdfb-4b87-a813-bfa70ff71984.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb/15cacfe0-bdfb-4b87-a813-bfa70ff71984.json deleted file mode 100644 index 3d0b3cb65..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb/15cacfe0-bdfb-4b87-a813-bfa70ff71984.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_800k_fineweb/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_800k_fineweb", - "id": "FlofloB/smollm2-135M_pretrained_800k_fineweb", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1641 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2959 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed/cff00e2a-41e3-40d2-aab3-4bb3bd7d0d0e.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed/cff00e2a-41e3-40d2-aab3-4bb3bd7d0d0e.json deleted file mode 100644 index ee1ff5fea..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed/cff00e2a-41e3-40d2-aab3-4bb3bd7d0d0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed", - "id": "FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1623 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3038 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1138 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_selected/e1eab0cf-2c6d-44b2-8aaf-a75347741529.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_selected/e1eab0cf-2c6d-44b2-8aaf-a75347741529.json deleted file mode 100644 index c0e2e5309..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_selected/e1eab0cf-2c6d-44b2-8aaf-a75347741529.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_800k_fineweb_uncovai_selected/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_800k_fineweb_uncovai_selected", - "id": "FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_selected", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1474 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2943 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3766 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2_pretrained_200k_fineweb/ed221db8-cf81-4257-8785-db9381eec5b7.json b/data/hfopenllm_v2/FlofloB/smollm2_pretrained_200k_fineweb/ed221db8-cf81-4257-8785-db9381eec5b7.json deleted file mode 100644 index efa15e0a4..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2_pretrained_200k_fineweb/ed221db8-cf81-4257-8785-db9381eec5b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2_pretrained_200k_fineweb/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2_pretrained_200k_fineweb", - "id": "FlofloB/smollm2_pretrained_200k_fineweb", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1159 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/b314468b-401a-4318-b022-c966bf3366aa.json b/data/hfopenllm_v2/FlofloB/test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/b314468b-401a-4318-b022-c966bf3366aa.json deleted file mode 100644 index 0db2c1292..000000000 --- a/data/hfopenllm_v2/FlofloB/test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/b314468b-401a-4318-b022-c966bf3366aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", - "id": "FlofloB/test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 16.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3721 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FuJhen/ft-openhermes-25-mistral-7b-irca-dpo-pairs/a0dbb2eb-66c7-48a3-a85c-725b49141edf.json b/data/hfopenllm_v2/FuJhen/ft-openhermes-25-mistral-7b-irca-dpo-pairs/a0dbb2eb-66c7-48a3-a85c-725b49141edf.json deleted file mode 100644 index 09f22c033..000000000 --- a/data/hfopenllm_v2/FuJhen/ft-openhermes-25-mistral-7b-irca-dpo-pairs/a0dbb2eb-66c7-48a3-a85c-725b49141edf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuJhen_ft-openhermes-25-mistral-7b-irca-dpo-pairs/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ft-openhermes-25-mistral-7b-irca-dpo-pairs", - "id": "FuJhen/ft-openhermes-25-mistral-7b-irca-dpo-pairs", - "developer": "FuJhen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 14.483 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4773 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2956 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FuJhen/mistral-instruct-7B-DPO/812a36ec-4928-40a9-9aa8-ee39d7bb02f5.json b/data/hfopenllm_v2/FuJhen/mistral-instruct-7B-DPO/812a36ec-4928-40a9-9aa8-ee39d7bb02f5.json deleted file mode 100644 index a76f20076..000000000 --- a/data/hfopenllm_v2/FuJhen/mistral-instruct-7B-DPO/812a36ec-4928-40a9-9aa8-ee39d7bb02f5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuJhen_mistral-instruct-7B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-instruct-7B-DPO", - "id": "FuJhen/mistral-instruct-7B-DPO", - "developer": "FuJhen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 14.496 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3034 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_e2e/77af2424-0a23-49f3-97b0-316d04a33547.json b/data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_e2e/77af2424-0a23-49f3-97b0-316d04a33547.json deleted file mode 100644 index fd9de9edd..000000000 --- a/data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_e2e/77af2424-0a23-49f3-97b0-316d04a33547.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuJhen_mistral_7b_v0.1_structedData_e2e/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral_7b_v0.1_structedData_e2e", - "id": "FuJhen/mistral_7b_v0.1_structedData_e2e", - "developer": "FuJhen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1727 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2811 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_viggo/6f422676-2d7e-40ed-a5e3-4afc25564cfc.json 
b/data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_viggo/6f422676-2d7e-40ed-a5e3-4afc25564cfc.json deleted file mode 100644 index 2e924bc72..000000000 --- a/data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_viggo/6f422676-2d7e-40ed-a5e3-4afc25564cfc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuJhen_mistral_7b_v0.1_structedData_viggo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral_7b_v0.1_structedData_viggo", - "id": "FuJhen/mistral_7b_v0.1_structedData_viggo", - "developer": "FuJhen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 14.483 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1783 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2942 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FuseAI/FuseChat-7B-v2.0/43923dd6-838a-4259-a938-7766dfd9c07e.json b/data/hfopenllm_v2/FuseAI/FuseChat-7B-v2.0/43923dd6-838a-4259-a938-7766dfd9c07e.json deleted file mode 100644 index c01a38f0d..000000000 --- 
a/data/hfopenllm_v2/FuseAI/FuseChat-7B-v2.0/43923dd6-838a-4259-a938-7766dfd9c07e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuseAI_FuseChat-7B-v2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseChat-7B-v2.0", - "id": "FuseAI/FuseChat-7B-v2.0", - "developer": "FuseAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4954 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4797 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3162 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.1-8B-Instruct/dba94a49-02b0-4e92-bd6c-c6bfc9be3cfb.json b/data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.1-8B-Instruct/dba94a49-02b0-4e92-bd6c-c6bfc9be3cfb.json deleted file mode 100644 index b8ec7de60..000000000 --- a/data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.1-8B-Instruct/dba94a49-02b0-4e92-bd6c-c6bfc9be3cfb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuseAI_FuseChat-Llama-3.1-8B-Instruct/1770682486.623709", 
- "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseChat-Llama-3.1-8B-Instruct", - "id": "FuseAI/FuseChat-Llama-3.1-8B-Instruct", - "developer": "FuseAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7205 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.2-3B-Instruct/16a782dc-0795-4281-aad6-4f664a0940ab.json b/data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.2-3B-Instruct/16a782dc-0795-4281-aad6-4f664a0940ab.json deleted file mode 100644 index 879cf24a5..000000000 --- a/data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.2-3B-Instruct/16a782dc-0795-4281-aad6-4f664a0940ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuseAI_FuseChat-Llama-3.2-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseChat-Llama-3.2-3B-Instruct", - "id": "FuseAI/FuseChat-Llama-3.2-3B-Instruct", - "developer": "FuseAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6849 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FuseAI/FuseChat-Qwen-2.5-7B-Instruct/5d24d4ad-9f37-4634-ba23-74fbc74fd298.json b/data/hfopenllm_v2/FuseAI/FuseChat-Qwen-2.5-7B-Instruct/5d24d4ad-9f37-4634-ba23-74fbc74fd298.json deleted file mode 100644 index 0a5c32d0b..000000000 --- a/data/hfopenllm_v2/FuseAI/FuseChat-Qwen-2.5-7B-Instruct/5d24d4ad-9f37-4634-ba23-74fbc74fd298.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuseAI_FuseChat-Qwen-2.5-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseChat-Qwen-2.5-7B-Instruct", - "id": "FuseAI/FuseChat-Qwen-2.5-7B-Instruct", - "developer": "FuseAI", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5906 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GalrionSoftworks/MN-LooseCannon-12B-v1/043cd315-fcb7-4871-ae79-dee3fdefaef0.json b/data/hfopenllm_v2/GalrionSoftworks/MN-LooseCannon-12B-v1/043cd315-fcb7-4871-ae79-dee3fdefaef0.json deleted file mode 100644 index a45337ab5..000000000 --- a/data/hfopenllm_v2/GalrionSoftworks/MN-LooseCannon-12B-v1/043cd315-fcb7-4871-ae79-dee3fdefaef0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GalrionSoftworks_MN-LooseCannon-12B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-LooseCannon-12B-v1", - "id": "GalrionSoftworks/MN-LooseCannon-12B-v1", - "developer": "GalrionSoftworks", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5418 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5128 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GalrionSoftworks/MagnusIntellectus-12B-v1/3c377d7e-14bc-4c82-9ada-7560552abbe4.json b/data/hfopenllm_v2/GalrionSoftworks/MagnusIntellectus-12B-v1/3c377d7e-14bc-4c82-9ada-7560552abbe4.json deleted file mode 100644 index 19eeb69d1..000000000 --- a/data/hfopenllm_v2/GalrionSoftworks/MagnusIntellectus-12B-v1/3c377d7e-14bc-4c82-9ada-7560552abbe4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GalrionSoftworks_MagnusIntellectus-12B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MagnusIntellectus-12B-v1", - "id": "GalrionSoftworks/MagnusIntellectus-12B-v1", - "developer": "GalrionSoftworks", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5323 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-2-Merged/43bb650b-8bb7-41b4-866a-cb2dad1499d6.json b/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-2-Merged/43bb650b-8bb7-41b4-866a-cb2dad1499d6.json deleted file mode 100644 index ce87f2d44..000000000 --- a/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-2-Merged/43bb650b-8bb7-41b4-866a-cb2dad1499d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GenVRadmin_AryaBhatta-GemmaOrca-2-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AryaBhatta-GemmaOrca-2-Merged", - "id": "GenVRadmin/AryaBhatta-GemmaOrca-2-Merged", - "developer": "GenVRadmin", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3064 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3887 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-Merged/bdf8f907-37ca-41ca-9a4e-f4dd446f895f.json b/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-Merged/bdf8f907-37ca-41ca-9a4e-f4dd446f895f.json deleted file mode 100644 index 54f086875..000000000 --- a/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-Merged/bdf8f907-37ca-41ca-9a4e-f4dd446f895f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GenVRadmin_AryaBhatta-GemmaOrca-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AryaBhatta-GemmaOrca-Merged", - "id": "GenVRadmin/AryaBhatta-GemmaOrca-Merged", - "developer": "GenVRadmin", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaUltra-Merged/14a1872c-7afd-4cd4-ad87-853e4fc0847e.json b/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaUltra-Merged/14a1872c-7afd-4cd4-ad87-853e4fc0847e.json deleted file mode 100644 index 88dd8d560..000000000 --- a/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaUltra-Merged/14a1872c-7afd-4cd4-ad87-853e4fc0847e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GenVRadmin_AryaBhatta-GemmaUltra-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AryaBhatta-GemmaUltra-Merged", - "id": "GenVRadmin/AryaBhatta-GemmaUltra-Merged", - "developer": "GenVRadmin", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GenVRadmin/llama38bGenZ_Vikas-Merged/887e4ca9-ed48-4b33-b933-f8534a8d0377.json b/data/hfopenllm_v2/GenVRadmin/llama38bGenZ_Vikas-Merged/887e4ca9-ed48-4b33-b933-f8534a8d0377.json deleted file mode 100644 index 0c72024f8..000000000 --- a/data/hfopenllm_v2/GenVRadmin/llama38bGenZ_Vikas-Merged/887e4ca9-ed48-4b33-b933-f8534a8d0377.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GenVRadmin_llama38bGenZ_Vikas-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama38bGenZ_Vikas-Merged", - "id": "GenVRadmin/llama38bGenZ_Vikas-Merged", - "developer": "GenVRadmin", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct/c585488d-4043-482f-b1fa-4a61e96f7f0f.json b/data/hfopenllm_v2/GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct/c585488d-4043-482f-b1fa-4a61e96f7f0f.json deleted file mode 100644 index 99a5c1561..000000000 --- a/data/hfopenllm_v2/GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct/c585488d-4043-482f-b1fa-4a61e96f7f0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GoToCompany_gemma2-9b-cpt-sahabatai-v1-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma2-9b-cpt-sahabatai-v1-instruct", - "id": "GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct", - "developer": "GoToCompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6551 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4779 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct/d64541f6-19ef-4f04-a991-93efec6fe24f.json b/data/hfopenllm_v2/GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct/d64541f6-19ef-4f04-a991-93efec6fe24f.json deleted file mode 100644 index e5f54c812..000000000 --- a/data/hfopenllm_v2/GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct/d64541f6-19ef-4f04-a991-93efec6fe24f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GoToCompany_llama3-8b-cpt-sahabatai-v1-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3-8b-cpt-sahabatai-v1-instruct", - "id": "GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct", - "developer": "GoToCompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5238 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4951 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3453 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1c13e194-8bee-4456-a249-f71e7e34b0eb.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1c13e194-8bee-4456-a249-f71e7e34b0eb.json deleted file mode 100644 index db3383a91..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1c13e194-8bee-4456-a249-f71e7e34b0eb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1", - "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3417 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", 
- "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1638 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1d3db737-20e7-4da1-a311-e60de0b41c93.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1d3db737-20e7-4da1-a311-e60de0b41c93.json deleted file mode 100644 index a0e2c50db..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1d3db737-20e7-4da1-a311-e60de0b41c93.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1", - "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3472 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3268 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1641 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1/7b73d50e-358b-4961-8b58-63765ce5a82a.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1/7b73d50e-358b-4961-8b58-63765ce5a82a.json deleted file mode 100644 index 6da2363a1..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1/7b73d50e-358b-4961-8b58-63765ce5a82a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1", - "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4769 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2085 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3675 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2783 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2/81dfd69c-cf01-4114-8157-fd09af6f490c.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2/81dfd69c-cf01-4114-8157-fd09af6f490c.json deleted file mode 100644 index 5793c4f9c..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2/81dfd69c-cf01-4114-8157-fd09af6f490c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2", - "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4216 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4042 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3/f38240ab-35e4-431e-b4d5-b1b0e1d57c5f.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3/f38240ab-35e4-431e-b4d5-b1b0e1d57c5f.json deleted file mode 100644 index c4deee9bd..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3/f38240ab-35e4-431e-b4d5-b1b0e1d57c5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3", - "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4053 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2556 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-14B-Instruct-abliterated-v4/01863b4f-9550-49c3-ad83-74c0bb535eb9.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-14B-Instruct-abliterated-v4/01863b4f-9550-49c3-ad83-74c0bb535eb9.json deleted file mode 100644 index 42f6949ab..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-14B-Instruct-abliterated-v4/01863b4f-9550-49c3-ad83-74c0bb535eb9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_Josiefied-Qwen2.5-14B-Instruct-abliterated-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-14B-Instruct-abliterated-v4", - "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-14B-Instruct-abliterated-v4", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8292 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6356 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5018 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/edd25437-38bc-443c-9da3-bc041270447e.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/edd25437-38bc-443c-9da3-bc041270447e.json deleted file mode 100644 index 0afe4d3d1..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/edd25437-38bc-443c-9da3-bc041270447e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", - "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/j.o.s.i.e.v4o-1.5b-dpo-stage1-v1/31836d43-5022-488f-ba9e-379195809069.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/j.o.s.i.e.v4o-1.5b-dpo-stage1-v1/31836d43-5022-488f-ba9e-379195809069.json deleted file mode 100644 index 85cf3c248..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/j.o.s.i.e.v4o-1.5b-dpo-stage1-v1/31836d43-5022-488f-ba9e-379195809069.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_j.o.s.i.e.v4o-1.5b-dpo-stage1-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "j.o.s.i.e.v4o-1.5b-dpo-stage1-v1", - "id": "Goekdeniz-Guelmez/j.o.s.i.e.v4o-1.5b-dpo-stage1-v1", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2555 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-3b-v6.0/2a5a3ed6-7137-49e2-a141-497ceba88757.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-3b-v6.0/2a5a3ed6-7137-49e2-a141-497ceba88757.json deleted file mode 100644 index 492ac7214..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-3b-v6.0/2a5a3ed6-7137-49e2-a141-497ceba88757.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_josie-3b-v6.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "josie-3b-v6.0", - "id": "Goekdeniz-Guelmez/josie-3b-v6.0", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.601 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4496 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2938 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.3861 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.322 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/0b1c6aa6-b94e-4400-9b0d-c39aa1bcd808.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/0b1c6aa6-b94e-4400-9b0d-c39aa1bcd808.json deleted file mode 100644 index 55f207668..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/0b1c6aa6-b94e-4400-9b0d-c39aa1bcd808.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_josie-7b-v6.0-step2000/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "josie-7b-v6.0-step2000", - "id": "Goekdeniz-Guelmez/josie-7b-v6.0-step2000", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7598 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4012 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/69423132-adc9-4b97-b799-15f37de1d7e5.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/69423132-adc9-4b97-b799-15f37de1d7e5.json deleted file mode 100644 index 8561cb840..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/69423132-adc9-4b97-b799-15f37de1d7e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_josie-7b-v6.0-step2000/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "josie-7b-v6.0-step2000", - "id": "Goekdeniz-Guelmez/josie-7b-v6.0-step2000", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7628 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0/54d5bf0f-7c4c-40b1-bca6-5484ef8e2a04.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0/54d5bf0f-7c4c-40b1-bca6-5484ef8e2a04.json deleted file mode 100644 index 359a9fc74..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0/54d5bf0f-7c4c-40b1-bca6-5484ef8e2a04.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_josie-7b-v6.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "josie-7b-v6.0", - "id": "Goekdeniz-Guelmez/josie-7b-v6.0", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GreenNode/GreenNode-small-9B-it/cfe8f9c7-e9bf-4a17-afa0-d5b8f46d24e7.json 
b/data/hfopenllm_v2/GreenNode/GreenNode-small-9B-it/cfe8f9c7-e9bf-4a17-afa0-d5b8f46d24e7.json deleted file mode 100644 index 847a2aed3..000000000 --- a/data/hfopenllm_v2/GreenNode/GreenNode-small-9B-it/cfe8f9c7-e9bf-4a17-afa0-d5b8f46d24e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GreenNode_GreenNode-small-9B-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GreenNode-small-9B-it", - "id": "GreenNode/GreenNode-small-9B-it", - "developer": "GreenNode", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5994 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1745 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4204 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GritLM/GritLM-7B-KTO/7fbc0323-1c78-46b6-a08a-6e5870c64e53.json b/data/hfopenllm_v2/GritLM/GritLM-7B-KTO/7fbc0323-1c78-46b6-a08a-6e5870c64e53.json deleted file mode 100644 index 054831436..000000000 --- a/data/hfopenllm_v2/GritLM/GritLM-7B-KTO/7fbc0323-1c78-46b6-a08a-6e5870c64e53.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GritLM_GritLM-7B-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GritLM-7B-KTO", - "id": "GritLM/GritLM-7B-KTO", - "developer": "GritLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4853 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.268 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GritLM/GritLM-8x7B-KTO/1c769f0d-b99d-4b82-a529-f5264f7b3349.json b/data/hfopenllm_v2/GritLM/GritLM-8x7B-KTO/1c769f0d-b99d-4b82-a529-f5264f7b3349.json deleted file mode 100644 index 9c52b15ad..000000000 --- a/data/hfopenllm_v2/GritLM/GritLM-8x7B-KTO/1c769f0d-b99d-4b82-a529-f5264f7b3349.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GritLM_GritLM-8x7B-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GritLM-8x7B-KTO", - "id": "GritLM/GritLM-8x7B-KTO", - "developer": "GritLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5714 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.582 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Groq/Llama-3-Groq-8B-Tool-Use/a9365685-e299-48e2-931a-c63e123a9e00.json b/data/hfopenllm_v2/Groq/Llama-3-Groq-8B-Tool-Use/a9365685-e299-48e2-931a-c63e123a9e00.json deleted file mode 100644 index 1407a69b5..000000000 --- a/data/hfopenllm_v2/Groq/Llama-3-Groq-8B-Tool-Use/a9365685-e299-48e2-931a-c63e123a9e00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Groq_Llama-3-Groq-8B-Tool-Use/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Groq-8B-Tool-Use", - "id": "Groq/Llama-3-Groq-8B-Tool-Use", - "developer": "Groq", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6098 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4863 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.0-8b-Llama-3/bdf2d61a-daa1-4b1f-9245-43ff263540fb.json b/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.0-8b-Llama-3/bdf2d61a-daa1-4b1f-9245-43ff263540fb.json deleted file mode 100644 index 7c3f4b4a5..000000000 --- a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.0-8b-Llama-3/bdf2d61a-daa1-4b1f-9245-43ff263540fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gryphe_Pantheon-RP-1.0-8b-Llama-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pantheon-RP-1.0-8b-Llama-3", - "id": "Gryphe/Pantheon-RP-1.0-8b-Llama-3", - "developer": "Gryphe", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3933 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3832 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3067 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.5-12b-Nemo/f0b4eef9-dab2-48e2-87f8-ad83ec33ec23.json b/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.5-12b-Nemo/f0b4eef9-dab2-48e2-87f8-ad83ec33ec23.json deleted file mode 100644 index 5c8283525..000000000 --- a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.5-12b-Nemo/f0b4eef9-dab2-48e2-87f8-ad83ec33ec23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gryphe_Pantheon-RP-1.5-12b-Nemo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pantheon-RP-1.5-12b-Nemo", - "id": "Gryphe/Pantheon-RP-1.5-12b-Nemo", - "developer": "Gryphe", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.442 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo-KTO/29e10491-8c34-4b7a-a0bd-77f6ca0dc54c.json b/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo-KTO/29e10491-8c34-4b7a-a0bd-77f6ca0dc54c.json deleted file mode 100644 index 2a9bc0b72..000000000 --- a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo-KTO/29e10491-8c34-4b7a-a0bd-77f6ca0dc54c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gryphe_Pantheon-RP-1.6-12b-Nemo-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pantheon-RP-1.6-12b-Nemo-KTO", - "id": "Gryphe/Pantheon-RP-1.6-12b-Nemo-KTO", - "developer": "Gryphe", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4248 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo/c588d86a-80c4-46d1-93e0-b7fa8491f3b3.json b/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo/c588d86a-80c4-46d1-93e0-b7fa8491f3b3.json deleted file mode 100644 index 36f59a731..000000000 --- a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo/c588d86a-80c4-46d1-93e0-b7fa8491f3b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gryphe_Pantheon-RP-1.6-12b-Nemo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pantheon-RP-1.6-12b-Nemo", - "id": "Gryphe/Pantheon-RP-1.6-12b-Nemo", - "developer": "Gryphe", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - }, - { - "evaluation_name": "MATH Level 
5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gryphe/Pantheon-RP-Pure-1.6.2-22b-Small/0b11eb9a-61c8-4af1-8335-24bef2597e5d.json b/data/hfopenllm_v2/Gryphe/Pantheon-RP-Pure-1.6.2-22b-Small/0b11eb9a-61c8-4af1-8335-24bef2597e5d.json deleted file mode 100644 index 672a5af34..000000000 --- a/data/hfopenllm_v2/Gryphe/Pantheon-RP-Pure-1.6.2-22b-Small/0b11eb9a-61c8-4af1-8335-24bef2597e5d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gryphe_Pantheon-RP-Pure-1.6.2-22b-Small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pantheon-RP-Pure-1.6.2-22b-Small", - "id": "Gryphe/Pantheon-RP-Pure-1.6.2-22b-Small", - "developer": "Gryphe", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6931 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3765 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3942 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/7d31e5fd-700a-42a8-bea8-8989e8c52603.json b/data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/7d31e5fd-700a-42a8-bea8-8989e8c52603.json deleted file mode 100644 index 51a98d10f..000000000 --- a/data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/7d31e5fd-700a-42a8-bea8-8989e8c52603.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GuilhermeNaturaUmana_Nature-Reason-1.2-reallysmall/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nature-Reason-1.2-reallysmall", - "id": "GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall", - "developer": "GuilhermeNaturaUmana", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5649 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4439 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/f993880a-3c7c-4af9-a3ce-3c27207b9a3c.json b/data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/f993880a-3c7c-4af9-a3ce-3c27207b9a3c.json deleted file mode 100644 index 36f7c0fb0..000000000 --- a/data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/f993880a-3c7c-4af9-a3ce-3c27207b9a3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GuilhermeNaturaUmana_Nature-Reason-1.2-reallysmall/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nature-Reason-1.2-reallysmall", - "id": "GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall", - "developer": "GuilhermeNaturaUmana", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4985 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5645 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/2fae7e4a-8c28-4be8-9391-ca79077e32c2.json b/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/2fae7e4a-8c28-4be8-9391-ca79077e32c2.json deleted file mode 100644 index 3d2506966..000000000 --- a/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/2fae7e4a-8c28-4be8-9391-ca79077e32c2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gunulhona_Gemma-Ko-Merge-PEFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-Ko-Merge-PEFT", - "id": "Gunulhona/Gemma-Ko-Merge-PEFT", - "developer": "Gunulhona", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 20.318 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/436e651e-6f04-44ff-ab3d-db8ed0d639bd.json b/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/436e651e-6f04-44ff-ab3d-db8ed0d639bd.json deleted file mode 100644 index d5f8fff6e..000000000 --- a/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/436e651e-6f04-44ff-ab3d-db8ed0d639bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gunulhona_Gemma-Ko-Merge-PEFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-Ko-Merge-PEFT", - "id": "Gunulhona/Gemma-Ko-Merge-PEFT", - "developer": "Gunulhona", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 20.318 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4863 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3986 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge/9fbccac2-c840-494e-a24d-a6f0c9a07b88.json b/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge/9fbccac2-c840-494e-a24d-a6f0c9a07b88.json deleted file mode 100644 index 6f57f8828..000000000 --- a/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge/9fbccac2-c840-494e-a24d-a6f0c9a07b88.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gunulhona_Gemma-Ko-Merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-Ko-Merge", - "id": "Gunulhona/Gemma-Ko-Merge", - "developer": "Gunulhona", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5813 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1881 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4047 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HPAI-BSC/Llama3-Aloe-8B-Alpha/a4ee6a33-df51-4a4e-a13d-45488a094fd7.json b/data/hfopenllm_v2/HPAI-BSC/Llama3-Aloe-8B-Alpha/a4ee6a33-df51-4a4e-a13d-45488a094fd7.json deleted file mode 100644 index 97b6f883b..000000000 --- a/data/hfopenllm_v2/HPAI-BSC/Llama3-Aloe-8B-Alpha/a4ee6a33-df51-4a4e-a13d-45488a094fd7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HPAI-BSC_Llama3-Aloe-8B-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-Aloe-8B-Alpha", - "id": "HPAI-BSC/Llama3-Aloe-8B-Alpha", - "developer": "HPAI-BSC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5081 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4831 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.3295 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HPAI-BSC/Llama3.1-Aloe-Beta-8B/a3923f10-e64c-4556-9616-4fe7072eff60.json b/data/hfopenllm_v2/HPAI-BSC/Llama3.1-Aloe-Beta-8B/a3923f10-e64c-4556-9616-4fe7072eff60.json deleted file mode 100644 index 00bf8e2ef..000000000 --- a/data/hfopenllm_v2/HPAI-BSC/Llama3.1-Aloe-Beta-8B/a3923f10-e64c-4556-9616-4fe7072eff60.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HPAI-BSC_Llama3.1-Aloe-Beta-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-Aloe-Beta-8B", - "id": "HPAI-BSC/Llama3.1-Aloe-Beta-8B", - "developer": "HPAI-BSC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7253 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3835 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HPAI-BSC/Qwen2.5-Aloe-Beta-7B/ca15d972-9075-42df-884b-5d069f6ff425.json 
b/data/hfopenllm_v2/HPAI-BSC/Qwen2.5-Aloe-Beta-7B/ca15d972-9075-42df-884b-5d069f6ff425.json deleted file mode 100644 index 4e7791b20..000000000 --- a/data/hfopenllm_v2/HPAI-BSC/Qwen2.5-Aloe-Beta-7B/ca15d972-9075-42df-884b-5d069f6ff425.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HPAI-BSC_Qwen2.5-Aloe-Beta-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Aloe-Beta-7B", - "id": "HPAI-BSC/Qwen2.5-Aloe-Beta-7B", - "developer": "HPAI-BSC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5049 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1.2/905909a5-abef-46bf-9392-c97873e229df.json b/data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1.2/905909a5-abef-46bf-9392-c97873e229df.json deleted file mode 100644 index e6b388131..000000000 --- a/data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1.2/905909a5-abef-46bf-9392-c97873e229df.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HarbingerX_Zeitgeist-3b-V1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zeitgeist-3b-V1.2", - "id": "HarbingerX/Zeitgeist-3b-V1.2", - "developer": "HarbingerX", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3056 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1/95bd05cf-8f59-409d-a99e-d249bad6c561.json b/data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1/95bd05cf-8f59-409d-a99e-d249bad6c561.json deleted file mode 100644 index e04145e0b..000000000 --- a/data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1/95bd05cf-8f59-409d-a99e-d249bad6c561.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HarbingerX_Zeitgeist-3b-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zeitgeist-3b-V1", - "id": "HarbingerX/Zeitgeist-3b-V1", - "developer": "HarbingerX", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6712 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3009 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Hastagaras/L3.2-JametMini-3B-MK.III/76b12246-33f6-4992-a0ab-38704dcf6345.json b/data/hfopenllm_v2/Hastagaras/L3.2-JametMini-3B-MK.III/76b12246-33f6-4992-a0ab-38704dcf6345.json deleted file mode 100644 index 9a45e8545..000000000 --- a/data/hfopenllm_v2/Hastagaras/L3.2-JametMini-3B-MK.III/76b12246-33f6-4992-a0ab-38704dcf6345.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Hastagaras_L3.2-JametMini-3B-MK.III/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.2-JametMini-3B-MK.III", - "id": "Hastagaras/L3.2-JametMini-3B-MK.III", - "developer": 
"Hastagaras", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6183 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2983 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Hastagaras/Llama-3.1-Jamet-8B-MK.I/e4415806-0ec0-465a-b28f-9c8741436fb4.json b/data/hfopenllm_v2/Hastagaras/Llama-3.1-Jamet-8B-MK.I/e4415806-0ec0-465a-b28f-9c8741436fb4.json deleted file mode 100644 index 680f1f7a0..000000000 --- a/data/hfopenllm_v2/Hastagaras/Llama-3.1-Jamet-8B-MK.I/e4415806-0ec0-465a-b28f-9c8741436fb4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Hastagaras_Llama-3.1-Jamet-8B-MK.I/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Jamet-8B-MK.I", - "id": "Hastagaras/Llama-3.1-Jamet-8B-MK.I", - "developer": "Hastagaras", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7338 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5049 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3726 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Hastagaras/Zabuza-8B-Llama-3.1/98e62ab5-d35a-42dd-904b-bed9c50f3745.json b/data/hfopenllm_v2/Hastagaras/Zabuza-8B-Llama-3.1/98e62ab5-d35a-42dd-904b-bed9c50f3745.json deleted file mode 100644 index a06055349..000000000 --- a/data/hfopenllm_v2/Hastagaras/Zabuza-8B-Llama-3.1/98e62ab5-d35a-42dd-904b-bed9c50f3745.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Hastagaras_Zabuza-8B-Llama-3.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zabuza-8B-Llama-3.1", - "id": "Hastagaras/Zabuza-8B-Llama-3.1", - "developer": "Hastagaras", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6265 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3568 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HelpingAI/Cipher-20B/8fb3596e-224e-492b-bdb6-a95a16656eb0.json b/data/hfopenllm_v2/HelpingAI/Cipher-20B/8fb3596e-224e-492b-bdb6-a95a16656eb0.json deleted file mode 100644 index a8c33014c..000000000 --- a/data/hfopenllm_v2/HelpingAI/Cipher-20B/8fb3596e-224e-492b-bdb6-a95a16656eb0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HelpingAI_Cipher-20B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cipher-20B", - "id": "HelpingAI/Cipher-20B", - "developer": "HelpingAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 20.551 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1994 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3744 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HelpingAI/Dhanishtha-Large/154203c4-d86e-4c36-806b-c45c5cc568ce.json b/data/hfopenllm_v2/HelpingAI/Dhanishtha-Large/154203c4-d86e-4c36-806b-c45c5cc568ce.json deleted file mode 100644 index d0d75bfd3..000000000 --- a/data/hfopenllm_v2/HelpingAI/Dhanishtha-Large/154203c4-d86e-4c36-806b-c45c5cc568ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HelpingAI_Dhanishtha-Large/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dhanishtha-Large", - "id": "HelpingAI/Dhanishtha-Large", - "developer": "HelpingAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4604 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3845 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2755 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HelpingAI/Priya-10B/e42c01f7-2869-4103-bbfd-81aa5a15c140.json b/data/hfopenllm_v2/HelpingAI/Priya-10B/e42c01f7-2869-4103-bbfd-81aa5a15c140.json deleted file mode 100644 index 5c591c97d..000000000 --- a/data/hfopenllm_v2/HelpingAI/Priya-10B/e42c01f7-2869-4103-bbfd-81aa5a15c140.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HelpingAI_Priya-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Priya-10B", - "id": "HelpingAI/Priya-10B", - "developer": "HelpingAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.211 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2493 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HelpingAI/Priya-3B/323d2f94-5e04-4627-9f74-129217f53eea.json b/data/hfopenllm_v2/HelpingAI/Priya-3B/323d2f94-5e04-4627-9f74-129217f53eea.json deleted file mode 100644 index ab36912a7..000000000 --- a/data/hfopenllm_v2/HelpingAI/Priya-3B/323d2f94-5e04-4627-9f74-129217f53eea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HelpingAI_Priya-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Priya-3B", - "id": "HelpingAI/Priya-3B", - "developer": "HelpingAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.81 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4526 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3961 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HeraiHench/DeepSeek-R1-Qwen-Coder-8B/6bcc284b-8973-47d5-b5b1-1abb7a3242ee.json b/data/hfopenllm_v2/HeraiHench/DeepSeek-R1-Qwen-Coder-8B/6bcc284b-8973-47d5-b5b1-1abb7a3242ee.json deleted file mode 100644 index 83225ff07..000000000 --- a/data/hfopenllm_v2/HeraiHench/DeepSeek-R1-Qwen-Coder-8B/6bcc284b-8973-47d5-b5b1-1abb7a3242ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HeraiHench_DeepSeek-R1-Qwen-Coder-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Qwen-Coder-8B", - "id": "HeraiHench/DeepSeek-R1-Qwen-Coder-8B", - "developer": "HeraiHench", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 8.164 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1869 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2913 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HeraiHench/Double-Down-Qwen-Math-7B/691cace3-5316-4f5b-8693-67efb24a0a06.json b/data/hfopenllm_v2/HeraiHench/Double-Down-Qwen-Math-7B/691cace3-5316-4f5b-8693-67efb24a0a06.json deleted file mode 100644 index 6515c0ca8..000000000 --- a/data/hfopenllm_v2/HeraiHench/Double-Down-Qwen-Math-7B/691cace3-5316-4f5b-8693-67efb24a0a06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HeraiHench_Double-Down-Qwen-Math-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Double-Down-Qwen-Math-7B", - "id": "HeraiHench/Double-Down-Qwen-Math-7B", - "developer": "HeraiHench", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.167 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2845 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.3737 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HeraiHench/Marge-Qwen-Math-7B/d387b3dc-9e76-44a6-9a9f-132a4fd762b4.json b/data/hfopenllm_v2/HeraiHench/Marge-Qwen-Math-7B/d387b3dc-9e76-44a6-9a9f-132a4fd762b4.json deleted file mode 100644 index c9043860f..000000000 --- a/data/hfopenllm_v2/HeraiHench/Marge-Qwen-Math-7B/d387b3dc-9e76-44a6-9a9f-132a4fd762b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HeraiHench_Marge-Qwen-Math-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Marge-Qwen-Math-7B", - "id": "HeraiHench/Marge-Qwen-Math-7B", - "developer": "HeraiHench", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1262 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3069 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1056 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HeraiHench/Phi-4-slerp-ReasoningRP-14B/f6f515d3-f5e9-4362-be51-bb8fc05527e6.json b/data/hfopenllm_v2/HeraiHench/Phi-4-slerp-ReasoningRP-14B/f6f515d3-f5e9-4362-be51-bb8fc05527e6.json deleted file mode 100644 index 287a631b7..000000000 --- a/data/hfopenllm_v2/HeraiHench/Phi-4-slerp-ReasoningRP-14B/f6f515d3-f5e9-4362-be51-bb8fc05527e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HeraiHench_Phi-4-slerp-ReasoningRP-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-slerp-ReasoningRP-14B", - "id": "HeraiHench/Phi-4-slerp-ReasoningRP-14B", - "developer": "HeraiHench", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 9.207 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3116 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.19 - } - } 
- ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HiroseKoichi/Llama-Salad-4x8B-V3/2e1e215f-b622-439f-a13f-531441e25ae3.json b/data/hfopenllm_v2/HiroseKoichi/Llama-Salad-4x8B-V3/2e1e215f-b622-439f-a13f-531441e25ae3.json deleted file mode 100644 index 68ae14b6f..000000000 --- a/data/hfopenllm_v2/HiroseKoichi/Llama-Salad-4x8B-V3/2e1e215f-b622-439f-a13f-531441e25ae3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HiroseKoichi_Llama-Salad-4x8B-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Salad-4x8B-V3", - "id": "HiroseKoichi/Llama-Salad-4x8B-V3", - "developer": "HiroseKoichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.942 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6654 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5245 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HoangHa/Pensez-Llama3.1-8B/d50d66a9-a0c4-4b82-922c-9d012f1b50a1.json b/data/hfopenllm_v2/HoangHa/Pensez-Llama3.1-8B/d50d66a9-a0c4-4b82-922c-9d012f1b50a1.json 
deleted file mode 100644 index bdc98fec4..000000000 --- a/data/hfopenllm_v2/HoangHa/Pensez-Llama3.1-8B/d50d66a9-a0c4-4b82-922c-9d012f1b50a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HoangHa_Pensez-Llama3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pensez-Llama3.1-8B", - "id": "HoangHa/Pensez-Llama3.1-8B", - "developer": "HoangHa", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3887 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4669 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3597 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-alpha/ea7292a8-3f07-47be-b8ae-7d352ed1ecb6.json b/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-alpha/ea7292a8-3f07-47be-b8ae-7d352ed1ecb6.json deleted file mode 100644 index 6f52aa9ae..000000000 --- a/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-alpha/ea7292a8-3f07-47be-b8ae-7d352ed1ecb6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/HuggingFaceH4_zephyr-7b-alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-7b-alpha", - "id": "HuggingFaceH4/zephyr-7b-alpha", - "developer": "HuggingFaceH4", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4583 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2795 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-beta/4eedd6d4-279f-4660-8d71-708a27bb53e0.json b/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-beta/4eedd6d4-279f-4660-8d71-708a27bb53e0.json deleted file mode 100644 index 243694134..000000000 --- a/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-beta/4eedd6d4-279f-4660-8d71-708a27bb53e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceH4_zephyr-7b-beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-7b-beta", - "id": "HuggingFaceH4/zephyr-7b-beta", - "developer": "HuggingFaceH4", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.495 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3925 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2781 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-gemma-v0.1/9c0f67d1-f95d-4ca0-a234-2e09ac788f55.json b/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-gemma-v0.1/9c0f67d1-f95d-4ca0-a234-2e09ac788f55.json deleted file mode 100644 index 46badaeea..000000000 --- a/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-gemma-v0.1/9c0f67d1-f95d-4ca0-a234-2e09ac788f55.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceH4_zephyr-7b-gemma-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-7b-gemma-v0.1", - "id": "HuggingFaceH4/zephyr-7b-gemma-v0.1", - "developer": "HuggingFaceH4", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2847 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1/e5c0fbc9-f424-4b04-839a-8335adaf89cc.json b/data/hfopenllm_v2/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1/e5c0fbc9-f424-4b04-839a-8335adaf89cc.json deleted file mode 100644 index 24a491daf..000000000 --- a/data/hfopenllm_v2/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1/e5c0fbc9-f424-4b04-839a-8335adaf89cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceH4_zephyr-orpo-141b-A35b-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-orpo-141b-A35b-v0.1", - "id": "HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1", - "developer": "HuggingFaceH4", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 140.621 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6511 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.629 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2047 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4465 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4586 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B-Instruct/d91107fa-eb8d-4d01-90a2-fc9831f337b2.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B-Instruct/d91107fa-eb8d-4d01-90a2-fc9831f337b2.json deleted file mode 100644 index 0268b6bbe..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B-Instruct/d91107fa-eb8d-4d01-90a2-fc9831f337b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM-1.7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM-1.7B-Instruct", - "id": "HuggingFaceTB/SmolLM-1.7B-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.71 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B/926999bf-1ba6-4321-82b2-fcced4336739.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B/926999bf-1ba6-4321-82b2-fcced4336739.json deleted file mode 100644 index e9cd807d2..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B/926999bf-1ba6-4321-82b2-fcced4336739.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM-1.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM-1.7B", - "id": "HuggingFaceTB/SmolLM-1.7B", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.71 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2362 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M-Instruct/57d481bf-0db9-4208-afda-dcd20df13964.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M-Instruct/57d481bf-0db9-4208-afda-dcd20df13964.json deleted file mode 100644 index 745d7547b..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M-Instruct/57d481bf-0db9-4208-afda-dcd20df13964.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM-135M-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM-135M-Instruct", - "id": "HuggingFaceTB/SmolLM-135M-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3015 - } - }, 
- { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3635 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1176 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M/eb417e47-fe63-4dc5-b3e5-28782f3782da.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M/eb417e47-fe63-4dc5-b3e5-28782f3782da.json deleted file mode 100644 index b15e9c58d..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M/eb417e47-fe63-4dc5-b3e5-28782f3782da.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM-135M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM-135M", - "id": "HuggingFaceTB/SmolLM-135M", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.13 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2125 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3046 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M-Instruct/b0f516dd-7185-4906-87a5-3c6f019894d0.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M-Instruct/b0f516dd-7185-4906-87a5-3c6f019894d0.json deleted file mode 100644 index c5826e27d..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M-Instruct/b0f516dd-7185-4906-87a5-3c6f019894d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM-360M-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM-360M-Instruct", - "id": "HuggingFaceTB/SmolLM-360M-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1952 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3472 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M/1e562944-a205-4ef7-aff1-3776595d131c.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M/1e562944-a205-4ef7-aff1-3776595d131c.json deleted file mode 100644 index 3e49abf4e..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M/1e562944-a205-4ef7-aff1-3776595d131c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM-360M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM-360M", - "id": "HuggingFaceTB/SmolLM-360M", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.36 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2134 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3065 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B-Instruct/6ccaf08d-1b0a-4ca9-941e-a71e2dce5cb4.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B-Instruct/6ccaf08d-1b0a-4ca9-941e-a71e2dce5cb4.json deleted file mode 100644 index 7fcd1bd5f..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B-Instruct/6ccaf08d-1b0a-4ca9-941e-a71e2dce5cb4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM2-1.7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-1.7B-Instruct", - "id": "HuggingFaceTB/SmolLM2-1.7B-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5368 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B/2064938d-9f05-4740-a4d4-2a2da0eac21d.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B/2064938d-9f05-4740-a4d4-2a2da0eac21d.json deleted file mode 100644 index c33283bc1..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B/2064938d-9f05-4740-a4d4-2a2da0eac21d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM2-1.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-1.7B", - "id": "HuggingFaceTB/SmolLM2-1.7B", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.71 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.244 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3453 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3485 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2138 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/43240184-8245-43ff-a971-678523918fe0.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/43240184-8245-43ff-a971-678523918fe0.json deleted file mode 100644 index a41b862d1..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/43240184-8245-43ff-a971-678523918fe0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM2-135M-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-135M-Instruct", - "id": "HuggingFaceTB/SmolLM2-135M-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0593 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3135 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2341 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1092 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/b3b854b6-700c-4297-b335-6acc3c385f84.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/b3b854b6-700c-4297-b335-6acc3c385f84.json deleted file mode 100644 index 5a55c0fe8..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/b3b854b6-700c-4297-b335-6acc3c385f84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM2-135M-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-135M-Instruct", - "id": "HuggingFaceTB/SmolLM2-135M-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2357 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3662 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1115 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M/a9d79c6a-f99a-4b60-8e37-ee2cdfe75f30.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M/a9d79c6a-f99a-4b60-8e37-ee2cdfe75f30.json deleted file mode 100644 index 08375897a..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M/a9d79c6a-f99a-4b60-8e37-ee2cdfe75f30.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM2-135M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-135M", - "id": "HuggingFaceTB/SmolLM2-135M", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1818 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3044 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4112 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/88e1dd78-d3bc-401b-88e9-d963bac181db.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/88e1dd78-d3bc-401b-88e9-d963bac181db.json deleted file mode 100644 index bfc21511e..000000000 --- 
a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/88e1dd78-d3bc-401b-88e9-d963bac181db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM2-360M-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-360M-Instruct", - "id": "HuggingFaceTB/SmolLM2-360M-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.36 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/a41bd607-f319-4063-a6e4-813f43e40568.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/a41bd607-f319-4063-a6e4-813f43e40568.json deleted file mode 100644 index 96d89080d..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/a41bd607-f319-4063-a6e4-813f43e40568.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/HuggingFaceTB_SmolLM2-360M-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-360M-Instruct", - "id": "HuggingFaceTB/SmolLM2-360M-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.083 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3053 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M/8629aef1-c673-4b17-a9cc-b361a53bdaa7.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M/8629aef1-c673-4b17-a9cc-b361a53bdaa7.json deleted file mode 100644 index 95ae8687c..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M/8629aef1-c673-4b17-a9cc-b361a53bdaa7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM2-360M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-360M", - "id": "HuggingFaceTB/SmolLM2-360M", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.36 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3233 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3954 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HumanLLMs/Humanish-LLama3-8B-Instruct/532c927a-dc0c-4e65-8ab0-7b9ddd889d89.json b/data/hfopenllm_v2/HumanLLMs/Humanish-LLama3-8B-Instruct/532c927a-dc0c-4e65-8ab0-7b9ddd889d89.json deleted file mode 100644 index c3eae323f..000000000 --- a/data/hfopenllm_v2/HumanLLMs/Humanish-LLama3-8B-Instruct/532c927a-dc0c-4e65-8ab0-7b9ddd889d89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HumanLLMs_Humanish-LLama3-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Humanish-LLama3-8B-Instruct", - "id": "HumanLLMs/Humanish-LLama3-8B-Instruct", - "developer": "HumanLLMs", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6498 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HumanLLMs/Humanish-Mistral-Nemo-Instruct-2407/843f9927-9865-4066-9cc0-f0522d3b914f.json b/data/hfopenllm_v2/HumanLLMs/Humanish-Mistral-Nemo-Instruct-2407/843f9927-9865-4066-9cc0-f0522d3b914f.json deleted file mode 100644 index b2ce02227..000000000 --- a/data/hfopenllm_v2/HumanLLMs/Humanish-Mistral-Nemo-Instruct-2407/843f9927-9865-4066-9cc0-f0522d3b914f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HumanLLMs_Humanish-Mistral-Nemo-Instruct-2407/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Humanish-Mistral-Nemo-Instruct-2407", - "id": "HumanLLMs/Humanish-Mistral-Nemo-Instruct-2407", - "developer": "HumanLLMs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5451 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3521 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HumanLLMs/Humanish-Qwen2.5-7B-Instruct/eeecb2cb-e286-443f-84aa-d825702a4ad8.json b/data/hfopenllm_v2/HumanLLMs/Humanish-Qwen2.5-7B-Instruct/eeecb2cb-e286-443f-84aa-d825702a4ad8.json deleted file mode 100644 index 899bd0d00..000000000 --- a/data/hfopenllm_v2/HumanLLMs/Humanish-Qwen2.5-7B-Instruct/eeecb2cb-e286-443f-84aa-d825702a4ad8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HumanLLMs_Humanish-Qwen2.5-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Humanish-Qwen2.5-7B-Instruct", - "id": "HumanLLMs/Humanish-Qwen2.5-7B-Instruct", - "developer": "HumanLLMs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7284 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5364 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/IDEA-CCNL/Ziya-LLaMA-13B-v1/36ab4f5a-b2cf-4d01-8283-9eaf2c90928f.json b/data/hfopenllm_v2/IDEA-CCNL/Ziya-LLaMA-13B-v1/36ab4f5a-b2cf-4d01-8283-9eaf2c90928f.json deleted file mode 100644 index 27a4a8e2c..000000000 --- a/data/hfopenllm_v2/IDEA-CCNL/Ziya-LLaMA-13B-v1/36ab4f5a-b2cf-4d01-8283-9eaf2c90928f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/IDEA-CCNL_Ziya-LLaMA-13B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ziya-LLaMA-13B-v1", - "id": "IDEA-CCNL/Ziya-LLaMA-13B-v1", - "developer": "IDEA-CCNL", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2877 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3751 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1101 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0/c4e810f1-ffb3-4ece-b445-64e339761530.json b/data/hfopenllm_v2/INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0/c4e810f1-ffb3-4ece-b445-64e339761530.json deleted file mode 100644 index 225b86c3c..000000000 --- a/data/hfopenllm_v2/INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0/c4e810f1-ffb3-4ece-b445-64e339761530.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/INSAIT-Institute_BgGPT-Gemma-2-27B-IT-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BgGPT-Gemma-2-27B-IT-v1.0", - "id": "INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0", - "developer": "INSAIT-Institute", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/IlyaGusev/gemma-2-2b-it-abliterated/025725b6-0034-48c0-a720-5fc210e5e24b.json b/data/hfopenllm_v2/IlyaGusev/gemma-2-2b-it-abliterated/025725b6-0034-48c0-a720-5fc210e5e24b.json deleted file mode 100644 index 1daa4500b..000000000 --- a/data/hfopenllm_v2/IlyaGusev/gemma-2-2b-it-abliterated/025725b6-0034-48c0-a720-5fc210e5e24b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/IlyaGusev_gemma-2-2b-it-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-it-abliterated", - "id": "IlyaGusev/gemma-2-2b-it-abliterated", - "developer": "IlyaGusev", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2538 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/IlyaGusev/gemma-2-9b-it-abliterated/7bdd8928-c336-494e-9c87-de9ecc2749b8.json b/data/hfopenllm_v2/IlyaGusev/gemma-2-9b-it-abliterated/7bdd8928-c336-494e-9c87-de9ecc2749b8.json deleted file mode 100644 index b601a539f..000000000 --- a/data/hfopenllm_v2/IlyaGusev/gemma-2-9b-it-abliterated/7bdd8928-c336-494e-9c87-de9ecc2749b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/IlyaGusev_gemma-2-9b-it-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it-abliterated", - "id": "IlyaGusev/gemma-2-9b-it-abliterated", - "developer": "IlyaGusev", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5906 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3915 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Infinirc/Infinirc-Llama3-8B-2G-Release-v1.0/ff7369dc-3ff2-424b-80b0-e06a141b54f3.json b/data/hfopenllm_v2/Infinirc/Infinirc-Llama3-8B-2G-Release-v1.0/ff7369dc-3ff2-424b-80b0-e06a141b54f3.json deleted file mode 100644 index dde7937ec..000000000 --- a/data/hfopenllm_v2/Infinirc/Infinirc-Llama3-8B-2G-Release-v1.0/ff7369dc-3ff2-424b-80b0-e06a141b54f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Infinirc_Infinirc-Llama3-8B-2G-Release-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinirc-Llama3-8B-2G-Release-v1.0", - "id": "Infinirc/Infinirc-Llama3-8B-2G-Release-v1.0", - "developer": "Infinirc", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4609 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.216 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Intel/neural-chat-7b-v3-1/a6dc7253-75fd-4897-be85-8ac89fc11f8e.json b/data/hfopenllm_v2/Intel/neural-chat-7b-v3-1/a6dc7253-75fd-4897-be85-8ac89fc11f8e.json deleted file mode 100644 index c13200f68..000000000 --- a/data/hfopenllm_v2/Intel/neural-chat-7b-v3-1/a6dc7253-75fd-4897-be85-8ac89fc11f8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Intel_neural-chat-7b-v3-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "neural-chat-7b-v3-1", - "id": "Intel/neural-chat-7b-v3-1", - "developer": "Intel", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4687 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5052 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2678 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Intel/neural-chat-7b-v3-2/296ceacc-542a-4000-bf9b-ae59b33a53ce.json b/data/hfopenllm_v2/Intel/neural-chat-7b-v3-2/296ceacc-542a-4000-bf9b-ae59b33a53ce.json deleted file mode 100644 index 050bc7fae..000000000 --- a/data/hfopenllm_v2/Intel/neural-chat-7b-v3-2/296ceacc-542a-4000-bf9b-ae59b33a53ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Intel_neural-chat-7b-v3-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "neural-chat-7b-v3-2", - "id": "Intel/neural-chat-7b-v3-2", - "developer": "Intel", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4988 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4895 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2667 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Intel/neural-chat-7b-v3-3/13870577-7579-48b4-9c92-202318ca6ecc.json b/data/hfopenllm_v2/Intel/neural-chat-7b-v3-3/13870577-7579-48b4-9c92-202318ca6ecc.json deleted file mode 100644 index f546afc25..000000000 --- a/data/hfopenllm_v2/Intel/neural-chat-7b-v3-3/13870577-7579-48b4-9c92-202318ca6ecc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Intel_neural-chat-7b-v3-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "neural-chat-7b-v3-3", - "id": "Intel/neural-chat-7b-v3-3", - "developer": "Intel", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4877 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2625 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Intel/neural-chat-7b-v3/6ebd2806-2623-4773-93bd-1036ff01cb8c.json b/data/hfopenllm_v2/Intel/neural-chat-7b-v3/6ebd2806-2623-4773-93bd-1036ff01cb8c.json deleted file mode 100644 index 937a77afe..000000000 --- a/data/hfopenllm_v2/Intel/neural-chat-7b-v3/6ebd2806-2623-4773-93bd-1036ff01cb8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Intel_neural-chat-7b-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "neural-chat-7b-v3", - "id": "Intel/neural-chat-7b-v3", - "developer": "Intel", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5048 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5055 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2699 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/IntervitensInc/internlm2_5-20b-llamafied/99d6a44b-d556-4674-8ade-a5b30cf99255.json b/data/hfopenllm_v2/IntervitensInc/internlm2_5-20b-llamafied/99d6a44b-d556-4674-8ade-a5b30cf99255.json deleted file mode 100644 index 9f01570fc..000000000 --- a/data/hfopenllm_v2/IntervitensInc/internlm2_5-20b-llamafied/99d6a44b-d556-4674-8ade-a5b30cf99255.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/IntervitensInc_internlm2_5-20b-llamafied/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm2_5-20b-llamafied", - "id": "IntervitensInc/internlm2_5-20b-llamafied", - "developer": "IntervitensInc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 19.861 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.341 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1715 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4051 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.5/605118a3-316a-46b5-9719-f596e361a2a8.json 
b/data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.5/605118a3-316a-46b5-9719-f596e361a2a8.json deleted file mode 100644 index cb284ea11..000000000 --- a/data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.5/605118a3-316a-46b5-9719-f596e361a2a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Invalid-Null_PeiYangMe-0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PeiYangMe-0.5", - "id": "Invalid-Null/PeiYangMe-0.5", - "developer": "Invalid-Null", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1409 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2791 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.7/271d2829-fbd4-438e-9f09-59539af68c8b.json b/data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.7/271d2829-fbd4-438e-9f09-59539af68c8b.json deleted file mode 100644 index b5d375fc5..000000000 --- a/data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.7/271d2829-fbd4-438e-9f09-59539af68c8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ 
- "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Invalid-Null_PeiYangMe-0.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PeiYangMe-0.7", - "id": "Invalid-Null/PeiYangMe-0.7", - "developer": "Invalid-Null", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1491 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2332 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1101 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/107bc549-75c1-4272-b567-f8ab9f6cd675.json b/data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/107bc549-75c1-4272-b567-f8ab9f6cd675.json deleted file mode 100644 index ffdfacf77..000000000 --- a/data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/107bc549-75c1-4272-b567-f8ab9f6cd675.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Isaak-Carter_JOSIEv4o-8b-stage1-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "JOSIEv4o-8b-stage1-v4", - "id": "Isaak-Carter/JOSIEv4o-8b-stage1-v4", - "developer": "Isaak-Carter", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4758 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3292 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/dfb451e9-c1c1-45a1-8082-155763366129.json b/data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/dfb451e9-c1c1-45a1-8082-155763366129.json deleted file mode 100644 index 9fc986679..000000000 --- a/data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/dfb451e9-c1c1-45a1-8082-155763366129.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Isaak-Carter_JOSIEv4o-8b-stage1-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "JOSIEv4o-8b-stage1-v4", - "id": "Isaak-Carter/JOSIEv4o-8b-stage1-v4", - 
"developer": "Isaak-Carter", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2553 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3654 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/b2d80977-d079-42ec-b057-5aac530b9d70.json b/data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/b2d80977-d079-42ec-b057-5aac530b9d70.json deleted file mode 100644 index 839f443ca..000000000 --- a/data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/b2d80977-d079-42ec-b057-5aac530b9d70.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Isaak-Carter_Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", - "id": "Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", - "developer": "Isaak-Carter", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7841 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5311 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated/16b33b80-3b4b-4edb-b89f-3d93dca8969c.json b/data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated/16b33b80-3b4b-4edb-b89f-3d93dca8969c.json deleted file mode 100644 index 7fb690341..000000000 --- a/data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated/16b33b80-3b4b-4edb-b89f-3d93dca8969c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Isaak-Carter_Josiefied-Qwen2.5-7B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-7B-Instruct-abliterated", - "id": "Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated", - "developer": "Isaak-Carter", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - 
"params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7317 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4276 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/J-LAB/Thynk_orpo/63c94e0a-4572-4b8a-bfe0-7f88bb847d7f.json b/data/hfopenllm_v2/J-LAB/Thynk_orpo/63c94e0a-4572-4b8a-bfe0-7f88bb847d7f.json deleted file mode 100644 index 9f7d195c4..000000000 --- a/data/hfopenllm_v2/J-LAB/Thynk_orpo/63c94e0a-4572-4b8a-bfe0-7f88bb847d7f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/J-LAB_Thynk_orpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Thynk_orpo", - "id": "J-LAB/Thynk_orpo", - "developer": "J-LAB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2102 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4463 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3231 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JackFram/llama-160m/538f2b43-328c-456d-8a40-ff2b37924453.json b/data/hfopenllm_v2/JackFram/llama-160m/538f2b43-328c-456d-8a40-ff2b37924453.json deleted file mode 100644 index 94fe29570..000000000 --- a/data/hfopenllm_v2/JackFram/llama-160m/538f2b43-328c-456d-8a40-ff2b37924453.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JackFram_llama-160m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-160m", - "id": "JackFram/llama-160m", - "developer": "JackFram", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.162 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2888 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JackFram/llama-68m/fb7a68e6-716e-48c6-96c0-d227735f9a7c.json b/data/hfopenllm_v2/JackFram/llama-68m/fb7a68e6-716e-48c6-96c0-d227735f9a7c.json deleted file mode 100644 index 620583267..000000000 --- a/data/hfopenllm_v2/JackFram/llama-68m/fb7a68e6-716e-48c6-96c0-d227735f9a7c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JackFram_llama-68m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-68m", - "id": "JackFram/llama-68m", - "developer": "JackFram", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.068 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1726 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jacoby746/Casual-Magnum-34B/3593d4b8-5602-4cca-935f-a76e342f060a.json b/data/hfopenllm_v2/Jacoby746/Casual-Magnum-34B/3593d4b8-5602-4cca-935f-a76e342f060a.json deleted file mode 100644 index b0ceabd5f..000000000 --- a/data/hfopenllm_v2/Jacoby746/Casual-Magnum-34B/3593d4b8-5602-4cca-935f-a76e342f060a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jacoby746_Casual-Magnum-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Casual-Magnum-34B", - "id": "Jacoby746/Casual-Magnum-34B", - "developer": "Jacoby746", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.193 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 
- } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4078 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.1-2x7B/72d503fc-b221-498e-811a-a806769175d6.json b/data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.1-2x7B/72d503fc-b221-498e-811a-a806769175d6.json deleted file mode 100644 index eeb08788f..000000000 --- a/data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.1-2x7B/72d503fc-b221-498e-811a-a806769175d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jacoby746_Inf-Silent-Kunoichi-v0.1-2x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Inf-Silent-Kunoichi-v0.1-2x7B", - "id": "Jacoby746/Inf-Silent-Kunoichi-v0.1-2x7B", - "developer": "Jacoby746", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3271 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.2-2x7B/ad7d9698-d9e6-4f2d-9767-987835626c8c.json b/data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.2-2x7B/ad7d9698-d9e6-4f2d-9767-987835626c8c.json deleted file mode 100644 index 1870540b9..000000000 --- a/data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.2-2x7B/ad7d9698-d9e6-4f2d-9767-987835626c8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jacoby746_Inf-Silent-Kunoichi-v0.2-2x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Inf-Silent-Kunoichi-v0.2-2x7B", - "id": "Jacoby746/Inf-Silent-Kunoichi-v0.2-2x7B", - "developer": "Jacoby746", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 
- } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jacoby746/Proto-Athena-4x7B/98899942-fcf0-41de-8587-44d7429bea47.json b/data/hfopenllm_v2/Jacoby746/Proto-Athena-4x7B/98899942-fcf0-41de-8587-44d7429bea47.json deleted file mode 100644 index 290b886ee..000000000 --- a/data/hfopenllm_v2/Jacoby746/Proto-Athena-4x7B/98899942-fcf0-41de-8587-44d7429bea47.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jacoby746_Proto-Athena-4x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Proto-Athena-4x7B", - "id": "Jacoby746/Proto-Athena-4x7B", - "developer": "Jacoby746", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3703 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4348 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3206 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jacoby746/Proto-Athena-v0.2-4x7B/bb51eb59-88f6-49c2-814a-11b2c80313d0.json b/data/hfopenllm_v2/Jacoby746/Proto-Athena-v0.2-4x7B/bb51eb59-88f6-49c2-814a-11b2c80313d0.json deleted file mode 100644 index 93635570d..000000000 --- a/data/hfopenllm_v2/Jacoby746/Proto-Athena-v0.2-4x7B/bb51eb59-88f6-49c2-814a-11b2c80313d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jacoby746_Proto-Athena-v0.2-4x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Proto-Athena-v0.2-4x7B", - "id": "Jacoby746/Proto-Athena-v0.2-4x7B", - "developer": "Jacoby746", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5068 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4213 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jacoby746/Proto-Harpy-Blazing-Light-v0.1-2x7B/d8563f36-e299-4186-a5dc-9dae51824e1f.json b/data/hfopenllm_v2/Jacoby746/Proto-Harpy-Blazing-Light-v0.1-2x7B/d8563f36-e299-4186-a5dc-9dae51824e1f.json deleted file mode 100644 index 38b06bee1..000000000 --- a/data/hfopenllm_v2/Jacoby746/Proto-Harpy-Blazing-Light-v0.1-2x7B/d8563f36-e299-4186-a5dc-9dae51824e1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jacoby746_Proto-Harpy-Blazing-Light-v0.1-2x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Proto-Harpy-Blazing-Light-v0.1-2x7B", - "id": "Jacoby746/Proto-Harpy-Blazing-Light-v0.1-2x7B", - "developer": "Jacoby746", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5187 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jacoby746/Proto-Harpy-Spark-v0.1-7B/43bc0528-7bc5-4eac-8848-c9995079450f.json b/data/hfopenllm_v2/Jacoby746/Proto-Harpy-Spark-v0.1-7B/43bc0528-7bc5-4eac-8848-c9995079450f.json deleted file mode 100644 index 220166198..000000000 --- a/data/hfopenllm_v2/Jacoby746/Proto-Harpy-Spark-v0.1-7B/43bc0528-7bc5-4eac-8848-c9995079450f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jacoby746_Proto-Harpy-Spark-v0.1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Proto-Harpy-Spark-v0.1-7B", - "id": "Jacoby746/Proto-Harpy-Spark-v0.1-7B", - "developer": "Jacoby746", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4333 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3069 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-1epoch/ce19893b-a7e1-4f8e-96f2-eb9cee2afeac.json b/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-1epoch/ce19893b-a7e1-4f8e-96f2-eb9cee2afeac.json deleted file mode 100644 index b932687e9..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-1epoch/ce19893b-a7e1-4f8e-96f2-eb9cee2afeac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen-0.5B-DPO-1epoch/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-0.5B-DPO-1epoch", - "id": "JayHyeon/Qwen-0.5B-DPO-1epoch", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2647 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3191 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1558 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-5epoch/24629e14-d197-4a5b-adff-7840af652f22.json b/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-5epoch/24629e14-d197-4a5b-adff-7840af652f22.json deleted file mode 100644 index ab62482c6..000000000 --- 
a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-5epoch/24629e14-d197-4a5b-adff-7840af652f22.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen-0.5B-DPO-5epoch/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-0.5B-DPO-5epoch", - "id": "JayHyeon/Qwen-0.5B-DPO-5epoch", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.257 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-1epoch/9c3ea35c-2cf7-4c31-8b83-c69df3cd9448.json b/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-1epoch/9c3ea35c-2cf7-4c31-8b83-c69df3cd9448.json deleted file mode 100644 index bd901d9fc..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-1epoch/9c3ea35c-2cf7-4c31-8b83-c69df3cd9448.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen-0.5B-IRPO-1epoch/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-0.5B-IRPO-1epoch", - "id": "JayHyeon/Qwen-0.5B-IRPO-1epoch", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2589 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3286 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.15 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-5epoch/46548403-6eb5-4f7a-874c-1327420f4cab.json b/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-5epoch/46548403-6eb5-4f7a-874c-1327420f4cab.json deleted file mode 100644 index 296d7cd98..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-5epoch/46548403-6eb5-4f7a-874c-1327420f4cab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen-0.5B-IRPO-5epoch/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "Qwen-0.5B-IRPO-5epoch", - "id": "JayHyeon/Qwen-0.5B-IRPO-5epoch", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1507 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-1epoch/0bd9c061-b7ee-4bc2-9deb-ea7eea012c49.json b/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-1epoch/0bd9c061-b7ee-4bc2-9deb-ea7eea012c49.json deleted file mode 100644 index e9415599f..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-1epoch/0bd9c061-b7ee-4bc2-9deb-ea7eea012c49.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen-0.5B-eDPO-1epoch/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-0.5B-eDPO-1epoch", - "id": "JayHyeon/Qwen-0.5B-eDPO-1epoch", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 
0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2623 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3327 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1553 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-5epoch/aa2fe858-111c-45e8-b0d4-0048d7fc7ef7.json b/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-5epoch/aa2fe858-111c-45e8-b0d4-0048d7fc7ef7.json deleted file mode 100644 index c6919cc60..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-5epoch/aa2fe858-111c-45e8-b0d4-0048d7fc7ef7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen-0.5B-eDPO-5epoch/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-0.5B-eDPO-5epoch", - "id": "JayHyeon/Qwen-0.5B-eDPO-5epoch", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1523 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1/ad03cae6-b126-4157-a225-9576e4d651d0.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1/ad03cae6-b126-4157-a225-9576e4d651d0.json deleted file mode 100644 index 4ad70853d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1/ad03cae6-b126-4157-a225-9576e4d651d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1", - "id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2469 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1575 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1/0d57b65d-3dd4-4185-b8cf-531105e94b5e.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1/0d57b65d-3dd4-4185-b8cf-531105e94b5e.json deleted file mode 100644 index b71ae80d9..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1/0d57b65d-3dd4-4185-b8cf-531105e94b5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1", - "id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1626 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1/f8882044-6e71-4788-b2ee-f51f85e67ecc.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1/f8882044-6e71-4788-b2ee-f51f85e67ecc.json deleted file mode 100644 index f890e44af..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1/f8882044-6e71-4788-b2ee-f51f85e67ecc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1", - "id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1576 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT/3c8f96c5-af91-4f41-a0b4-6e1b7d55d8ad.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT/3c8f96c5-af91-4f41-a0b4-6e1b7d55d8ad.json deleted file mode 100644 index 4ed5a1209..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT/3c8f96c5-af91-4f41-a0b4-6e1b7d55d8ad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-Instruct-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-Instruct-SFT", - "id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-2ep/e26743b9-4caf-46f8-bd5a-7e4445c850b1.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-2ep/e26743b9-4caf-46f8-bd5a-7e4445c850b1.json deleted file mode 100644 index a9d17a4e6..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-2ep/e26743b9-4caf-46f8-bd5a-7e4445c850b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-4-2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4-2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-3ep/febd4016-3a30-4b26-93e5-f7b556781b9b.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-3ep/febd4016-3a30-4b26-93e5-f7b556781b9b.json deleted file mode 100644 index 24b629e3d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-3ep/febd4016-3a30-4b26-93e5-f7b556781b9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-3ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-4-3ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4-3ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2257 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1532 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-5ep/ae82125e-94ac-48ca-8240-807e4b7ef9a0.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-5ep/ae82125e-94ac-48ca-8240-807e4b7ef9a0.json deleted file mode 100644 index 3a2866ea3..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-5ep/ae82125e-94ac-48ca-8240-807e4b7ef9a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-5ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-4-5ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4-5ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1987 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1558 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4/5321fa0b-b010-4e1d-9f20-a97b56f4f937.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4/5321fa0b-b010-4e1d-9f20-a97b56f4f937.json deleted file mode 100644 index 844a39283..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4/5321fa0b-b010-4e1d-9f20-a97b56f4f937.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-4", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.202 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1619 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-2ep/d25a4602-ea50-4a53-952c-112ba250123b.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-2ep/d25a4602-ea50-4a53-952c-112ba250123b.json deleted file mode 100644 index 4799c0aa3..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-2ep/d25a4602-ea50-4a53-952c-112ba250123b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-5-2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5-2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1971 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3225 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-3ep/232e3fc4-5cd2-4515-9e15-acd7d56bc34d.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-3ep/232e3fc4-5cd2-4515-9e15-acd7d56bc34d.json deleted file mode 100644 index 3305c120d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-3ep/232e3fc4-5cd2-4515-9e15-acd7d56bc34d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-3ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-5-3ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5-3ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2241 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1689 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-5ep/975f54fe-a581-4ce1-b0c1-7becb7605f09.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-5ep/975f54fe-a581-4ce1-b0c1-7becb7605f09.json deleted file mode 100644 index 925baa726..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-5ep/975f54fe-a581-4ce1-b0c1-7becb7605f09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-5ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-5-5ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5-5ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2292 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3259 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1688 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5/92ae4461-48bc-47fe-a3ad-ea4c3452d395.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5/92ae4461-48bc-47fe-a3ad-ea4c3452d395.json deleted file mode 100644 index 36e1434b4..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5/92ae4461-48bc-47fe-a3ad-ea4c3452d395.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-5", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1698 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-2ep/638e1cc0-9baf-4555-a278-4b21c46af86f.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-2ep/638e1cc0-9baf-4555-a278-4b21c46af86f.json deleted file mode 100644 index 
b6a076686..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-2ep/638e1cc0-9baf-4555-a278-4b21c46af86f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-4-2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4-2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1831 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3568 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1484 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-3ep/cef4161a-4e1c-4a92-bca8-b07f957a13b1.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-3ep/cef4161a-4e1c-4a92-bca8-b07f957a13b1.json deleted file mode 100644 index ae6218e48..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-3ep/cef4161a-4e1c-4a92-bca8-b07f957a13b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-3ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-4-3ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4-3ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.311 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1416 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-5ep/715b556b-2bc0-4864-b4b1-b7413a5d45bc.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-5ep/715b556b-2bc0-4864-b4b1-b7413a5d45bc.json deleted file mode 100644 index e5bad0533..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-5ep/715b556b-2bc0-4864-b4b1-b7413a5d45bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-5ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-4-5ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4-5ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1897 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1336 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4/7552ad5c-5d1f-478b-a931-036083b2954e.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4/7552ad5c-5d1f-478b-a931-036083b2954e.json deleted file mode 100644 index 43c9ef63c..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4/7552ad5c-5d1f-478b-a931-036083b2954e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-4", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4", - "developer": "JayHyeon", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2034 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam/7bb3ae9f-9bb3-4bf2-9d97-d7f4f30697ac.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam/7bb3ae9f-9bb3-4bf2-9d97-d7f4f30697ac.json deleted file mode 100644 index a44e27ca9..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam/7bb3ae9f-9bb3-4bf2-9d97-d7f4f30697ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2411 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam/821d67e5-da8d-4383-8825-3bfa72a91fc9.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam/821d67e5-da8d-4383-8825-3bfa72a91fc9.json deleted file mode 100644 index def874bd2..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam/821d67e5-da8d-4383-8825-3bfa72a91fc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } 
- }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2369 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam/c5bddcba-4a40-4fbb-93e8-aebd06a70a66.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam/c5bddcba-4a40-4fbb-93e8-aebd06a70a66.json deleted file mode 100644 index 48999ddc5..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam/c5bddcba-4a40-4fbb-93e8-aebd06a70a66.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": 
{ - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2262 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam/dc35237c-606d-4609-927a-566bea767312.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam/dc35237c-606d-4609-927a-566bea767312.json deleted file mode 100644 index 4dd0076c1..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam/dc35237c-606d-4609-927a-566bea767312.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1555 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam/3924d1af-e167-4186-a34b-d9b4b8c26d59.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam/3924d1af-e167-4186-a34b-d9b4b8c26d59.json deleted file mode 100644 index a8a95addb..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam/3924d1af-e167-4186-a34b-d9b4b8c26d59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.239 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.156 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam/f733c4cc-90fc-4b31-bed3-c57dba6d4b6a.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam/f733c4cc-90fc-4b31-bed3-c57dba6d4b6a.json deleted file mode 100644 index 8e804555f..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam/f733c4cc-90fc-4b31-bed3-c57dba6d4b6a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam/08f933a0-b096-4271-890e-0df7e20d1d20.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam/08f933a0-b096-4271-890e-0df7e20d1d20.json deleted file mode 100644 index 254d8727c..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam/08f933a0-b096-4271-890e-0df7e20d1d20.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2493 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam/8434e448-ed77-45f2-9c31-39128912f842.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam/8434e448-ed77-45f2-9c31-39128912f842.json deleted file mode 100644 index 2af5272ab..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam/8434e448-ed77-45f2-9c31-39128912f842.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": 
"BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam/d801037b-1eb0-4058-9096-429e5237e015.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam/d801037b-1eb0-4058-9096-429e5237e015.json deleted file mode 100644 index b5090ee5f..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam/d801037b-1eb0-4058-9096-429e5237e015.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2451 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam/e0c46f18-598e-402f-8955-68e71fab67cd.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam/e0c46f18-598e-402f-8955-68e71fab67cd.json deleted file mode 100644 index eb8631855..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam/e0c46f18-598e-402f-8955-68e71fab67cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2557 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1575 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam/4b987cb5-cf7c-4866-8cf0-9926f78c2de9.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam/4b987cb5-cf7c-4866-8cf0-9926f78c2de9.json deleted file mode 100644 index a7ae82558..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam/4b987cb5-cf7c-4866-8cf0-9926f78c2de9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2605 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1577 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam/ec658058-1075-4918-9dc9-fc79d0dcf897.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam/ec658058-1075-4918-9dc9-fc79d0dcf897.json deleted file mode 100644 index 090ea8c8f..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam/ec658058-1075-4918-9dc9-fc79d0dcf897.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2578 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3173 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1583 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam/b68baa86-3e1a-4888-98ba-2ecede79b4a7.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam/b68baa86-3e1a-4888-98ba-2ecede79b4a7.json deleted file mode 100644 index 0b37ef072..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam/b68baa86-3e1a-4888-98ba-2ecede79b4a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2335 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 
5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3276 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1581 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam/0b11c8ab-2cfa-425d-9d81-d999f94401db.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam/0b11c8ab-2cfa-425d-9d81-d999f94401db.json deleted file mode 100644 index 8cc0efa5c..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam/0b11c8ab-2cfa-425d-9d81-d999f94401db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2472 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3226 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" 
- }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1538 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam/a3e48db8-3679-4f19-853d-82a73ef49400.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam/a3e48db8-3679-4f19-853d-82a73ef49400.json deleted file mode 100644 index 9df394a48..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam/a3e48db8-3679-4f19-853d-82a73ef49400.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2474 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3229 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 
5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1539 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam/7dbf35b2-80c1-4181-80f9-850ea51cead2.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam/7dbf35b2-80c1-4181-80f9-850ea51cead2.json deleted file mode 100644 index 839bdc602..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam/7dbf35b2-80c1-4181-80f9-850ea51cead2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3245 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1573 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam/231f47db-1662-4313-9ff4-f32883f5615c.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam/231f47db-1662-4313-9ff4-f32883f5615c.json deleted file mode 100644 index 33c170571..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam/231f47db-1662-4313-9ff4-f32883f5615c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2368 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1516 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam/c79df898-14c6-4f00-9f65-0d01cd34ed61.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam/c79df898-14c6-4f00-9f65-0d01cd34ed61.json deleted file mode 100644 index f02cc4dee..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam/c79df898-14c6-4f00-9f65-0d01cd34ed61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2372 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.155 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam/2c52917f-c396-410d-bc78-c93c433797fc.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam/2c52917f-c396-410d-bc78-c93c433797fc.json deleted file mode 100644 index 1ff95edd2..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam/2c52917f-c396-410d-bc78-c93c433797fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2499 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam/0f1d2925-4e1c-495b-94be-f3515fbd53d7.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam/0f1d2925-4e1c-495b-94be-f3515fbd53d7.json deleted file mode 100644 index 69688309d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam/0f1d2925-4e1c-495b-94be-f3515fbd53d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3242 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam/5cbb1972-9895-4689-9f6f-7e0037829a78.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam/5cbb1972-9895-4689-9f6f-7e0037829a78.json deleted file mode 100644 index 98d643888..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam/5cbb1972-9895-4689-9f6f-7e0037829a78.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2421 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3225 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1496 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam/6bc42e37-1f31-47cb-97e4-9d0b28b53691.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam/6bc42e37-1f31-47cb-97e4-9d0b28b53691.json deleted file mode 100644 index 146ac953b..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam/6bc42e37-1f31-47cb-97e4-9d0b28b53691.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1499 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam/a1573b95-59e6-4ae0-bc12-6ef6fee90b76.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam/a1573b95-59e6-4ae0-bc12-6ef6fee90b76.json deleted file mode 100644 index 23710b611..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam/a1573b95-59e6-4ae0-bc12-6ef6fee90b76.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2526 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam/78c61b39-3c76-4af9-8d5e-fcd67d6c8779.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam/78c61b39-3c76-4af9-8d5e-fcd67d6c8779.json deleted file mode 100644 index f02becf07..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam/78c61b39-3c76-4af9-8d5e-fcd67d6c8779.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam/e4c06400-da86-4448-b421-23476f50bdb3.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam/e4c06400-da86-4448-b421-23476f50bdb3.json deleted file mode 100644 index 77c01e879..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam/e4c06400-da86-4448-b421-23476f50bdb3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2442 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1567 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam/48f4c2a7-e819-4789-92ea-e02c5e92d3e4.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam/48f4c2a7-e819-4789-92ea-e02c5e92d3e4.json deleted file mode 100644 index 4b4f5231f..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam/48f4c2a7-e819-4789-92ea-e02c5e92d3e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1567 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam/cd9cbbac-f1ca-4193-88cc-e5968cc1bb62.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam/cd9cbbac-f1ca-4193-88cc-e5968cc1bb62.json deleted file mode 100644 index d5f2c4e6c..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam/cd9cbbac-f1ca-4193-88cc-e5968cc1bb62.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3173 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1569 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam/ab3685ab-1795-4a0e-8ee4-4f509616d1b8.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam/ab3685ab-1795-4a0e-8ee4-4f509616d1b8.json deleted file mode 100644 index 03d43d960..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam/ab3685ab-1795-4a0e-8ee4-4f509616d1b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1566 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam/9018f443-a63f-4e07-b10b-272f66d1eb0d.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam/9018f443-a63f-4e07-b10b-272f66d1eb0d.json deleted file mode 100644 index b053ce369..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam/9018f443-a63f-4e07-b10b-272f66d1eb0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3211 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1571 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam/548d1536-b941-43a9-a60b-ae5448b70933.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam/548d1536-b941-43a9-a60b-ae5448b70933.json deleted file mode 100644 index 85329973f..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam/548d1536-b941-43a9-a60b-ae5448b70933.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2478 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam/99853109-17d9-46fa-a502-e4c977c1fb8f.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam/99853109-17d9-46fa-a502-e4c977c1fb8f.json deleted file mode 100644 index 76daaf914..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam/99853109-17d9-46fa-a502-e4c977c1fb8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3225 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1556 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam/e171a0a0-f46d-404f-84e8-539155284e17.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam/e171a0a0-f46d-404f-84e8-539155284e17.json deleted file mode 100644 index b9590f6bc..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam/e171a0a0-f46d-404f-84e8-539155284e17.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.259 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - } - ] -} \ No newline at end of 
file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam/eadd93e5-5770-4d4a-a1b2-6e732a82ce34.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam/eadd93e5-5770-4d4a-a1b2-6e732a82ce34.json deleted file mode 100644 index 70f5c52d6..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam/eadd93e5-5770-4d4a-a1b2-6e732a82ce34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2323 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3179 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam/151cb8c4-0a7d-4886-80ea-560902e1f932.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam/151cb8c4-0a7d-4886-80ea-560902e1f932.json deleted file mode 100644 index 6d9e81842..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam/151cb8c4-0a7d-4886-80ea-560902e1f932.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2315 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1521 - } - } - ] -} \ No newline at end of file diff --git 
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam/1acb97c4-a9d2-4ec8-9486-77eb6857646c.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam/1acb97c4-a9d2-4ec8-9486-77eb6857646c.json
deleted file mode 100644
index 95db25ea7..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam/1acb97c4-a9d2-4ec8-9486-77eb6857646c.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2298
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.332
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0431
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2659
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3329
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1567
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam/1d803ac5-3ca6-4cb0-bcd1-779eaea1562d.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam/1d803ac5-3ca6-4cb0-bcd1-779eaea1562d.json
deleted file mode 100644
index 5f441e414..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam/1d803ac5-3ca6-4cb0-bcd1-779eaea1562d.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2469
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3179
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0415
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2794
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3302
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1575
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam/81562e50-23c5-4ef1-b98c-b40625f3b8c6.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam/81562e50-23c5-4ef1-b98c-b40625f3b8c6.json
deleted file mode 100644
index 1cd8d7c85..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam/81562e50-23c5-4ef1-b98c-b40625f3b8c6.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.252
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3168
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.037
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2752
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3328
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1576
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam/95fa292a-ee64-4844-9646-ce3cc7f730d2.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam/95fa292a-ee64-4844-9646-ce3cc7f730d2.json
deleted file mode 100644
index 361a9000e..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam/95fa292a-ee64-4844-9646-ce3cc7f730d2.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2666
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3191
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0347
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2718
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3289
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1567
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam/4d14c584-b5a1-41cd-9605-78088dfebd7f.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam/4d14c584-b5a1-41cd-9605-78088dfebd7f.json
deleted file mode 100644
index a144afa1e..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam/4d14c584-b5a1-41cd-9605-78088dfebd7f.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2499
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3178
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.037
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2651
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3341
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1562
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam/1415d3d9-d7f8-48ef-8a2f-aa675c4c14db.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam/1415d3d9-d7f8-48ef-8a2f-aa675c4c14db.json
deleted file mode 100644
index fa1682646..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam/1415d3d9-d7f8-48ef-8a2f-aa675c4c14db.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2417
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3178
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.04
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2685
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3328
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1575
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam/4b0ab369-e72f-4229-b449-3a21ee9d2c95.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam/4b0ab369-e72f-4229-b449-3a21ee9d2c95.json
deleted file mode 100644
index cbb23bb4c..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam/4b0ab369-e72f-4229-b449-3a21ee9d2c95.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2562
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.319
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0423
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2659
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3341
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1576
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam/478b6c1f-3329-4c9b-9d90-59b8b551c1af.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam/478b6c1f-3329-4c9b-9d90-59b8b551c1af.json
deleted file mode 100644
index fc688f9c5..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam/478b6c1f-3329-4c9b-9d90-59b8b551c1af.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2408
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3165
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0431
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2735
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3315
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1557
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam/212f8dd2-3c61-45bd-a3de-2326334feb73.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam/212f8dd2-3c61-45bd-a3de-2326334feb73.json
deleted file mode 100644
index 4c5ef642e..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam/212f8dd2-3c61-45bd-a3de-2326334feb73.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2481
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3204
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0476
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.276
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3302
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1592
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam/9251282e-f72f-406e-a2cf-e7063516f624.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam/9251282e-f72f-406e-a2cf-e7063516f624.json
deleted file mode 100644
index 98dde13cb..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam/9251282e-f72f-406e-a2cf-e7063516f624.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2545
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3186
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0498
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2718
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3289
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1561
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam/91a3c739-7e16-4d21-8879-bb2fd4d4c6ad.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam/91a3c739-7e16-4d21-8879-bb2fd4d4c6ad.json
deleted file mode 100644
index b16eabb54..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam/91a3c739-7e16-4d21-8879-bb2fd4d4c6ad.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.252
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3204
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0393
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2727
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3262
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1538
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam/aaa78d8f-6050-4b5d-bb67-da6c9d1ee065.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam/aaa78d8f-6050-4b5d-bb67-da6c9d1ee065.json
deleted file mode 100644
index 2f7a799fb..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam/aaa78d8f-6050-4b5d-bb67-da6c9d1ee065.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2315
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3213
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0453
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2802
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3222
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1582
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam/1f0430fe-24ff-4ef6-8577-ee5bfa74f18b.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam/1f0430fe-24ff-4ef6-8577-ee5bfa74f18b.json
deleted file mode 100644
index 928372798..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam/1f0430fe-24ff-4ef6-8577-ee5bfa74f18b.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2515
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3187
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0431
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2718
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3289
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1539
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam/f374772b-2685-41e2-a455-9002e48e3739.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam/f374772b-2685-41e2-a455-9002e48e3739.json
deleted file mode 100644
index feb9ce218..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam/f374772b-2685-41e2-a455-9002e48e3739.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2472
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3213
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0347
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2727
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3262
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1588
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam/6db801f8-5253-47c0-b87e-6779bff42f6b.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam/6db801f8-5253-47c0-b87e-6779bff42f6b.json
deleted file mode 100644
index 70dc4e5f1..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam/6db801f8-5253-47c0-b87e-6779bff42f6b.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.246
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3234
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0378
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2794
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3302
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1533
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam/0d704671-c0b6-4296-85b5-eaf972d6be6a.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam/0d704671-c0b6-4296-85b5-eaf972d6be6a.json
deleted file mode 100644
index 7fff16277..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam/0d704671-c0b6-4296-85b5-eaf972d6be6a.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2524
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3256
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0536
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2777
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3368
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1531
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam/7e31545f-0865-4843-914b-a71f8a84314f.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam/7e31545f-0865-4843-914b-a71f8a84314f.json
deleted file mode 100644
index b0805fce1..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam/7e31545f-0865-4843-914b-a71f8a84314f.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2265
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3252
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0476
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2735
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3262
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1568
-      }
-    }
-  ]
-}
\ No newline at end of file
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam/431c7130-5a19-4a71-8a92-fea9726769ac.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam/431c7130-5a19-4a71-8a92-fea9726769ac.json deleted file mode 100644 index a3d21ddaa..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam/431c7130-5a19-4a71-8a92-fea9726769ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.15 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam/ca850c4a-14d0-4145-9977-0d33e6e3e362.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam/ca850c4a-14d0-4145-9977-0d33e6e3e362.json deleted file mode 100644 index 38c8a17c7..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam/ca850c4a-14d0-4145-9977-0d33e6e3e362.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2524 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3278 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1521 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam/7389caa3-6d8f-43e3-b3f2-d9320e56f621.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam/7389caa3-6d8f-43e3-b3f2-d9320e56f621.json deleted file mode 100644 index a5198f971..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam/7389caa3-6d8f-43e3-b3f2-d9320e56f621.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2658 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3175 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1575 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam/1e822b0f-0d80-4613-983b-ebd2e6fbfcd6.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam/1e822b0f-0d80-4613-983b-ebd2e6fbfcd6.json deleted file mode 100644 index 1707361ce..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam/1e822b0f-0d80-4613-983b-ebd2e6fbfcd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1595 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam/1206f592-e6f7-4e7d-83cd-cbe82b37ec58.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam/1206f592-e6f7-4e7d-83cd-cbe82b37ec58.json deleted file mode 100644 index 375805022..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam/1206f592-e6f7-4e7d-83cd-cbe82b37ec58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.256 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3159 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1562 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam/e4085c6a-bc16-4328-a724-4b9838b55faa.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam/e4085c6a-bc16-4328-a724-4b9838b55faa.json deleted file mode 100644 index 67bf5ff22..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam/e4085c6a-bc16-4328-a724-4b9838b55faa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2499 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam/b929b955-1fbb-43d0-add1-4d58fdc4097c.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam/b929b955-1fbb-43d0-add1-4d58fdc4097c.json deleted file mode 100644 index 0387f9f97..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam/b929b955-1fbb-43d0-add1-4d58fdc4097c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1567 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam/df723a0f-9a32-42f3-9421-780159f7d821.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam/df723a0f-9a32-42f3-9421-780159f7d821.json deleted file mode 100644 index 03229f6c3..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam/df723a0f-9a32-42f3-9421-780159f7d821.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1553 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep/c1046d2c-0b5b-4ab7-b173-8d5b5ecbc07d.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep/c1046d2c-0b5b-4ab7-b173-8d5b5ecbc07d.json deleted file mode 100644 index d40e539a7..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep/c1046d2c-0b5b-4ab7-b173-8d5b5ecbc07d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2201 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3217 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.171 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-3ep/60c02070-7554-4764-8a02-841ca75a0d5c.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-3ep/60c02070-7554-4764-8a02-841ca75a0d5c.json deleted file mode 100644 
index c536b41df..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-3ep/60c02070-7554-4764-8a02-841ca75a0d5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-3ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-3ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-3ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.324 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1746 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam/d243f226-149b-4824-837e-e80ab68bae9d.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam/d243f226-149b-4824-837e-e80ab68bae9d.json deleted file mode 100644 index a45e23e39..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam/d243f226-149b-4824-837e-e80ab68bae9d.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2526 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep/4f9361d0-2ad9-44da-a1d9-876d43451ae6.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep/4f9361d0-2ad9-44da-a1d9-876d43451ae6.json deleted file mode 100644 index d516e44ee..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep/4f9361d0-2ad9-44da-a1d9-876d43451ae6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3175 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1597 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep/6c6e9ebc-f83d-48d5-b69f-be43d4167a0e.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep/6c6e9ebc-f83d-48d5-b69f-be43d4167a0e.json deleted file mode 100644 index fc27247a4..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep/6c6e9ebc-f83d-48d5-b69f-be43d4167a0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2548 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam/7cd2c0da-15b8-4ad6-8cad-feb68631c079.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam/7cd2c0da-15b8-4ad6-8cad-feb68631c079.json deleted file mode 100644 index 60222a1b0..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam/7cd2c0da-15b8-4ad6-8cad-feb68631c079.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3219 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep/36b84cf2-d221-4e9a-b728-37dc2bf7e1d6.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep/36b84cf2-d221-4e9a-b728-37dc2bf7e1d6.json deleted file mode 100644 index 97123425b..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep/36b84cf2-d221-4e9a-b728-37dc2bf7e1d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2493 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3191 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1592 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep/1fd0d1db-1d75-4b10-bae8-33023c2c7466.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep/1fd0d1db-1d75-4b10-bae8-33023c2c7466.json deleted file mode 100644 index 591bed95d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep/1fd0d1db-1d75-4b10-bae8-33023c2c7466.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2478 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep/c6c02512-6c91-4818-a084-c48915fd83de.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep/c6c02512-6c91-4818-a084-c48915fd83de.json deleted file mode 100644 index 6bd7dcd08..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep/c6c02512-6c91-4818-a084-c48915fd83de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-5ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1695 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5/326affa2-9ea4-4fc9-b60f-d2abeb7493c3.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5/326affa2-9ea4-4fc9-b60f-d2abeb7493c3.json deleted file mode 100644 index 02a1ed3dd..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5/326affa2-9ea4-4fc9-b60f-d2abeb7493c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5", - "id": 
"JayHyeon/Qwen2.5-0.5B-SFT-2e-5", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2068 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3204 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1678 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-2ep/b3a190d1-5b86-4439-a21e-1f118239db82.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-2ep/b3a190d1-5b86-4439-a21e-1f118239db82.json deleted file mode 100644 index 3ce9e8679..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-2ep/b3a190d1-5b86-4439-a21e-1f118239db82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-5e-5-2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5-2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - 
}, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2175 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1627 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-3ep/b37a7db5-b26f-4a82-b27c-6c3a2ba72fda.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-3ep/b37a7db5-b26f-4a82-b27c-6c3a2ba72fda.json deleted file mode 100644 index ce21c0bef..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-3ep/b37a7db5-b26f-4a82-b27c-6c3a2ba72fda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-3ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-5e-5-3ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5-3ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-5ep/05a59445-b816-4982-9b1a-1c2394ffbaa9.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-5ep/05a59445-b816-4982-9b1a-1c2394ffbaa9.json deleted file mode 100644 index 8649fd379..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-5ep/05a59445-b816-4982-9b1a-1c2394ffbaa9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-5ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-5e-5-5ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5-5ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2077 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3766 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5/ff952579-e92d-4af8-9497-f49fed5efba0.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5/ff952579-e92d-4af8-9497-f49fed5efba0.json deleted file mode 100644 index c0f65b58b..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5/ff952579-e92d-4af8-9497-f49fed5efba0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-5e-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-5e-5", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.201 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-2ep/b541ede0-6de9-4557-8280-43567fd3dd96.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-2ep/b541ede0-6de9-4557-8280-43567fd3dd96.json deleted file mode 100644 index b702f2d51..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-2ep/b541ede0-6de9-4557-8280-43567fd3dd96.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-7e-5-2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5-2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2156 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1567 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-3ep/8514f601-0bb2-4639-90cc-29e96088e7de.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-3ep/8514f601-0bb2-4639-90cc-29e96088e7de.json deleted file mode 100644 index 686bff7aa..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-3ep/8514f601-0bb2-4639-90cc-29e96088e7de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-3ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-7e-5-3ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5-3ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1522 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-5ep/57e6d0cf-943a-4b83-a1f4-4f03b5066523.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-5ep/57e6d0cf-943a-4b83-a1f4-4f03b5066523.json deleted file mode 100644 index 138ef3aec..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-5ep/57e6d0cf-943a-4b83-a1f4-4f03b5066523.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-5ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-7e-5-5ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5-5ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.32 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1628 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5/ec205127-21c0-4edf-bb3a-ec8ccac4fcdb.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5/ec205127-21c0-4edf-bb3a-ec8ccac4fcdb.json deleted file mode 100644 index 0c78b2864..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5/ec205127-21c0-4edf-bb3a-ec8ccac4fcdb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-7e-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-7e-5", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2093 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3158 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-DPO-1epoch_v1/14b260e6-4300-43ec-b7af-587a2f5b03fb.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-DPO-1epoch_v1/14b260e6-4300-43ec-b7af-587a2f5b03fb.json deleted file mode 100644 index 0329a0fc0..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-DPO-1epoch_v1/14b260e6-4300-43ec-b7af-587a2f5b03fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-DPO-1epoch_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-DPO-1epoch_v1", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-DPO-1epoch_v1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2025 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3268 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.133 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-MDPO-1epoch_v1/53de1fc9-7097-4103-b731-588a7bf39f80.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-MDPO-1epoch_v1/53de1fc9-7097-4103-b731-588a7bf39f80.json deleted file mode 100644 index 2d3c81041..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-MDPO-1epoch_v1/53de1fc9-7097-4103-b731-588a7bf39f80.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-MDPO-1epoch_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-MDPO-1epoch_v1", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-MDPO-1epoch_v1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - 
} - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT/1a1031c5-3ec2-4d12-93eb-e0a3b0448ed4.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT/1a1031c5-3ec2-4d12-93eb-e0a3b0448ed4.json deleted file mode 100644 index 2c42031ee..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT/1a1031c5-3ec2-4d12-93eb-e0a3b0448ed4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT", - "id": "JayHyeon/Qwen2.5-0.5B-SFT", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1673 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam/51b62d59-f39c-49ca-af0a-73df6440e29d.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam/51b62d59-f39c-49ca-af0a-73df6440e29d.json deleted file mode 100644 index b18776596..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam/51b62d59-f39c-49ca-af0a-73df6440e29d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1566 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam/622a0ae1-0eb5-49f0-bc44-d396c7233e27.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam/622a0ae1-0eb5-49f0-bc44-d396c7233e27.json deleted file mode 100644 index 8a1f01917..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam/622a0ae1-0eb5-49f0-bc44-d396c7233e27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.267 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam/71291a41-283e-42ca-b192-7b759e3c3712.json 
b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam/71291a41-283e-42ca-b192-7b759e3c3712.json deleted file mode 100644 index 9c3e33edf..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam/71291a41-283e-42ca-b192-7b759e3c3712.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3261 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1565 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam/7e504fef-b304-4c1a-856d-06e56a8869d7.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam/7e504fef-b304-4c1a-856d-06e56a8869d7.json deleted file mode 100644 index 1841fefbe..000000000 --- 
a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam/7e504fef-b304-4c1a-856d-06e56a8869d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1503 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam/f8258f5e-8826-4fe1-b9d3-61708e79d4ab.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam/f8258f5e-8826-4fe1-b9d3-61708e79d4ab.json deleted file mode 100644 index 0f50c78cd..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam/f8258f5e-8826-4fe1-b9d3-61708e79d4ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2471 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam/099ce031-1e11-4a07-bac1-03bef9b915d6.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam/099ce031-1e11-4a07-bac1-03bef9b915d6.json deleted file mode 100644 index 0b464aa1e..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam/099ce031-1e11-4a07-bac1-03bef9b915d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": 
"HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1565 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam/75ff25fd-e5f7-4380-b192-cbc8a8ee95aa.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam/75ff25fd-e5f7-4380-b192-cbc8a8ee95aa.json deleted file mode 100644 index c6cea6b5c..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam/75ff25fd-e5f7-4380-b192-cbc8a8ee95aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2551 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1567 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam/cbc43c7a-d8ac-4b03-a383-703f7fa51757.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam/cbc43c7a-d8ac-4b03-a383-703f7fa51757.json deleted file mode 100644 index 6baf33d2a..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam/cbc43c7a-d8ac-4b03-a383-703f7fa51757.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2538 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3153 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3261 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1583 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam/72d7f252-1bff-40ad-9ec8-1ac2a2e02a8e.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam/72d7f252-1bff-40ad-9ec8-1ac2a2e02a8e.json deleted file mode 100644 index b8795c78b..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam/72d7f252-1bff-40ad-9ec8-1ac2a2e02a8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2402 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3168 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1568 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam/5eb10878-11e6-43ad-9bb5-658a3495129c.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam/5eb10878-11e6-43ad-9bb5-658a3495129c.json deleted file mode 100644 index 6f4423d45..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam/5eb10878-11e6-43ad-9bb5-658a3495129c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2484 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3211 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1573 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam/23b29cd4-cfd0-49f1-8959-c3aa8be9722f.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam/23b29cd4-cfd0-49f1-8959-c3aa8be9722f.json deleted file mode 100644 index e97c3a4f4..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam/23b29cd4-cfd0-49f1-8959-c3aa8be9722f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2578 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1583 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-6-3ep_0alp_0lam/03db2532-f8e0-41e9-ac0c-ff2913f4b12a.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-6-3ep_0alp_0lam/03db2532-f8e0-41e9-ac0c-ff2913f4b12a.json deleted file mode 100644 index 5542366cc..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-6-3ep_0alp_0lam/03db2532-f8e0-41e9-ac0c-ff2913f4b12a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_1e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_1e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_1e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2316 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3258 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-7-3ep_0alp_0lam/273f0d50-aa4e-4469-8360-2ce0a2e1a850.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-7-3ep_0alp_0lam/273f0d50-aa4e-4469-8360-2ce0a2e1a850.json deleted file mode 100644 index fcfa337b4..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-7-3ep_0alp_0lam/273f0d50-aa4e-4469-8360-2ce0a2e1a850.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_1e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_1e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_1e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.236 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3225 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-1ep_0alp_0lam/79a48e79-d59b-4f86-a8f4-3af174a9ee0b.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-1ep_0alp_0lam/79a48e79-d59b-4f86-a8f4-3af174a9ee0b.json deleted file mode 100644 index 8cad12a77..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-1ep_0alp_0lam/79a48e79-d59b-4f86-a8f4-3af174a9ee0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_3e-6-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_3e-6-1ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_3e-6-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2337 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3132 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-2ep_0alp_0lam/9da9a0e6-257a-41f6-b3a3-e3279a4924db.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-2ep_0alp_0lam/9da9a0e6-257a-41f6-b3a3-e3279a4924db.json deleted file mode 100644 index 8fcff6407..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-2ep_0alp_0lam/9da9a0e6-257a-41f6-b3a3-e3279a4924db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_3e-6-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_3e-6-2ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_3e-6-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2569 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1565 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-3ep_0alp_0lam/dfed058c-48b2-4e1e-9a29-624771e3e9dd.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-3ep_0alp_0lam/dfed058c-48b2-4e1e-9a29-624771e3e9dd.json deleted file mode 100644 index 1408d7de5..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-3ep_0alp_0lam/dfed058c-48b2-4e1e-9a29-624771e3e9dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_3e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_3e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_3e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.246 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3267 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-1ep_0alp_0lam/bcb53a8a-1670-400c-aab6-bd8ed2ebcdf4.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-1ep_0alp_0lam/bcb53a8a-1670-400c-aab6-bd8ed2ebcdf4.json deleted file mode 100644 index 2ee2160cf..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-1ep_0alp_0lam/bcb53a8a-1670-400c-aab6-bd8ed2ebcdf4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_3e-7-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_3e-7-1ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_3e-7-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3229 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1597 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-2ep_0alp_0lam/8438a108-0d5d-48b6-b73a-981d13329daa.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-2ep_0alp_0lam/8438a108-0d5d-48b6-b73a-981d13329daa.json deleted file mode 100644 index 404b429a1..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-2ep_0alp_0lam/8438a108-0d5d-48b6-b73a-981d13329daa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_3e-7-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_3e-7-2ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_3e-7-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-3ep_0alp_0lam/88616292-1e38-4481-af30-6b60e28fb097.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-3ep_0alp_0lam/88616292-1e38-4481-af30-6b60e28fb097.json deleted file mode 100644 index 6805ba8e2..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-3ep_0alp_0lam/88616292-1e38-4481-af30-6b60e28fb097.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_3e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_3e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_3e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2387 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3258 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3169 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-1ep_0alp_0lam/44094907-0b09-4706-a117-116a7e10a6e5.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-1ep_0alp_0lam/44094907-0b09-4706-a117-116a7e10a6e5.json deleted file mode 100644 index 7ed0e9876..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-1ep_0alp_0lam/44094907-0b09-4706-a117-116a7e10a6e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_5e-7-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_5e-7-1ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_5e-7-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1593 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-2ep_0alp_0lam/d19e8078-87e9-4760-9b91-6b5f478820e1.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-2ep_0alp_0lam/d19e8078-87e9-4760-9b91-6b5f478820e1.json deleted file mode 100644 index 97e1295f7..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-2ep_0alp_0lam/d19e8078-87e9-4760-9b91-6b5f478820e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_5e-7-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_5e-7-2ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_5e-7-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2456 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1602 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-3ep_0alp_0lam/896464f1-01bc-4370-8d90-3368323b2908.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-3ep_0alp_0lam/896464f1-01bc-4370-8d90-3368323b2908.json deleted file mode 100644 index b8bf7d1e7..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-3ep_0alp_0lam/896464f1-01bc-4370-8d90-3368323b2908.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1595 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-1ep_0alp_0lam/9889f0b9-9051-485c-bd44-32b1e56b865c.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-1ep_0alp_0lam/9889f0b9-9051-485c-bd44-32b1e56b865c.json deleted file mode 100644 index 71c059874..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-1ep_0alp_0lam/9889f0b9-9051-485c-bd44-32b1e56b865c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IPO_5e-7-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IPO_5e-7-1ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-IPO_5e-7-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2574 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3279 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3169 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-3ep_0alp_0lam/6563ce79-6df4-4c78-89e2-064f1250d898.json 
b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-3ep_0alp_0lam/6563ce79-6df4-4c78-89e2-064f1250d898.json deleted file mode 100644 index 04b6f52a6..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-3ep_0alp_0lam/6563ce79-6df4-4c78-89e2-064f1250d898.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IPO_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IPO_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-IPO_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3072 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3264 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1624 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam/b1778755-e6e6-47e2-925d-44d786c4ff62.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam/b1778755-e6e6-47e2-925d-44d786c4ff62.json deleted file mode 100644 index 0810520c3..000000000 --- 
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam",
-    "id": "JayHyeon/Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2551
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3242
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0468
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2668
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3182
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1574
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam/3ae923b8-e9f4-472e-8d5e-54fa5f42ce01.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam/3ae923b8-e9f4-472e-8d5e-54fa5f42ce01.json
deleted file mode 100644
index c28c2cd0f..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam/3ae923b8-e9f4-472e-8d5e-54fa5f42ce01.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam/1770682486.623709",
"evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam/40831e23-0a9e-4bdc-a365-9399b6b82ff9.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam/40831e23-0a9e-4bdc-a365-9399b6b82ff9.json deleted file mode 100644 index 69728b3a1..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam/40831e23-0a9e-4bdc-a365-9399b6b82ff9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2323 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3169 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1612 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam/4a60fa82-34dc-4b0c-9102-65adac5039e4.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam/4a60fa82-34dc-4b0c-9102-65adac5039e4.json deleted file mode 100644 index 0bf9d0574..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam/4a60fa82-34dc-4b0c-9102-65adac5039e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2414 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1532 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam/75ff2c43-dd19-48ae-9ba3-f99cdbadda1c.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam/75ff2c43-dd19-48ae-9ba3-f99cdbadda1c.json deleted file mode 100644 index 493b1d463..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam/75ff2c43-dd19-48ae-9ba3-f99cdbadda1c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2678 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3362 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam/d7962833-660a-4b9b-9836-8a2f3251f38e.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam/d7962833-660a-4b9b-9836-8a2f3251f38e.json deleted file mode 100644 index 0290bbad0..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam/d7962833-660a-4b9b-9836-8a2f3251f38e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3231 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam/ad8ecabf-a868-496e-892b-582efb54fa6a.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam/ad8ecabf-a868-496e-892b-582efb54fa6a.json deleted file mode 100644 index 5edc518ea..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam/ad8ecabf-a868-496e-892b-582efb54fa6a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2639 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3257 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam/49f25d3d-80c9-4723-8fa9-1501d44d70aa.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam/49f25d3d-80c9-4723-8fa9-1501d44d70aa.json deleted file mode 100644 index 2d9b9aac9..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam/49f25d3d-80c9-4723-8fa9-1501d44d70aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2518 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3214 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3169 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1585 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam/70ea520c-3e0c-4412-9dbe-40a00801335c.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam/70ea520c-3e0c-4412-9dbe-40a00801335c.json deleted file mode 100644 index 47326edc5..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam/70ea520c-3e0c-4412-9dbe-40a00801335c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2438 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1554 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam/8e7f8bad-812b-4f6c-8dea-1cf44584c300.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam/8e7f8bad-812b-4f6c-8dea-1cf44584c300.json deleted file mode 100644 index 2ecc04e03..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam/8e7f8bad-812b-4f6c-8dea-1cf44584c300.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2465 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3246 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam/3b39a8f0-c5ba-4f74-9d27-bf5b389e038c.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam/3b39a8f0-c5ba-4f74-9d27-bf5b389e038c.json deleted file mode 100644 index fdc8cfb2c..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam/3b39a8f0-c5ba-4f74-9d27-bf5b389e038c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2506 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3261 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1522 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam/702a14d5-a7fd-4926-ab26-e4c3b7f5eda7.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam/702a14d5-a7fd-4926-ab26-e4c3b7f5eda7.json deleted file mode 100644 index ed6db3b8d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam/702a14d5-a7fd-4926-ab26-e4c3b7f5eda7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0347
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2634
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3315
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1566
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam/20e5d087-7b20-4a39-81da-7334354b61f0.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam/20e5d087-7b20-4a39-81da-7334354b61f0.json
deleted file mode 100644
index 2b13ae37e..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam/20e5d087-7b20-4a39-81da-7334354b61f0.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam",
-    "id": "JayHyeon/Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2454
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3216
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0506
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1544 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam/4c5a769c-0472-402c-8e97-d24e5b302bac.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam/4c5a769c-0472-402c-8e97-d24e5b302bac.json deleted file mode 100644 index 455e40c44..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam/4c5a769c-0472-402c-8e97-d24e5b302bac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2342 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam/96166735-ed03-4931-81c9-d3daed1913d9.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam/96166735-ed03-4931-81c9-d3daed1913d9.json deleted file mode 100644 index 4438ad84e..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam/96166735-ed03-4931-81c9-d3daed1913d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.232 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3234 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam/06d9b1e3-d054-4fa5-bf1f-9d6149e5111c.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam/06d9b1e3-d054-4fa5-bf1f-9d6149e5111c.json deleted file mode 100644 index 9827aba86..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam/06d9b1e3-d054-4fa5-bf1f-9d6149e5111c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2418 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3175 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam/776fd8d8-9846-4359-97d4-2340425d1315.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam/776fd8d8-9846-4359-97d4-2340425d1315.json deleted file mode 100644 index 9a8c26353..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam/776fd8d8-9846-4359-97d4-2340425d1315.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2493 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3197 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": 
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3315
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1571
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam/197ae1c5-c9b1-4912-91a3-8ccacddc1be6.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam/197ae1c5-c9b1-4912-91a3-8ccacddc1be6.json
deleted file mode 100644
index ecb5d143e..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam/197ae1c5-c9b1-4912-91a3-8ccacddc1be6.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam",
-    "id": "JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.252
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3198
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0423
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2634
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3262
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam/1fffd3d9-1c6b-4965-84e6-980bb0a13af3.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam/1fffd3d9-1c6b-4965-84e6-980bb0a13af3.json deleted file mode 100644 index 0e4125637..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam/1fffd3d9-1c6b-4965-84e6-980bb0a13af3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.258 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1539 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam/57e8aaf0-f10b-4024-9f93-7b7f13f3ab10.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam/57e8aaf0-f10b-4024-9f93-7b7f13f3ab10.json deleted file mode 100644 index a66e2fd4a..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam/57e8aaf0-f10b-4024-9f93-7b7f13f3ab10.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.232 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam/304d5bee-df2d-40fc-b4a0-e3d99178f4bd.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam/304d5bee-df2d-40fc-b4a0-e3d99178f4bd.json deleted file mode 100644 index a651a40c3..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam/304d5bee-df2d-40fc-b4a0-e3d99178f4bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3273 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1531 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam/6126d30d-e2dd-4b8b-9cb3-acdc76084bbb.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam/6126d30d-e2dd-4b8b-9cb3-acdc76084bbb.json deleted file mode 100644 index 19c99bb89..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam/6126d30d-e2dd-4b8b-9cb3-acdc76084bbb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2524 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.313 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1564 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam/fc7284d9-a73f-4562-a781-5cb87247183f.json 
b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam/fc7284d9-a73f-4562-a781-5cb87247183f.json deleted file mode 100644 index 4c3d00e8d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam/fc7284d9-a73f-4562-a781-5cb87247183f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2514 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1538 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam/26ab447c-a850-4197-983a-a0dca4532029.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam/26ab447c-a850-4197-983a-a0dca4532029.json deleted file mode 100644 index 
c06a3ff06..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam/26ab447c-a850-4197-983a-a0dca4532029.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam/ee9e2131-aa99-49e1-9814-f0664614354b.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam/ee9e2131-aa99-49e1-9814-f0664614354b.json deleted file mode 100644 index 56da94f33..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam/ee9e2131-aa99-49e1-9814-f0664614354b.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_3e-6-1ep_3vpo_const/23c472f7-f060-4a69-8f72-12490675825a.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_3e-6-1ep_3vpo_const/23c472f7-f060-4a69-8f72-12490675825a.json deleted file mode 100644 index 070354a8d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_3e-6-1ep_3vpo_const/23c472f7-f060-4a69-8f72-12490675825a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_3e-6-1ep_3vpo_const/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_3e-6-1ep_3vpo_const", - "id": "JayHyeon/Qwen_0.5-VDPO_3e-6-1ep_3vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3174 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1558 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam/04172bef-c06b-4c08-b2af-9e1fe4d97664.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam/04172bef-c06b-4c08-b2af-9e1fe4d97664.json deleted file mode 100644 index c722b726e..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam/04172bef-c06b-4c08-b2af-9e1fe4d97664.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1595 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_10vpo_const/3436355a-d2fe-411f-a764-4cb8284deb4c.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_10vpo_const/3436355a-d2fe-411f-a764-4cb8284deb4c.json deleted file mode 100644 index 7ac21a2a4..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_10vpo_const/3436355a-d2fe-411f-a764-4cb8284deb4c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_10vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_5e-7-1ep_10vpo_const", - "id": 
"JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_10vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2536 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3234 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1597 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_1vpo_const/265655c0-2ead-4dd7-8c7e-4bee69d51bce.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_1vpo_const/265655c0-2ead-4dd7-8c7e-4bee69d51bce.json deleted file mode 100644 index d1101d0a7..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_1vpo_const/265655c0-2ead-4dd7-8c7e-4bee69d51bce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_1vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_5e-7-1ep_1vpo_const", - "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_1vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2448 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.324 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_3vpo_const/645cae82-9e7b-4d1b-b944-e3783089c1c1.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_3vpo_const/645cae82-9e7b-4d1b-b944-e3783089c1c1.json deleted file mode 100644 index f0b7804e2..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_3vpo_const/645cae82-9e7b-4d1b-b944-e3783089c1c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_3vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_5e-7-1ep_3vpo_const", - "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_3vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3227 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam/ab658117-7c6b-428f-8f60-bf88a1d8a5bc.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam/ab658117-7c6b-428f-8f60-bf88a1d8a5bc.json deleted file mode 100644 index a26877761..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam/ab658117-7c6b-428f-8f60-bf88a1d8a5bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2472 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_1vpo_const/03c4b5ce-3b22-4d9f-bf60-b626b52a114b.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_1vpo_const/03c4b5ce-3b22-4d9f-bf60-b626b52a114b.json deleted file mode 100644 index 5b2d375d1..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_1vpo_const/03c4b5ce-3b22-4d9f-bf60-b626b52a114b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_1vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_5e-7-3ep_1vpo_const", - "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_1vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2417 - } - }, - { - "evaluation_name": "BBH", - "source_data": { 
- "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_3vpo_const/ce7e3a31-c65b-4521-b685-fcbd067c75d9.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_3vpo_const/ce7e3a31-c65b-4521-b685-fcbd067c75d9.json deleted file mode 100644 index 271b74801..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_3vpo_const/ce7e3a31-c65b-4521-b685-fcbd067c75d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_3vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_5e-7-3ep_3vpo_const", - "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_3vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam/adb53e2c-5dee-4840-8eae-e0186c6e103f.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam/adb53e2c-5dee-4840-8eae-e0186c6e103f.json deleted file mode 100644 index 67643af45..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam/adb53e2c-5dee-4840-8eae-e0186c6e103f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2669 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3168 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1634 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_10vpo_const/ba89563d-f53a-4bf0-91e1-92ac950523d8.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_10vpo_const/ba89563d-f53a-4bf0-91e1-92ac950523d8.json deleted file mode 100644 index fc2edfe9a..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_10vpo_const/ba89563d-f53a-4bf0-91e1-92ac950523d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_10vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-1ep_10vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_10vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2702 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1635 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_1vpo_const/3fc0ad8d-4bb2-401a-9baf-b94b39b7e1aa.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_1vpo_const/3fc0ad8d-4bb2-401a-9baf-b94b39b7e1aa.json deleted file mode 100644 index 21e635518..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_1vpo_const/3fc0ad8d-4bb2-401a-9baf-b94b39b7e1aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_1vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-1ep_1vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_1vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.248 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3309 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_30vpo_const/ed816bcb-bbe9-48ae-a6ac-3603779a985f.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_30vpo_const/ed816bcb-bbe9-48ae-a6ac-3603779a985f.json deleted file mode 100644 index a5f23395a..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_30vpo_const/ed816bcb-bbe9-48ae-a6ac-3603779a985f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_30vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-1ep_30vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_30vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2622 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", 
- "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1634 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_3vpo_const/f347ed24-066a-4cba-8478-f03628cb2b5b.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_3vpo_const/f347ed24-066a-4cba-8478-f03628cb2b5b.json deleted file mode 100644 index 1a374d513..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_3vpo_const/f347ed24-066a-4cba-8478-f03628cb2b5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_3vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-1ep_3vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_3vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3168 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam/ffddfea0-d17e-44e7-8931-a9601e9cb26b.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam/ffddfea0-d17e-44e7-8931-a9601e9cb26b.json deleted file mode 100644 index bf35e8693..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam/ffddfea0-d17e-44e7-8931-a9601e9cb26b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.293 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.322 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3116 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1591 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_10vpo_const/ec351fa1-78c2-48c6-83f0-7c2a9b2f0731.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_10vpo_const/ec351fa1-78c2-48c6-83f0-7c2a9b2f0731.json deleted file mode 100644 index 45a06565e..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_10vpo_const/ec351fa1-78c2-48c6-83f0-7c2a9b2f0731.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_10vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-3ep_10vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_10vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2881 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3102 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1582 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_1vpo_const/a0038c34-130b-49dc-a93f-94706a3dad50.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_1vpo_const/a0038c34-130b-49dc-a93f-94706a3dad50.json deleted file mode 100644 index 8033e9669..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_1vpo_const/a0038c34-130b-49dc-a93f-94706a3dad50.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_1vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-3ep_1vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_1vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2887 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3237 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_30vpo_const/cbd5ea42-1e5b-4984-bdcf-e60fbfb9d692.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_30vpo_const/cbd5ea42-1e5b-4984-bdcf-e60fbfb9d692.json deleted file mode 100644 index 85441076b..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_30vpo_const/cbd5ea42-1e5b-4984-bdcf-e60fbfb9d692.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_30vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-3ep_30vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_30vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_3vpo_const/b902e2b2-a0b3-4467-b076-b98717c40d74.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_3vpo_const/b902e2b2-a0b3-4467-b076-b98717c40d74.json deleted file mode 100644 index cb1e7dc31..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_3vpo_const/b902e2b2-a0b3-4467-b076-b98717c40d74.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_3vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-3ep_3vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_3vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1592 - } - 
} - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1/4c749665-59ff-49df-a193-0262f66e6003.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1/4c749665-59ff-49df-a193-0262f66e6003.json deleted file mode 100644 index 7f560d07a..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1/4c749665-59ff-49df-a193-0262f66e6003.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1", - "id": "JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3244 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1573 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3/c99899c6-95e1-4dea-ac12-f8df49728a3b.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3/c99899c6-95e1-4dea-ac12-f8df49728a3b.json deleted file mode 100644 index 52430793e..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3/c99899c6-95e1-4dea-ac12-f8df49728a3b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3", - "id": "JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1567 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1/13deca9f-073e-444b-bf79-35e816f7c312.json 
b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1/13deca9f-073e-444b-bf79-35e816f7c312.json deleted file mode 100644 index d31366b10..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1/13deca9f-073e-444b-bf79-35e816f7c312.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1", - "id": "JayHyeon/Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3278 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3022 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1496 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1/c8adc0a5-f4bf-4f88-984c-aba506eae6a9.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1/c8adc0a5-f4bf-4f88-984c-aba506eae6a9.json deleted file mode 100644 index 
9fb481bab..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1/c8adc0a5-f4bf-4f88-984c-aba506eae6a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1", - "id": "JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3253 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3/b146daaf-ce1f-4520-bc19-21ce8679b220.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3/b146daaf-ce1f-4520-bc19-21ce8679b220.json deleted file mode 100644 index 345fd539c..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3/b146daaf-ce1f-4520-bc19-21ce8679b220.json +++ /dev/null 
@@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3", - "id": "JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2739 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3245 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1597 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jimmy19991222/Llama-3-Instruct-8B-SimPO-v0.2/45e1d037-1ed0-472c-a311-c651fde270fc.json b/data/hfopenllm_v2/Jimmy19991222/Llama-3-Instruct-8B-SimPO-v0.2/45e1d037-1ed0-472c-a311-c651fde270fc.json deleted file mode 100644 index 36efeda0f..000000000 --- a/data/hfopenllm_v2/Jimmy19991222/Llama-3-Instruct-8B-SimPO-v0.2/45e1d037-1ed0-472c-a311-c651fde270fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jimmy19991222_Llama-3-Instruct-8B-SimPO-v0.2/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SimPO-v0.2", - "id": "Jimmy19991222/Llama-3-Instruct-8B-SimPO-v0.2", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.654 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4013 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun/3f4ce54a-01f3-4c23-a4ba-22d47e0344dc.json b/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun/3f4ce54a-01f3-4c23-a4ba-22d47e0344dc.json deleted file mode 100644 index ea821d05a..000000000 --- a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun/3f4ce54a-01f3-4c23-a4ba-22d47e0344dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun", - "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6717 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4041 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3634 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log/470d52be-9dbd-4714-b004-f65cc82d245f.json b/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log/470d52be-9dbd-4714-b004-f65cc82d245f.json deleted file mode 100644 index 49a213fd9..000000000 --- 
a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log/470d52be-9dbd-4714-b004-f65cc82d245f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log", - "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6556 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log/c836fd05-1969-439c-91e1-fd0cab816f6c.json 
b/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log/c836fd05-1969-439c-91e1-fd0cab816f6c.json deleted file mode 100644 index 539534c00..000000000 --- a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log/c836fd05-1969-439c-91e1-fd0cab816f6c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log", - "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6315 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4916 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3611 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4/14774c6b-eb03-4abc-92df-1e7a196ca8a4.json b/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4/14774c6b-eb03-4abc-92df-1e7a196ca8a4.json deleted file mode 100644 index 80d5fde9f..000000000 --- a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4/14774c6b-eb03-4abc-92df-1e7a196ca8a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4", - "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6285 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4986 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4014 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3545 - 
} - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun/5293ae0c-8022-44d4-b2f5-4f5390dff93e.json b/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun/5293ae0c-8022-44d4-b2f5-4f5390dff93e.json deleted file mode 100644 index ce4d8a93a..000000000 --- a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun/5293ae0c-8022-44d4-b2f5-4f5390dff93e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun", - "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6678 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3987 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log/9020f91f-a8f0-447d-af68-247aa81a25c6.json b/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log/9020f91f-a8f0-447d-af68-247aa81a25c6.json deleted file mode 100644 index 4e211ab7c..000000000 --- a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log/9020f91f-a8f0-447d-af68-247aa81a25c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log", - "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6605 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4916 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log/0cd6837a-8c3f-4529-9ea0-8755e1725467.json b/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log/0cd6837a-8c3f-4529-9ea0-8755e1725467.json deleted file mode 100644 index b0cd90258..000000000 --- a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log/0cd6837a-8c3f-4529-9ea0-8755e1725467.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log", - "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6492 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4952 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3961 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Joseph717171/Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32/7cb17011-cf77-4e86-b67f-84e6ff4b8086.json b/data/hfopenllm_v2/Joseph717171/Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32/7cb17011-cf77-4e86-b67f-84e6ff4b8086.json deleted file mode 100644 index 25c8b3965..000000000 --- a/data/hfopenllm_v2/Joseph717171/Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32/7cb17011-cf77-4e86-b67f-84e6ff4b8086.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Joseph717171_Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32", - "id": "Joseph717171/Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32", - "developer": "Joseph717171", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6185 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Joseph717171/Llama-3.1-SuperNova-8B-Lite_TIES_with_Base/086831f9-c677-428b-a997-4da58733633c.json b/data/hfopenllm_v2/Joseph717171/Llama-3.1-SuperNova-8B-Lite_TIES_with_Base/086831f9-c677-428b-a997-4da58733633c.json deleted file mode 100644 index eb6e8c12a..000000000 --- a/data/hfopenllm_v2/Joseph717171/Llama-3.1-SuperNova-8B-Lite_TIES_with_Base/086831f9-c677-428b-a997-4da58733633c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Joseph717171_Llama-3.1-SuperNova-8B-Lite_TIES_with_Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-SuperNova-8B-Lite_TIES_with_Base", - "id": "Joseph717171/Llama-3.1-SuperNova-8B-Lite_TIES_with_Base", - "developer": "Joseph717171", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8096 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5147 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1835 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Josephgflowers/Cinder-Phi-2-V1-F16-gguf/d71893b8-b82c-490b-a700-b579d64e0610.json b/data/hfopenllm_v2/Josephgflowers/Cinder-Phi-2-V1-F16-gguf/d71893b8-b82c-490b-a700-b579d64e0610.json deleted file mode 100644 index 536e2fe85..000000000 --- a/data/hfopenllm_v2/Josephgflowers/Cinder-Phi-2-V1-F16-gguf/d71893b8-b82c-490b-a700-b579d64e0610.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Josephgflowers_Cinder-Phi-2-V1-F16-gguf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cinder-Phi-2-V1-F16-gguf", - "id": "Josephgflowers/Cinder-Phi-2-V1-F16-gguf", - "developer": "Josephgflowers", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2357 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2161 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Josephgflowers/Differential-Attention-Liquid-Metal-Tinyllama/9893689f-c27d-4148-a27f-cd07b07e98b7.json b/data/hfopenllm_v2/Josephgflowers/Differential-Attention-Liquid-Metal-Tinyllama/9893689f-c27d-4148-a27f-cd07b07e98b7.json deleted file mode 100644 index dab2ee4a6..000000000 --- a/data/hfopenllm_v2/Josephgflowers/Differential-Attention-Liquid-Metal-Tinyllama/9893689f-c27d-4148-a27f-cd07b07e98b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Josephgflowers_Differential-Attention-Liquid-Metal-Tinyllama/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Differential-Attention-Liquid-Metal-Tinyllama", - "id": "Josephgflowers/Differential-Attention-Liquid-Metal-Tinyllama", - "developer": "Josephgflowers", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2227 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2926 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.1214 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Josephgflowers/TinyLlama-Cinder-Agent-v1/90f2df23-a9ec-44be-ade5-89b59cb7368a.json b/data/hfopenllm_v2/Josephgflowers/TinyLlama-Cinder-Agent-v1/90f2df23-a9ec-44be-ade5-89b59cb7368a.json deleted file mode 100644 index 7c3cb09c4..000000000 --- a/data/hfopenllm_v2/Josephgflowers/TinyLlama-Cinder-Agent-v1/90f2df23-a9ec-44be-ade5-89b59cb7368a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Josephgflowers_TinyLlama-Cinder-Agent-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama-Cinder-Agent-v1", - "id": "Josephgflowers/TinyLlama-Cinder-Agent-v1", - "developer": "Josephgflowers", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.267 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3116 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1161 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Josephgflowers/TinyLlama-v1.1-Cinders-World/afd545da-390a-478a-b0f5-ea819f088f27.json b/data/hfopenllm_v2/Josephgflowers/TinyLlama-v1.1-Cinders-World/afd545da-390a-478a-b0f5-ea819f088f27.json deleted file mode 100644 index 3ac65a5c2..000000000 --- a/data/hfopenllm_v2/Josephgflowers/TinyLlama-v1.1-Cinders-World/afd545da-390a-478a-b0f5-ea819f088f27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Josephgflowers_TinyLlama-v1.1-Cinders-World/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama-v1.1-Cinders-World", - "id": "Josephgflowers/TinyLlama-v1.1-Cinders-World", - "developer": "Josephgflowers", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2469 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2998 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1198 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Josephgflowers/TinyLlama_v1.1_math_code-world-test-1/ce776f68-856f-4aee-b7e4-e55d15e8d714.json 
b/data/hfopenllm_v2/Josephgflowers/TinyLlama_v1.1_math_code-world-test-1/ce776f68-856f-4aee-b7e4-e55d15e8d714.json deleted file mode 100644 index 489ca17d9..000000000 --- a/data/hfopenllm_v2/Josephgflowers/TinyLlama_v1.1_math_code-world-test-1/ce776f68-856f-4aee-b7e4-e55d15e8d714.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Josephgflowers_TinyLlama_v1.1_math_code-world-test-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama_v1.1_math_code-world-test-1", - "id": "Josephgflowers/TinyLlama_v1.1_math_code-world-test-1", - "developer": "Josephgflowers", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0078 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2341 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3499 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Josephgflowers/Tinyllama-STEM-Cinder-Agent-v1/9b015729-524c-44f3-9c2c-c42981d7a61e.json b/data/hfopenllm_v2/Josephgflowers/Tinyllama-STEM-Cinder-Agent-v1/9b015729-524c-44f3-9c2c-c42981d7a61e.json deleted 
file mode 100644 index 586b23dba..000000000 --- a/data/hfopenllm_v2/Josephgflowers/Tinyllama-STEM-Cinder-Agent-v1/9b015729-524c-44f3-9c2c-c42981d7a61e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Josephgflowers_Tinyllama-STEM-Cinder-Agent-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tinyllama-STEM-Cinder-Agent-v1", - "id": "Josephgflowers/Tinyllama-STEM-Cinder-Agent-v1", - "developer": "Josephgflowers", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2126 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1086 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Josephgflowers/Tinyllama-r1/56a54ffc-4692-496c-95df-8e4ad19d4d95.json b/data/hfopenllm_v2/Josephgflowers/Tinyllama-r1/56a54ffc-4692-496c-95df-8e4ad19d4d95.json deleted file mode 100644 index 5e761ffc2..000000000 --- a/data/hfopenllm_v2/Josephgflowers/Tinyllama-r1/56a54ffc-4692-496c-95df-8e4ad19d4d95.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Josephgflowers_Tinyllama-r1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tinyllama-r1", - "id": "Josephgflowers/Tinyllama-r1", - "developer": "Josephgflowers", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3015 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JungZoona/T3Q-Qwen2.5-14B-Instruct-1M-e3/4b105969-2ce5-4c62-89ef-efd392c2ca89.json b/data/hfopenllm_v2/JungZoona/T3Q-Qwen2.5-14B-Instruct-1M-e3/4b105969-2ce5-4c62-89ef-efd392c2ca89.json deleted file mode 100644 index 6864f9d79..000000000 --- a/data/hfopenllm_v2/JungZoona/T3Q-Qwen2.5-14B-Instruct-1M-e3/4b105969-2ce5-4c62-89ef-efd392c2ca89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JungZoona_T3Q-Qwen2.5-14B-Instruct-1M-e3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM 
v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T3Q-Qwen2.5-14B-Instruct-1M-e3", - "id": "JungZoona/T3Q-Qwen2.5-14B-Instruct-1M-e3", - "developer": "JungZoona", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Unknown", - "params_billions": 0.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7324 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7586 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2863 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5884 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JungZoona/T3Q-qwen2.5-14b-v1.0-e3/31af79b1-48c1-4399-9d16-8582c92996ee.json b/data/hfopenllm_v2/JungZoona/T3Q-qwen2.5-14b-v1.0-e3/31af79b1-48c1-4399-9d16-8582c92996ee.json deleted file mode 100644 index ebb5f9b2d..000000000 --- a/data/hfopenllm_v2/JungZoona/T3Q-qwen2.5-14b-v1.0-e3/31af79b1-48c1-4399-9d16-8582c92996ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JungZoona_T3Q-qwen2.5-14b-v1.0-e3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T3Q-qwen2.5-14b-v1.0-e3", - "id": 
"JungZoona/T3Q-qwen2.5-14b-v1.0-e3", - "developer": "JungZoona", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7324 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7586 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2863 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5884 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Junhoee/Qwen-Megumin/59a67f29-cb7d-497c-b7bb-1764a665ae33.json b/data/hfopenllm_v2/Junhoee/Qwen-Megumin/59a67f29-cb7d-497c-b7bb-1764a665ae33.json deleted file mode 100644 index 8f5c2b802..000000000 --- a/data/hfopenllm_v2/Junhoee/Qwen-Megumin/59a67f29-cb7d-497c-b7bb-1764a665ae33.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Junhoee_Qwen-Megumin/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-Megumin", - "id": "Junhoee/Qwen-Megumin", - "developer": "Junhoee", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 15.231 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7141 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5285 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-1415/fe57367c-74b7-483e-af54-4f404cbea75b.json b/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-1415/fe57367c-74b7-483e-af54-4f404cbea75b.json deleted file mode 100644 index d5244c3c9..000000000 --- a/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-1415/fe57367c-74b7-483e-af54-4f404cbea75b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KSU-HW-SEC_Llama3-70b-SVA-FT-1415/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-70b-SVA-FT-1415", - "id": "KSU-HW-SEC/Llama3-70b-SVA-FT-1415", - "developer": "KSU-HW-SEC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.665 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-500/fda2277b-1513-416e-b586-ed05920a0bb4.json b/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-500/fda2277b-1513-416e-b586-ed05920a0bb4.json deleted file mode 100644 index 12fd56cb7..000000000 --- a/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-500/fda2277b-1513-416e-b586-ed05920a0bb4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KSU-HW-SEC_Llama3-70b-SVA-FT-500/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-70b-SVA-FT-500", - "id": "KSU-HW-SEC/Llama3-70b-SVA-FT-500", - "developer": "KSU-HW-SEC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6105 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6692 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2137 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4511 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5227 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-final/b3dde216-f80a-4664-aadc-b5f5dd3e5895.json b/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-final/b3dde216-f80a-4664-aadc-b5f5dd3e5895.json deleted file mode 100644 index 54dad4185..000000000 --- a/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-final/b3dde216-f80a-4664-aadc-b5f5dd3e5895.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KSU-HW-SEC_Llama3-70b-SVA-FT-final/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-70b-SVA-FT-final", - "id": "KSU-HW-SEC/Llama3-70b-SVA-FT-final", - "developer": "KSU-HW-SEC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6165 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.665 - } - 
}, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KSU-HW-SEC/Llama3.1-70b-SVA-FT-1000step/07ed6241-fd1a-46eb-91fd-92a4a8f6bd15.json b/data/hfopenllm_v2/KSU-HW-SEC/Llama3.1-70b-SVA-FT-1000step/07ed6241-fd1a-46eb-91fd-92a4a8f6bd15.json deleted file mode 100644 index 39cc4b352..000000000 --- a/data/hfopenllm_v2/KSU-HW-SEC/Llama3.1-70b-SVA-FT-1000step/07ed6241-fd1a-46eb-91fd-92a4a8f6bd15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KSU-HW-SEC_Llama3.1-70b-SVA-FT-1000step/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-70b-SVA-FT-1000step", - "id": "KSU-HW-SEC/Llama3.1-70b-SVA-FT-1000step", - "developer": "KSU-HW-SEC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7238 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6903 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4592 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5252 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Khetterman/DarkAtom-12B-v3/ba76c356-cd6a-4636-8ab1-18bb9df69881.json b/data/hfopenllm_v2/Khetterman/DarkAtom-12B-v3/ba76c356-cd6a-4636-8ab1-18bb9df69881.json deleted file mode 100644 index 45d9e2c04..000000000 --- a/data/hfopenllm_v2/Khetterman/DarkAtom-12B-v3/ba76c356-cd6a-4636-8ab1-18bb9df69881.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Khetterman_DarkAtom-12B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DarkAtom-12B-v3", - "id": "Khetterman/DarkAtom-12B-v3", - "developer": "Khetterman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6173 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4468 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3546 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Khetterman/Kosmos-8B-v1/c6ae54a1-2821-48d1-b689-bbb85aaa70a6.json b/data/hfopenllm_v2/Khetterman/Kosmos-8B-v1/c6ae54a1-2821-48d1-b689-bbb85aaa70a6.json deleted file mode 100644 index 9f41fea54..000000000 --- a/data/hfopenllm_v2/Khetterman/Kosmos-8B-v1/c6ae54a1-2821-48d1-b689-bbb85aaa70a6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Khetterman_Kosmos-8B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-8B-v1", - "id": "Khetterman/Kosmos-8B-v1", - "developer": "Khetterman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5234 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3919 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kimargin/GPT-NEO-1.3B-wiki/6f296f0e-80ca-49b7-94e7-cb45b795c715.json b/data/hfopenllm_v2/Kimargin/GPT-NEO-1.3B-wiki/6f296f0e-80ca-49b7-94e7-cb45b795c715.json deleted file mode 100644 index 2495d87be..000000000 --- a/data/hfopenllm_v2/Kimargin/GPT-NEO-1.3B-wiki/6f296f0e-80ca-49b7-94e7-cb45b795c715.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kimargin_GPT-NEO-1.3B-wiki/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-NEO-1.3B-wiki", - "id": "Kimargin/GPT-NEO-1.3B-wiki", - "developer": "Kimargin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoForCausalLM", - "params_billions": 1.316 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1921 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1099 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KingNish/Qwen2.5-0.5b-Test-ft/b5509e11-820a-4ad4-8c6a-0294762502a8.json b/data/hfopenllm_v2/KingNish/Qwen2.5-0.5b-Test-ft/b5509e11-820a-4ad4-8c6a-0294762502a8.json deleted file mode 100644 index c989aeb0c..000000000 --- a/data/hfopenllm_v2/KingNish/Qwen2.5-0.5b-Test-ft/b5509e11-820a-4ad4-8c6a-0294762502a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KingNish_Qwen2.5-0.5b-Test-ft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5b-Test-ft", - "id": "KingNish/Qwen2.5-0.5b-Test-ft", - "developer": "KingNish", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2671 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3232 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1689 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KingNish/Reasoning-0.5b/90d73665-8d83-4e74-ab7d-29b1d3b6181b.json b/data/hfopenllm_v2/KingNish/Reasoning-0.5b/90d73665-8d83-4e74-ab7d-29b1d3b6181b.json deleted file mode 100644 index 57493bf65..000000000 --- a/data/hfopenllm_v2/KingNish/Reasoning-0.5b/90d73665-8d83-4e74-ab7d-29b1d3b6181b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KingNish_Reasoning-0.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-0.5b", - "id": "KingNish/Reasoning-0.5b", - "developer": "KingNish", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3513 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1641 - 
} - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KingNish/Reasoning-Llama-3b-v0.1/72387647-cbac-4b72-9c22-db7029a39457.json b/data/hfopenllm_v2/KingNish/Reasoning-Llama-3b-v0.1/72387647-cbac-4b72-9c22-db7029a39457.json deleted file mode 100644 index e30cf7c55..000000000 --- a/data/hfopenllm_v2/KingNish/Reasoning-Llama-3b-v0.1/72387647-cbac-4b72-9c22-db7029a39457.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KingNish_Reasoning-Llama-3b-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-Llama-3b-v0.1", - "id": "KingNish/Reasoning-Llama-3b-v0.1", - "developer": "KingNish", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6225 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3168 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.1/6219ec01-4b6a-4acd-aee1-96c3e8e48643.json 
b/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.1/6219ec01-4b6a-4acd-aee1-96c3e8e48643.json deleted file mode 100644 index 143abb8e4..000000000 --- a/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.1/6219ec01-4b6a-4acd-aee1-96c3e8e48643.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KingNish_qwen-1b-continued-v2.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-1b-continued-v2.1", - "id": "KingNish/qwen-1b-continued-v2.1", - "developer": "KingNish", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.277 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3042 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1278 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.2/5c323d7c-25cd-4718-8a1f-54d986cadaf2.json b/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.2/5c323d7c-25cd-4718-8a1f-54d986cadaf2.json deleted file mode 100644 index 04f12a579..000000000 --- 
a/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.2/5c323d7c-25cd-4718-8a1f-54d986cadaf2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KingNish_qwen-1b-continued-v2.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-1b-continued-v2.2", - "id": "KingNish/qwen-1b-continued-v2.2", - "developer": "KingNish", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.277 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1413 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3059 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3513 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1262 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2/adfab21a-941b-4efc-8b63-fdfb3074ba9b.json b/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2/adfab21a-941b-4efc-8b63-fdfb3074ba9b.json deleted file mode 100644 index 7b2c2ff2f..000000000 --- a/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2/adfab21a-941b-4efc-8b63-fdfb3074ba9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KingNish_qwen-1b-continued-v2/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-1b-continued-v2", - "id": "KingNish/qwen-1b-continued-v2", - "developer": "KingNish", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.277 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1579 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3119 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KingNish/qwen-1b-continued/350d00a4-7501-4130-a069-323530bc9729.json b/data/hfopenllm_v2/KingNish/qwen-1b-continued/350d00a4-7501-4130-a069-323530bc9729.json deleted file mode 100644 index 7d5687eaf..000000000 --- a/data/hfopenllm_v2/KingNish/qwen-1b-continued/350d00a4-7501-4130-a069-323530bc9729.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KingNish_qwen-1b-continued/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"qwen-1b-continued", - "id": "KingNish/qwen-1b-continued", - "developer": "KingNish", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.277 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1255 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2991 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kquant03/CognitiveFusion2-4x7B-BF16/ea809d28-178e-4a0b-ab5a-34739077c5ff.json b/data/hfopenllm_v2/Kquant03/CognitiveFusion2-4x7B-BF16/ea809d28-178e-4a0b-ab5a-34739077c5ff.json deleted file mode 100644 index 49d973710..000000000 --- a/data/hfopenllm_v2/Kquant03/CognitiveFusion2-4x7B-BF16/ea809d28-178e-4a0b-ab5a-34739077c5ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kquant03_CognitiveFusion2-4x7B-BF16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CognitiveFusion2-4x7B-BF16", - "id": "Kquant03/CognitiveFusion2-4x7B-BF16", - "developer": "Kquant03", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3567 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4108 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2793 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kquant03/L3-Pneuma-8B/243d5ccd-58f3-4da5-8718-553f3f456490.json b/data/hfopenllm_v2/Kquant03/L3-Pneuma-8B/243d5ccd-58f3-4da5-8718-553f3f456490.json deleted file mode 100644 index 408f75c77..000000000 --- a/data/hfopenllm_v2/Kquant03/L3-Pneuma-8B/243d5ccd-58f3-4da5-8718-553f3f456490.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kquant03_L3-Pneuma-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Pneuma-8B", - "id": "Kquant03/L3-Pneuma-8B", - "developer": "Kquant03", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2374 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Krystalan/DRT-o1-14B/a45537a7-76a6-4855-b83b-abe965f13460.json b/data/hfopenllm_v2/Krystalan/DRT-o1-14B/a45537a7-76a6-4855-b83b-abe965f13460.json deleted file mode 100644 index 970337ee9..000000000 --- a/data/hfopenllm_v2/Krystalan/DRT-o1-14B/a45537a7-76a6-4855-b83b-abe965f13460.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Krystalan_DRT-o1-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DRT-o1-14B", - "id": "Krystalan/DRT-o1-14B", - "developer": "Krystalan", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6379 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4826 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Krystalan/DRT-o1-7B/9be911b6-b9f4-47b1-849d-62eb20c9e944.json b/data/hfopenllm_v2/Krystalan/DRT-o1-7B/9be911b6-b9f4-47b1-849d-62eb20c9e944.json deleted file mode 100644 index e93c9a10d..000000000 --- a/data/hfopenllm_v2/Krystalan/DRT-o1-7B/9be911b6-b9f4-47b1-849d-62eb20c9e944.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Krystalan_DRT-o1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DRT-o1-7B", - "id": "Krystalan/DRT-o1-7B", - "developer": "Krystalan", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5468 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 
5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5087 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4151 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kukedlc/NeuralExperiment-7b-MagicCoder-v7.5/33d7d5f0-cbee-4a26-b5e8-48bdd12492cf.json b/data/hfopenllm_v2/Kukedlc/NeuralExperiment-7b-MagicCoder-v7.5/33d7d5f0-cbee-4a26-b5e8-48bdd12492cf.json deleted file mode 100644 index 0cf70abae..000000000 --- a/data/hfopenllm_v2/Kukedlc/NeuralExperiment-7b-MagicCoder-v7.5/33d7d5f0-cbee-4a26-b5e8-48bdd12492cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kukedlc_NeuralExperiment-7b-MagicCoder-v7.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralExperiment-7b-MagicCoder-v7.5", - "id": "Kukedlc/NeuralExperiment-7b-MagicCoder-v7.5", - "developer": "Kukedlc", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4553 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH 
Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4282 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2824 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-DT-v0.1/4355fbdd-ac72-4f26-8e07-b7e8d774d238.json b/data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-DT-v0.1/4355fbdd-ac72-4f26-8e07-b7e8d774d238.json deleted file mode 100644 index cdfa1a3cb..000000000 --- a/data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-DT-v0.1/4355fbdd-ac72-4f26-8e07-b7e8d774d238.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kukedlc_NeuralLLaMa-3-8b-DT-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralLLaMa-3-8b-DT-v0.1", - "id": "Kukedlc/NeuralLLaMa-3-8b-DT-v0.1", - "developer": "Kukedlc", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4987 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-ORPO-v0.3/4bffc633-e20c-4874-b7db-d1b7dabb8070.json b/data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-ORPO-v0.3/4bffc633-e20c-4874-b7db-d1b7dabb8070.json deleted file mode 100644 index 76e148b2f..000000000 --- a/data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-ORPO-v0.3/4bffc633-e20c-4874-b7db-d1b7dabb8070.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kukedlc_NeuralLLaMa-3-8b-ORPO-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralLLaMa-3-8b-ORPO-v0.3", - "id": "Kukedlc/NeuralLLaMa-3-8b-ORPO-v0.3", - "developer": "Kukedlc", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5276 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4557 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.1/2d5c844d-d950-4254-bac2-0a986659c541.json b/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.1/2d5c844d-d950-4254-bac2-0a986659c541.json deleted file mode 100644 index 79df21f7a..000000000 --- a/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.1/2d5c844d-d950-4254-bac2-0a986659c541.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kukedlc_NeuralSynthesis-7B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralSynthesis-7B-v0.1", - "id": "Kukedlc/NeuralSynthesis-7B-v0.1", - "developer": "Kukedlc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5145 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4333 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3049 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.3/f6e74b3c-9ee4-40c3-bf92-35d965503a04.json b/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.3/f6e74b3c-9ee4-40c3-bf92-35d965503a04.json deleted file mode 100644 index 869b589d4..000000000 --- a/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.3/f6e74b3c-9ee4-40c3-bf92-35d965503a04.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kukedlc_NeuralSynthesis-7B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralSynthesis-7B-v0.3", - "id": "Kukedlc/NeuralSynthesis-7B-v0.3", - "developer": "Kukedlc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4078 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5138 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.305 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7b-v0.4-slerp/8f1d2600-7347-48b8-9759-11570598459d.json b/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7b-v0.4-slerp/8f1d2600-7347-48b8-9759-11570598459d.json deleted file mode 100644 index 950dab5d0..000000000 --- a/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7b-v0.4-slerp/8f1d2600-7347-48b8-9759-11570598459d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kukedlc_NeuralSynthesis-7b-v0.4-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralSynthesis-7b-v0.4-slerp", - "id": "Kukedlc/NeuralSynthesis-7b-v0.4-slerp", - "developer": "Kukedlc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3043 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kukedlc/Qwen-2.5-7b-Spanish-o1-CoT/cd653bfd-2c06-4224-aeeb-bf591995a69e.json b/data/hfopenllm_v2/Kukedlc/Qwen-2.5-7b-Spanish-o1-CoT/cd653bfd-2c06-4224-aeeb-bf591995a69e.json deleted file mode 100644 index 21324748e..000000000 --- a/data/hfopenllm_v2/Kukedlc/Qwen-2.5-7b-Spanish-o1-CoT/cd653bfd-2c06-4224-aeeb-bf591995a69e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kukedlc_Qwen-2.5-7b-Spanish-o1-CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7b-Spanish-o1-CoT", - "id": "Kukedlc/Qwen-2.5-7b-Spanish-o1-CoT", - "developer": "Kukedlc", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5602 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4777 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Kumar955/Hemanth-llm/cdf1fcc7-429d-44bd-b76c-d26ee743f6fe.json b/data/hfopenllm_v2/Kumar955/Hemanth-llm/cdf1fcc7-429d-44bd-b76c-d26ee743f6fe.json deleted file mode 100644 index c93b19301..000000000 --- a/data/hfopenllm_v2/Kumar955/Hemanth-llm/cdf1fcc7-429d-44bd-b76c-d26ee743f6fe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kumar955_Hemanth-llm/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hemanth-llm", - "id": "Kumar955/Hemanth-llm", - "developer": "Kumar955", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5225 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/L-RAGE/3_PRYMMAL-ECE-7B-SLERP-V1/4828bd36-5453-4383-8985-08d04a7ebecd.json b/data/hfopenllm_v2/L-RAGE/3_PRYMMAL-ECE-7B-SLERP-V1/4828bd36-5453-4383-8985-08d04a7ebecd.json deleted file mode 100644 index 22101047a..000000000 --- 
a/data/hfopenllm_v2/L-RAGE/3_PRYMMAL-ECE-7B-SLERP-V1/4828bd36-5453-4383-8985-08d04a7ebecd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/L-RAGE_3_PRYMMAL-ECE-7B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "3_PRYMMAL-ECE-7B-SLERP-V1", - "id": "L-RAGE/3_PRYMMAL-ECE-7B-SLERP-V1", - "developer": "L-RAGE", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2742 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3841 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2925 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki100p/4c2baa59-c2f1-4779-9d21-1f69c0821968.json b/data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki100p/4c2baa59-c2f1-4779-9d21-1f69c0821968.json deleted file mode 100644 index 6b1aef1c0..000000000 --- a/data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki100p/4c2baa59-c2f1-4779-9d21-1f69c0821968.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/LEESM_llama-2-7b-hf-lora-oki100p/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-2-7b-hf-lora-oki100p", - "id": "LEESM/llama-2-7b-hf-lora-oki100p", - "developer": "LEESM", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2513 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1856 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki10p/555c1079-c4d0-4b9e-9d2d-769e7ba32429.json b/data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki10p/555c1079-c4d0-4b9e-9d2d-769e7ba32429.json deleted file mode 100644 index a80706c7c..000000000 --- a/data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki10p/555c1079-c4d0-4b9e-9d2d-769e7ba32429.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LEESM_llama-2-7b-hf-lora-oki10p/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-2-7b-hf-lora-oki10p", - "id": "LEESM/llama-2-7b-hf-lora-oki10p", - "developer": "LEESM", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.227 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LEESM/llama-3-8b-bnb-4b-kowiki231101/58a4a1c6-0ee4-4524-9ca1-b40870f1d600.json b/data/hfopenllm_v2/LEESM/llama-3-8b-bnb-4b-kowiki231101/58a4a1c6-0ee4-4524-9ca1-b40870f1d600.json deleted file mode 100644 index 87aab7f15..000000000 --- a/data/hfopenllm_v2/LEESM/llama-3-8b-bnb-4b-kowiki231101/58a4a1c6-0ee4-4524-9ca1-b40870f1d600.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LEESM_llama-3-8b-bnb-4b-kowiki231101/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-bnb-4b-kowiki231101", - "id": "LEESM/llama-3-8b-bnb-4b-kowiki231101", - "developer": "LEESM", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2425 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LEESM/llama-3-Korean-Bllossom-8B-trexlab-oki10p/eea2a38a-4f1b-48d0-894c-09974894f264.json b/data/hfopenllm_v2/LEESM/llama-3-Korean-Bllossom-8B-trexlab-oki10p/eea2a38a-4f1b-48d0-894c-09974894f264.json deleted file mode 100644 index 65d39674d..000000000 --- a/data/hfopenllm_v2/LEESM/llama-3-Korean-Bllossom-8B-trexlab-oki10p/eea2a38a-4f1b-48d0-894c-09974894f264.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LEESM_llama-3-Korean-Bllossom-8B-trexlab-oki10p/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Korean-Bllossom-8B-trexlab-oki10p", - "id": "LEESM/llama-3-Korean-Bllossom-8B-trexlab-oki10p", - "developer": "LEESM", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2137 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3177 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/3d8063ab-0ad5-43e4-83ff-90b46dee766f.json b/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/3d8063ab-0ad5-43e4-83ff-90b46dee766f.json deleted file mode 100644 index 53d2d42f2..000000000 --- a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/3d8063ab-0ad5-43e4-83ff-90b46dee766f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LGAI-EXAONE_EXAONE-3.0-7.8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EXAONE-3.0-7.8B-Instruct", - "id": "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", - "developer": "LGAI-EXAONE", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "ExaoneForCausalLM", - "params_billions": 7.8 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7193 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3044 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3577 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct/da5e0284-7c44-42d4-a110-a23880de277f.json b/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct/da5e0284-7c44-42d4-a110-a23880de277f.json deleted file mode 100644 index ae4eb863d..000000000 --- a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct/da5e0284-7c44-42d4-a110-a23880de277f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LGAI-EXAONE_EXAONE-3.5-2.4B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EXAONE-3.5-2.4B-Instruct", - "id": "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct", - "developer": "LGAI-EXAONE", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "ExaoneForCausalLM", - "params_billions": 2.405 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3678 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-32B-Instruct/bef017bb-47b1-48e4-93c4-3b222a16af7a.json b/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-32B-Instruct/bef017bb-47b1-48e4-93c4-3b222a16af7a.json deleted file mode 100644 index 5bbd9c37d..000000000 --- a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-32B-Instruct/bef017bb-47b1-48e4-93c4-3b222a16af7a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LGAI-EXAONE_EXAONE-3.5-32B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EXAONE-3.5-32B-Instruct", - "id": "LGAI-EXAONE/EXAONE-3.5-32B-Instruct", - "developer": "LGAI-EXAONE", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "ExaoneForCausalLM", - "params_billions": 32.003 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8392 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5761 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct/401c83b0-b7d2-4987-9e46-f127fdbb595f.json b/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct/401c83b0-b7d2-4987-9e46-f127fdbb595f.json deleted file mode 100644 index 48cb86a10..000000000 --- a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct/401c83b0-b7d2-4987-9e46-f127fdbb595f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LGAI-EXAONE_EXAONE-3.5-7.8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EXAONE-3.5-7.8B-Instruct", - "id": "LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct", - "developer": "LGAI-EXAONE", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "ExaoneForCausalLM", - "params_billions": 7.818 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "MATH Level 
5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4751 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4133 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LLM360/K2-Chat/c6fde59b-73ed-4179-a907-076be068b262.json b/data/hfopenllm_v2/LLM360/K2-Chat/c6fde59b-73ed-4179-a907-076be068b262.json deleted file mode 100644 index 6ec8b2598..000000000 --- a/data/hfopenllm_v2/LLM360/K2-Chat/c6fde59b-73ed-4179-a907-076be068b262.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LLM360_K2-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "K2-Chat", - "id": "LLM360/K2-Chat", - "developer": "LLM360", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 65.286 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5152 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5358 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.457 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LLM360/K2/90997fea-6c67-493e-bd8e-5327cfb33ea4.json b/data/hfopenllm_v2/LLM360/K2/90997fea-6c67-493e-bd8e-5327cfb33ea4.json deleted file mode 100644 index c8a7cd176..000000000 --- a/data/hfopenllm_v2/LLM360/K2/90997fea-6c67-493e-bd8e-5327cfb33ea4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LLM360_K2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "K2", - "id": "LLM360/K2", - "developer": "LLM360", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 65.286 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2252 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4972 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LLM4Binary/llm4decompile-1.3b-v2/08957d63-7462-44ff-9dd8-060a5801a31b.json b/data/hfopenllm_v2/LLM4Binary/llm4decompile-1.3b-v2/08957d63-7462-44ff-9dd8-060a5801a31b.json deleted file mode 100644 index 85cc8161c..000000000 --- a/data/hfopenllm_v2/LLM4Binary/llm4decompile-1.3b-v2/08957d63-7462-44ff-9dd8-060a5801a31b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LLM4Binary_llm4decompile-1.3b-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llm4decompile-1.3b-v2", - "id": "LLM4Binary/llm4decompile-1.3b-v2", - "developer": "LLM4Binary", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.346 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2268 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2357 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4072 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1209 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lambent/qwen2.5-reinstruct-alternate-lumen-14B/a434f569-e7d6-4464-afa8-6104be43fa06.json b/data/hfopenllm_v2/Lambent/qwen2.5-reinstruct-alternate-lumen-14B/a434f569-e7d6-4464-afa8-6104be43fa06.json deleted file mode 100644 index 105590f52..000000000 --- a/data/hfopenllm_v2/Lambent/qwen2.5-reinstruct-alternate-lumen-14B/a434f569-e7d6-4464-afa8-6104be43fa06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lambent_qwen2.5-reinstruct-alternate-lumen-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-reinstruct-alternate-lumen-14B", - "id": "Lambent/qwen2.5-reinstruct-alternate-lumen-14B", - "developer": "Lambent", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4794 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6459 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.477 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Langboat/Mengzi3-8B-Chat/e32ed251-e817-409f-b4c3-8f168f1ff822.json b/data/hfopenllm_v2/Langboat/Mengzi3-8B-Chat/e32ed251-e817-409f-b4c3-8f168f1ff822.json deleted file mode 100644 index 1c7e4d12f..000000000 --- a/data/hfopenllm_v2/Langboat/Mengzi3-8B-Chat/e32ed251-e817-409f-b4c3-8f168f1ff822.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Langboat_Mengzi3-8B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mengzi3-8B-Chat", - "id": "Langboat/Mengzi3-8B-Chat", - "developer": "Langboat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4684 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4078 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBA100/1d9a65a3-d2bb-48a7-8a00-8e4a79c36db2.json b/data/hfopenllm_v2/Lawnakk/BBA100/1d9a65a3-d2bb-48a7-8a00-8e4a79c36db2.json deleted file mode 100644 index 9cbc0affe..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBA100/1d9a65a3-d2bb-48a7-8a00-8e4a79c36db2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBA100/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBA100", - "id": "Lawnakk/BBA100", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2076 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2826 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.0/608398da-ae2a-4be2-aaf9-6ec8899aa63d.json 
b/data/hfopenllm_v2/Lawnakk/BBALAW1.0/608398da-ae2a-4be2-aaf9-6ec8899aa63d.json deleted file mode 100644 index 11de02d19..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.0/608398da-ae2a-4be2-aaf9-6ec8899aa63d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1.0", - "id": "Lawnakk/BBALAW1.0", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 4.353 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1351 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2828 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3526 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.2/80e04641-be7d-4351-a4f6-1318981ef834.json b/data/hfopenllm_v2/Lawnakk/BBALAW1.2/80e04641-be7d-4351-a4f6-1318981ef834.json deleted file mode 100644 index 0b4de2bab..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.2/80e04641-be7d-4351-a4f6-1318981ef834.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Lawnakk_BBALAW1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1.2", - "id": "Lawnakk/BBALAW1.2", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 4.353 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2811 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.3/e74222c6-636c-4075-8d4d-30c73fa70fda.json b/data/hfopenllm_v2/Lawnakk/BBALAW1.3/e74222c6-636c-4075-8d4d-30c73fa70fda.json deleted file mode 100644 index a9204af31..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.3/e74222c6-636c-4075-8d4d-30c73fa70fda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"BBALAW1.3", - "id": "Lawnakk/BBALAW1.3", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 4.353 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.6/aed80361-9304-44a0-934a-52976d7f1bf3.json b/data/hfopenllm_v2/Lawnakk/BBALAW1.6/aed80361-9304-44a0-934a-52976d7f1bf3.json deleted file mode 100644 index 02391ea7b..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.6/aed80361-9304-44a0-934a-52976d7f1bf3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1.6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1.6", - "id": "Lawnakk/BBALAW1.6", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5554 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4507 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.61/709bd280-b03e-4908-808f-34566bc968f4.json b/data/hfopenllm_v2/Lawnakk/BBALAW1.61/709bd280-b03e-4908-808f-34566bc968f4.json deleted file mode 100644 index d518940c4..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.61/709bd280-b03e-4908-808f-34566bc968f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1.61/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1.61", - "id": "Lawnakk/BBALAW1.61", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5771 - } - }, - 
{ - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5549 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4471 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.62/66c495b3-4b09-42ad-b742-4d753c3bde7a.json b/data/hfopenllm_v2/Lawnakk/BBALAW1.62/66c495b3-4b09-42ad-b742-4d753c3bde7a.json deleted file mode 100644 index 0ca80591d..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.62/66c495b3-4b09-42ad-b742-4d753c3bde7a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1.62/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1.62", - "id": "Lawnakk/BBALAW1.62", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5046 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5581 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2825 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.63/e24f7be6-3051-4990-8b93-121aec5402eb.json b/data/hfopenllm_v2/Lawnakk/BBALAW1.63/e24f7be6-3051-4990-8b93-121aec5402eb.json deleted file mode 100644 index cb01d3410..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.63/e24f7be6-3051-4990-8b93-121aec5402eb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1.63/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1.63", - "id": "Lawnakk/BBALAW1.63", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5541 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4471 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.64/0321571b-4246-4490-bd6c-7b106eb8e15a.json b/data/hfopenllm_v2/Lawnakk/BBALAW1.64/0321571b-4246-4490-bd6c-7b106eb8e15a.json deleted file mode 100644 index 227047148..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.64/0321571b-4246-4490-bd6c-7b106eb8e15a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1.64/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1.64", - "id": "Lawnakk/BBALAW1.64", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2779 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1/54dbf947-ab18-40dd-9cd7-a496289b2e72.json b/data/hfopenllm_v2/Lawnakk/BBALAW1/54dbf947-ab18-40dd-9cd7-a496289b2e72.json deleted file mode 100644 index f7c7740c8..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1/54dbf947-ab18-40dd-9cd7-a496289b2e72.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1", - "id": "Lawnakk/BBALAW1", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2872 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4153 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LenguajeNaturalAI/leniachat-gemma-2b-v0/d841e204-ed6a-439d-8408-d5cfb3b38dae.json b/data/hfopenllm_v2/LenguajeNaturalAI/leniachat-gemma-2b-v0/d841e204-ed6a-439d-8408-d5cfb3b38dae.json deleted file mode 100644 index 6394ebbd7..000000000 --- a/data/hfopenllm_v2/LenguajeNaturalAI/leniachat-gemma-2b-v0/d841e204-ed6a-439d-8408-d5cfb3b38dae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LenguajeNaturalAI_leniachat-gemma-2b-v0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "leniachat-gemma-2b-v0", - "id": "LenguajeNaturalAI/leniachat-gemma-2b-v0", - "developer": "LenguajeNaturalAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.215 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3074 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.3659 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LenguajeNaturalAI/leniachat-qwen2-1.5B-v0/96b57891-83e3-4948-ad48-64a2a370e166.json b/data/hfopenllm_v2/LenguajeNaturalAI/leniachat-qwen2-1.5B-v0/96b57891-83e3-4948-ad48-64a2a370e166.json deleted file mode 100644 index 1f7ec34a2..000000000 --- a/data/hfopenllm_v2/LenguajeNaturalAI/leniachat-qwen2-1.5B-v0/96b57891-83e3-4948-ad48-64a2a370e166.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LenguajeNaturalAI_leniachat-qwen2-1.5B-v0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "leniachat-qwen2-1.5B-v0", - "id": "LenguajeNaturalAI/leniachat-qwen2-1.5B-v0", - "developer": "LenguajeNaturalAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2221 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.188 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/CheckPoint_A/30301818-6dad-45f9-acfb-a68ccc7c0609.json b/data/hfopenllm_v2/LeroyDyer/CheckPoint_A/30301818-6dad-45f9-acfb-a68ccc7c0609.json deleted file mode 100644 index 009c8048b..000000000 --- a/data/hfopenllm_v2/LeroyDyer/CheckPoint_A/30301818-6dad-45f9-acfb-a68ccc7c0609.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_CheckPoint_A/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CheckPoint_A", - "id": "LeroyDyer/CheckPoint_A", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4513 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4748 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288 - } - } - ] -} \ No newline at end of 
file diff --git a/data/hfopenllm_v2/LeroyDyer/CheckPoint_B/50743107-30de-4c5d-bf83-cc003af8a5db.json b/data/hfopenllm_v2/LeroyDyer/CheckPoint_B/50743107-30de-4c5d-bf83-cc003af8a5db.json deleted file mode 100644 index c87445cfa..000000000 --- a/data/hfopenllm_v2/LeroyDyer/CheckPoint_B/50743107-30de-4c5d-bf83-cc003af8a5db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_CheckPoint_B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CheckPoint_B", - "id": "LeroyDyer/CheckPoint_B", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3898 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2907 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/CheckPoint_C/625ee1b3-e0a1-4a86-83a4-6e66b380f864.json b/data/hfopenllm_v2/LeroyDyer/CheckPoint_C/625ee1b3-e0a1-4a86-83a4-6e66b380f864.json deleted file mode 100644 index 27af86b1d..000000000 --- 
a/data/hfopenllm_v2/LeroyDyer/CheckPoint_C/625ee1b3-e0a1-4a86-83a4-6e66b380f864.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_CheckPoint_C/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CheckPoint_C", - "id": "LeroyDyer/CheckPoint_C", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4586 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3021 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/CheckPoint_R1/89fda762-1989-4850-837c-f79ef538c58c.json b/data/hfopenllm_v2/LeroyDyer/CheckPoint_R1/89fda762-1989-4850-837c-f79ef538c58c.json deleted file mode 100644 index 5131c24dd..000000000 --- a/data/hfopenllm_v2/LeroyDyer/CheckPoint_R1/89fda762-1989-4850-837c-f79ef538c58c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_CheckPoint_R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CheckPoint_R1", - "id": "LeroyDyer/CheckPoint_R1", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1728 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/LCARS_AI_001/1de1f906-0e36-4f79-b159-16ef8ee33ab3.json b/data/hfopenllm_v2/LeroyDyer/LCARS_AI_001/1de1f906-0e36-4f79-b159-16ef8ee33ab3.json deleted file mode 100644 index f13e5d9d1..000000000 --- a/data/hfopenllm_v2/LeroyDyer/LCARS_AI_001/1de1f906-0e36-4f79-b159-16ef8ee33ab3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_LCARS_AI_001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LCARS_AI_001", - "id": "LeroyDyer/LCARS_AI_001", - "developer": 
"LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3109 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4258 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4384 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.267 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/LCARS_AI_1x4_003_SuperAI/d8588222-9e4b-47c1-9f86-92f47c9c8e38.json b/data/hfopenllm_v2/LeroyDyer/LCARS_AI_1x4_003_SuperAI/d8588222-9e4b-47c1-9f86-92f47c9c8e38.json deleted file mode 100644 index a513ad18e..000000000 --- a/data/hfopenllm_v2/LeroyDyer/LCARS_AI_1x4_003_SuperAI/d8588222-9e4b-47c1-9f86-92f47c9c8e38.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_LCARS_AI_1x4_003_SuperAI/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LCARS_AI_1x4_003_SuperAI", - "id": "LeroyDyer/LCARS_AI_1x4_003_SuperAI", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4506 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2972 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/LCARS_AI_StarTrek_Computer/15e6e6e6-39fa-424f-ba12-5f209cd4b2cc.json b/data/hfopenllm_v2/LeroyDyer/LCARS_AI_StarTrek_Computer/15e6e6e6-39fa-424f-ba12-5f209cd4b2cc.json deleted file mode 100644 index c31ece219..000000000 --- a/data/hfopenllm_v2/LeroyDyer/LCARS_AI_StarTrek_Computer/15e6e6e6-39fa-424f-ba12-5f209cd4b2cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_LCARS_AI_StarTrek_Computer/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LCARS_AI_StarTrek_Computer", - "id": "LeroyDyer/LCARS_AI_StarTrek_Computer", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3583 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/LCARS_TOP_SCORE/81225b85-1523-49c1-b770-897112d2e6ae.json b/data/hfopenllm_v2/LeroyDyer/LCARS_TOP_SCORE/81225b85-1523-49c1-b770-897112d2e6ae.json deleted file mode 100644 index 6e727e47a..000000000 --- a/data/hfopenllm_v2/LeroyDyer/LCARS_TOP_SCORE/81225b85-1523-49c1-b770-897112d2e6ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_LCARS_TOP_SCORE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LCARS_TOP_SCORE", - "id": "LeroyDyer/LCARS_TOP_SCORE", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5127 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3031 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/Mixtral_AI_SwahiliTron_7b/254deaf7-a253-4d41-a10d-1143f86b288c.json b/data/hfopenllm_v2/LeroyDyer/Mixtral_AI_SwahiliTron_7b/254deaf7-a253-4d41-a10d-1143f86b288c.json deleted file mode 100644 index 3ffc415fc..000000000 --- a/data/hfopenllm_v2/LeroyDyer/Mixtral_AI_SwahiliTron_7b/254deaf7-a253-4d41-a10d-1143f86b288c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_Mixtral_AI_SwahiliTron_7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral_AI_SwahiliTron_7b", - "id": "LeroyDyer/Mixtral_AI_SwahiliTron_7b", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1534 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3055 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI/ba0b66f5-724a-4a6b-ac20-a36d530a8b4b.json b/data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI/ba0b66f5-724a-4a6b-ac20-a36d530a8b4b.json deleted file mode 100644 index 93ca501b0..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI/ba0b66f5-724a-4a6b-ac20-a36d530a8b4b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWebAI_Human_AGI/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWebAI_Human_AGI", - "id": "LeroyDyer/SpydazWebAI_Human_AGI", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3375 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1479 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI_001/eed0b3b4-e277-49ee-aed5-f3599b2d5653.json b/data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI_001/eed0b3b4-e277-49ee-aed5-f3599b2d5653.json deleted file mode 100644 index 03d018a11..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI_001/eed0b3b4-e277-49ee-aed5-f3599b2d5653.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWebAI_Human_AGI_001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWebAI_Human_AGI_001", - "id": "LeroyDyer/SpydazWebAI_Human_AGI_001", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3118 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3433 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1426 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_CyberTron_Ultra_7b/96a21b6e-ed47-40fb-85cd-15924330e60d.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_CyberTron_Ultra_7b/96a21b6e-ed47-40fb-85cd-15924330e60d.json deleted file mode 100644 index faca52fe7..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_CyberTron_Ultra_7b/96a21b6e-ed47-40fb-85cd-15924330e60d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_CyberTron_Ultra_7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_CyberTron_Ultra_7b", - "id": "LeroyDyer/SpydazWeb_AI_CyberTron_Ultra_7b", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4811 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2866 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_001_M2/f41f5471-6384-4510-85d2-41f236082583.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_001_M2/f41f5471-6384-4510-85d2-41f236082583.json deleted file mode 100644 index 0b361a56c..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_001_M2/f41f5471-6384-4510-85d2-41f236082583.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAGI_001_M2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAGI_001_M2", - "id": "LeroyDyer/SpydazWeb_AI_HumanAGI_001_M2", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4888 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4503 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3005 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_002/2728eccc-525f-4350-901b-dbc352c78014.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_002/2728eccc-525f-4350-901b-dbc352c78014.json deleted file mode 100644 index 0eb676741..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_002/2728eccc-525f-4350-901b-dbc352c78014.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAGI_002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAGI_002", - "id": "LeroyDyer/SpydazWeb_AI_HumanAGI_002", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4088 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4865 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3059 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_001/3e7ae935-46c3-427c-8713-41c659c1828a.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_001/3e7ae935-46c3-427c-8713-41c659c1828a.json deleted file mode 100644 index f6494cb5b..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_001/3e7ae935-46c3-427c-8713-41c659c1828a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_001", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_001", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2252 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3344 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1271 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_006/66782676-c942-4aff-b754-b96cd96cf1f9.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_006/66782676-c942-4aff-b754-b96cd96cf1f9.json deleted file mode 100644 index 9408883d4..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_006/66782676-c942-4aff-b754-b96cd96cf1f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_006/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_006", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_006", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.143 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3568 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1135 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_007/941a9e27-2ac4-4dab-a6d0-cb9319c79a27.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_007/941a9e27-2ac4-4dab-a6d0-cb9319c79a27.json deleted file mode 100644 index fd4a7fc65..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_007/941a9e27-2ac4-4dab-a6d0-cb9319c79a27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_007/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_007", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_007", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1352 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_009_CHAT/caf93f75-530e-4f4d-9cc0-2cf9b0a7f2ff.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_009_CHAT/caf93f75-530e-4f4d-9cc0-2cf9b0a7f2ff.json deleted file mode 100644 index 67f16d3b2..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_009_CHAT/caf93f75-530e-4f4d-9cc0-2cf9b0a7f2ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_009_CHAT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_009_CHAT", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_009_CHAT", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2973 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3307 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1433 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_010_CHAT/d3ca0458-ee97-4a4c-a6a9-066880ffefb5.json 
b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_010_CHAT/d3ca0458-ee97-4a4c-a6a9-066880ffefb5.json deleted file mode 100644 index 3fc4a754a..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_010_CHAT/d3ca0458-ee97-4a4c-a6a9-066880ffefb5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_010_CHAT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_010_CHAT", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_010_CHAT", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2507 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3336 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4137 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT/615bf89b-9357-46f4-82ed-f49b0021da01.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT/615bf89b-9357-46f4-82ed-f49b0021da01.json deleted file mode 100644 index 877f891f4..000000000 --- 
a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT/615bf89b-9357-46f4-82ed-f49b0021da01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_011_INSTRUCT", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3149 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1595 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML/06398630-23ad-4000-8ea2-fcca230568d7.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML/06398630-23ad-4000-8ea2-fcca230568d7.json deleted file mode 100644 index 5534cf616..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML/06398630-23ad-4000-8ea2-fcca230568d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT_ML/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_011_INSTRUCT_ML", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2019 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1/bdfa30f8-da0f-418f-adaf-caafda4c81a5.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1/bdfa30f8-da0f-418f-adaf-caafda4c81a5.json deleted file mode 100644 index 9e0ca9e6d..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1/bdfa30f8-da0f-418f-adaf-caafda4c81a5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4858 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3921 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2956 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/bd5e550c-5355-4e01-bafc-2ca89899253a.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/bd5e550c-5355-4e01-bafc-2ca89899253a.json deleted file mode 100644 index 05508c5e5..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/bd5e550c-5355-4e01-bafc-2ca89899253a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_IA", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4577 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2318 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/f842ad5b-24f0-419b-9d65-5a6ff1f5e04b.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/f842ad5b-24f0-419b-9d65-5a6ff1f5e04b.json deleted file mode 100644 index 6bafa117f..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/f842ad5b-24f0-419b-9d65-5a6ff1f5e04b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"SpydazWeb_AI_HumanAI_012_INSTRUCT_IA", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3036 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_MX/3a09590f-28f3-4161-8a93-d42cec62aa90.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_MX/3a09590f-28f3-4161-8a93-d42cec62aa90.json deleted file mode 100644 index fad8789d2..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_MX/3a09590f-28f3-4161-8a93-d42cec62aa90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_MX/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_MX", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_MX", - "developer": "LeroyDyer", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3158 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1107 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/0f6b76ca-c4b8-40b2-a3af-2ea1c3650933.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/0f6b76ca-c4b8-40b2-a3af-2ea1c3650933.json deleted file mode 100644 index 33826b74e..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/0f6b76ca-c4b8-40b2-a3af-2ea1c3650933.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_XA", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - 
"params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4477 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4134 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/f276ad54-4e3b-4718-ae1f-0479565e4565.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/f276ad54-4e3b-4718-ae1f-0479565e4565.json deleted file mode 100644 index 4d6c638c9..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/f276ad54-4e3b-4718-ae1f-0479565e4565.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_XA", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3798 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4483 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_RP/dec20396-6555-4773-bf02-2cd1fcedda89.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_RP/dec20396-6555-4773-bf02-2cd1fcedda89.json deleted file mode 100644 index 35280576b..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_RP/dec20396-6555-4773-bf02-2cd1fcedda89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_RP", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_RP", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2541 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3323 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1324 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_TextVision/eebc33e1-0016-4adf-815a-72653a34c01b.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_TextVision/eebc33e1-0016-4adf-815a-72653a34c01b.json deleted file mode 100644 index b478f0a27..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_TextVision/eebc33e1-0016-4adf-815a-72653a34c01b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_TextVision/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_TextVision", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_TextVision", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M1/803c3898-c1a6-4832-ac3a-a86139489810.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M1/803c3898-c1a6-4832-ac3a-a86139489810.json deleted file mode 100644 index 0bd353a92..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M1/803c3898-c1a6-4832-ac3a-a86139489810.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_HumanAI_M1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_HumanAI_M1", - "id": "LeroyDyer/SpydazWeb_HumanAI_M1", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.3563 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1663 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M2/bfaa3d3e-66fd-4477-85af-4b83f13ff05b.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M2/bfaa3d3e-66fd-4477-85af-4b83f13ff05b.json deleted file mode 100644 index 3df076af2..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M2/bfaa3d3e-66fd-4477-85af-4b83f13ff05b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_HumanAI_M2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_HumanAI_M2", - "id": "LeroyDyer/SpydazWeb_HumanAI_M2", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3931 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3751 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.201 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M3/99debdd2-1dea-4eb6-be5c-c144656cfe20.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M3/99debdd2-1dea-4eb6-be5c-c144656cfe20.json deleted file mode 100644 index 5211d3064..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M3/99debdd2-1dea-4eb6-be5c-c144656cfe20.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_HumanAI_M3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_HumanAI_M3", - "id": "LeroyDyer/SpydazWeb_HumanAI_M3", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1579 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1149 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_12/ad67bb88-7f74-4eb4-b771-0b3b60be4416.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_12/ad67bb88-7f74-4eb4-b771-0b3b60be4416.json deleted file mode 100644 index 8fb0a66ae..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_12/ad67bb88-7f74-4eb4-b771-0b3b60be4416.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_12/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_12", - "id": "LeroyDyer/_Spydaz_Web_AI_12", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2765 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_14/af2f579d-1e8a-47d8-8e44-a599bee83e37.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_14/af2f579d-1e8a-47d8-8e44-a599bee83e37.json deleted file mode 100644 index 6d3423046..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_14/af2f579d-1e8a-47d8-8e44-a599bee83e37.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_14/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_14", - "id": "LeroyDyer/_Spydaz_Web_AI_14", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1812 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2989 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_001/763c840e-ea73-453e-8e54-5f4fd6fda9cd.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_001/763c840e-ea73-453e-8e54-5f4fd6fda9cd.json deleted file mode 100644 index 5e467d10a..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_001/763c840e-ea73-453e-8e54-5f4fd6fda9cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_001", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_001", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4609 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_002/4fb40ac4-a637-4b9a-b69d-ba551c0f0938.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_002/4fb40ac4-a637-4b9a-b69d-ba551c0f0938.json deleted file mode 100644 index bedce984a..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_002/4fb40ac4-a637-4b9a-b69d-ba551c0f0938.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_002", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_002", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5307 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4683 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4255 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MUSR/ffc4ef41-4a28-4816-be54-8ffd8e153073.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MUSR/ffc4ef41-4a28-4816-be54-8ffd8e153073.json deleted file mode 100644 index ccb933fa1..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MUSR/ffc4ef41-4a28-4816-be54-8ffd8e153073.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_MUSR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_MUSR", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_MUSR", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2828 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MasterCoder/f75fe902-f1c7-4e6c-87d6-128688db8d94.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MasterCoder/f75fe902-f1c7-4e6c-87d6-128688db8d94.json deleted file mode 100644 index b1dc3496d..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MasterCoder/f75fe902-f1c7-4e6c-87d6-128688db8d94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_MasterCoder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_MasterCoder", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_MasterCoder", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4689 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.472 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_001/dbd3098b-4532-441b-a81c-072c52579be6.json 
b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_001/dbd3098b-4532-441b-a81c-072c52579be6.json deleted file mode 100644 index 96985ae3e..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_001/dbd3098b-4532-441b-a81c-072c52579be6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Math_001", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_001", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4571 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4818 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2681 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_003/438e4aa3-5e02-446e-bd3a-07ef724d24ff.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_003/438e4aa3-5e02-446e-bd3a-07ef724d24ff.json deleted file mode 100644 index d2edae906..000000000 --- 
a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_003/438e4aa3-5e02-446e-bd3a-07ef724d24ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_003/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Math_003", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_003", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4756 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2999 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent/027fdc55-61eb-416c-b6ad-4408912d151b.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent/027fdc55-61eb-416c-b6ad-4408912d151b.json deleted file mode 100644 index e3bc05a1b..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent/027fdc55-61eb-416c-b6ad-4408912d151b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5951 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4927 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Student/37a4895d-def5-494d-9b62-d8c97ba9350b.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Student/37a4895d-def5-494d-9b62-d8c97ba9350b.json deleted file mode 100644 index d67104ff1..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Student/37a4895d-def5-494d-9b62-d8c97ba9350b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_Student/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Math_Student", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Student", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5736 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4881 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2927 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Teacher/0d53c27e-962c-428f-b540-35ab027883a8.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Teacher/0d53c27e-962c-428f-b540-35ab027883a8.json deleted file mode 100644 index 173bb8257..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Teacher/0d53c27e-962c-428f-b540-35ab027883a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_Teacher/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Math_Teacher", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Teacher", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5772 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4805 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2956 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_001/6f7b2d91-24d6-442c-93a5-9afc88e9a308.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_001/6f7b2d91-24d6-442c-93a5-9afc88e9a308.json deleted file mode 100644 index a4540e902..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_001/6f7b2d91-24d6-442c-93a5-9afc88e9a308.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_OmG_001", - "id": 
"LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_001", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5818 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2906 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_002/21793520-7d1a-4040-bb96-fa7fe98ae580.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_002/21793520-7d1a-4040-bb96-fa7fe98ae580.json deleted file mode 100644 index c42060e34..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_002/21793520-7d1a-4040-bb96-fa7fe98ae580.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_OmG_002", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_002", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4511 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2867 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Coder/59d53c40-5b16-4a70-a693-5fb554cf7614.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Coder/59d53c40-5b16-4a70-a693-5fb554cf7614.json deleted file mode 100644 index 3df532609..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Coder/59d53c40-5b16-4a70-a693-5fb554cf7614.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_Coder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_OmG_Coder", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Coder", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4638 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5625 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.289 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Math/b28a569c-6bdf-4547-a2ce-c3e224764be3.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Math/b28a569c-6bdf-4547-a2ce-c3e224764be3.json deleted file mode 100644 index 481f7a4c2..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Math/b28a569c-6bdf-4547-a2ce-c3e224764be3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_Math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_OmG_Math", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Math", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4677 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2913 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_MathMaster/2de129c8-2259-4367-a619-85d9e8f61e06.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_MathMaster/2de129c8-2259-4367-a619-85d9e8f61e06.json deleted file mode 100644 index b5ee95324..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_MathMaster/2de129c8-2259-4367-a619-85d9e8f61e06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_MathMaster/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_OmG_MathMaster", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_MathMaster", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5558 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4742 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Student_Coder/c242030f-fb2b-42dc-a5d1-687273b17282.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Student_Coder/c242030f-fb2b-42dc-a5d1-687273b17282.json deleted file mode 100644 index d5484f88e..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Student_Coder/c242030f-fb2b-42dc-a5d1-687273b17282.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Student_Coder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Student_Coder", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Student_Coder", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.545 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4651 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Teacher_Coder/3b3fdb16-b6e1-40c8-9ac0-02f1f2207eb7.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Teacher_Coder/3b3fdb16-b6e1-40c8-9ac0-02f1f2207eb7.json deleted file mode 100644 index f0fe06eb1..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Teacher_Coder/3b3fdb16-b6e1-40c8-9ac0-02f1f2207eb7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Teacher_Coder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Teacher_Coder", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Teacher_Coder", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4797 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2845 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Top_Student/ef6e8e0d-7ba4-45ea-aaf7-617f68f2e97c.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Top_Student/ef6e8e0d-7ba4-45ea-aaf7-617f68f2e97c.json deleted file mode 100644 index bf973d06a..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Top_Student/ef6e8e0d-7ba4-45ea-aaf7-617f68f2e97c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Top_Student/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Top_Student", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Top_Student", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4988 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3024 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X1/f8c131a4-1fee-4694-8753-88853418ef4b.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X1/f8c131a4-1fee-4694-8753-88853418ef4b.json deleted file mode 100644 index 6d959d14f..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X1/f8c131a4-1fee-4694-8753-88853418ef4b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_X1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_X1", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_X1", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4759 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2891 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X2/27dec9ff-fb18-43dd-949f-7c0587a5858f.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X2/27dec9ff-fb18-43dd-949f-7c0587a5858f.json deleted file mode 100644 index 0a0055521..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X2/27dec9ff-fb18-43dd-949f-7c0587a5858f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_X2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_X2", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_X2", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5434 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4786 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4695 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2921 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_RP_R1/060df34d-ab67-43e1-bd56-ebaceb77abd3.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_RP_R1/060df34d-ab67-43e1-bd56-ebaceb77abd3.json deleted file mode 100644 index b431b930c..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_RP_R1/060df34d-ab67-43e1-bd56-ebaceb77abd3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_RP_R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_RP_R1", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_RP_R1", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5426 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4701 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4201 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_BIBLE_002/a6357673-3daa-4593-8593-2b65a7d5477e.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_BIBLE_002/a6357673-3daa-4593-8593-2b65a7d5477e.json deleted file mode 100644 index a62a6bb3e..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_BIBLE_002/a6357673-3daa-4593-8593-2b65a7d5477e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_BIBLE_002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_BIBLE_002", - "id": "LeroyDyer/_Spydaz_Web_AI_BIBLE_002", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2195 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatML_002/121d4877-1955-48db-a23a-6b0ad0623b9e.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatML_002/121d4877-1955-48db-a23a-6b0ad0623b9e.json deleted file mode 100644 index d97ec510d..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatML_002/121d4877-1955-48db-a23a-6b0ad0623b9e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_ChatML_002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_ChatML_002", - "id": "LeroyDyer/_Spydaz_Web_AI_ChatML_002", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3106 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3623 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA/1f1eab02-219e-4ad8-af50-e103541e1c9d.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA/1f1eab02-219e-4ad8-af50-e103541e1c9d.json deleted file mode 100644 index 21baf209e..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA/1f1eab02-219e-4ad8-af50-e103541e1c9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_ChatQA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_ChatQA", - "id": "LeroyDyer/_Spydaz_Web_AI_ChatQA", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1415 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3236 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA_003/b4cccfb3-1c17-48a3-a211-a26c44de757f.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA_003/b4cccfb3-1c17-48a3-a211-a26c44de757f.json deleted file mode 100644 index 5a698b13e..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA_003/b4cccfb3-1c17-48a3-a211-a26c44de757f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_ChatQA_003/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_ChatQA_003", - "id": "LeroyDyer/_Spydaz_Web_AI_ChatQA_003", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2209 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_TEMP_/05e97a86-681d-42a2-8a47-beade25d8fc9.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_TEMP_/05e97a86-681d-42a2-8a47-beade25d8fc9.json deleted file mode 100644 index 09c825d37..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_TEMP_/05e97a86-681d-42a2-8a47-beade25d8fc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_TEMP_/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_TEMP_", - "id": "LeroyDyer/_Spydaz_Web_AI_TEMP_", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4957 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4218 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_Top_Teacher_/6c0899b4-f066-45f6-827d-11c535ef0634.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_Top_Teacher_/6c0899b4-f066-45f6-827d-11c535ef0634.json deleted file mode 100644 index 
c36cff39f..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_Top_Teacher_/6c0899b4-f066-45f6-827d-11c535ef0634.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_Top_Teacher_/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_Top_Teacher_", - "id": "LeroyDyer/_Spydaz_Web_AI_Top_Teacher_", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4891 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.0/f9660557-b9f6-4ecc-b260-c245f0e62b5b.json b/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.0/f9660557-b9f6-4ecc-b260-c245f0e62b5b.json deleted file mode 100644 index b9c99ac4d..000000000 --- a/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.0/f9660557-b9f6-4ecc-b260-c245f0e62b5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/LightningRodLabs_Flashlight-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Flashlight-v1.0", - "id": "LightningRodLabs/Flashlight-v1.0", - "developer": "LightningRodLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6745 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6877 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.1/89168032-5840-4c2c-821e-b3d717ade46f.json b/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.1/89168032-5840-4c2c-821e-b3d717ade46f.json deleted file mode 100644 index 4c4c25448..000000000 --- a/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.1/89168032-5840-4c2c-821e-b3d717ade46f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LightningRodLabs_Flashlight-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Flashlight-v1.1", - "id": "LightningRodLabs/Flashlight-v1.1", - "developer": "LightningRodLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6901 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.2/10d0aa63-67d9-4dba-9bdc-db7ab3b4547d.json b/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.2/10d0aa63-67d9-4dba-9bdc-db7ab3b4547d.json deleted file mode 100644 index 5dd6485a7..000000000 --- a/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.2/10d0aa63-67d9-4dba-9bdc-db7ab3b4547d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LightningRodLabs_Flashlight-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Flashlight-v1.2", - "id": "LightningRodLabs/Flashlight-v1.2", - "developer": "LightningRodLabs", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2357 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2485 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V1/6f66ae5b-8cb6-4263-98a4-4a1eddfaca10.json b/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V1/6f66ae5b-8cb6-4263-98a4-4a1eddfaca10.json deleted file mode 100644 index 9f4103403..000000000 --- a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V1/6f66ae5b-8cb6-4263-98a4-4a1eddfaca10.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_2_PRYMMAL-ECE-2B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2_PRYMMAL-ECE-2B-SLERP-V1", - "id": "Lil-R/2_PRYMMAL-ECE-2B-SLERP-V1", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5823 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2678 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V2/5e715199-7030-47b4-89c6-83ba0968c07c.json b/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V2/5e715199-7030-47b4-89c6-83ba0968c07c.json deleted file mode 100644 index 343a94755..000000000 --- a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V2/5e715199-7030-47b4-89c6-83ba0968c07c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_2_PRYMMAL-ECE-2B-SLERP-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2_PRYMMAL-ECE-2B-SLERP-V2", - "id": "Lil-R/2_PRYMMAL-ECE-2B-SLERP-V2", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5543 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2744 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V1/3fca39e8-443d-47da-a858-83a68c18eec9.json b/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V1/3fca39e8-443d-47da-a858-83a68c18eec9.json deleted file mode 100644 index e5d8c93e6..000000000 --- a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V1/3fca39e8-443d-47da-a858-83a68c18eec9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2_PRYMMAL-ECE-7B-SLERP-V1", - "id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP-V1", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3053 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V2/b7518bd2-d3af-49e6-823a-f8d507e8e60f.json b/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V2/b7518bd2-d3af-49e6-823a-f8d507e8e60f.json deleted file mode 100644 index 6709db764..000000000 --- a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V2/b7518bd2-d3af-49e6-823a-f8d507e8e60f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2_PRYMMAL-ECE-7B-SLERP-V2", - "id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP-V2", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3053 - } 
- }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V3/fa399f16-1652-430c-be19-afaf5ab96be1.json b/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V3/fa399f16-1652-430c-be19-afaf5ab96be1.json deleted file mode 100644 index 04700dc51..000000000 --- a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V3/fa399f16-1652-430c-be19-afaf5ab96be1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2_PRYMMAL-ECE-7B-SLERP-V3", - "id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP-V3", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4107 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP/cbe5032b-122c-4a0b-a099-50e998a4bc77.json b/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP/cbe5032b-122c-4a0b-a099-50e998a4bc77.json deleted file mode 100644 index ad0887f0c..000000000 --- a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP/cbe5032b-122c-4a0b-a099-50e998a4bc77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_2_PRYMMAL-ECE-7B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2_PRYMMAL-ECE-7B-SLERP", - "id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5577 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5557 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4507 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-1B-SLERP-V1/fd8c3209-dcc0-4d27-a3aa-d0f76ef86f8d.json b/data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-1B-SLERP-V1/fd8c3209-dcc0-4d27-a3aa-d0f76ef86f8d.json deleted file mode 100644 index 82de476ba..000000000 --- a/data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-1B-SLERP-V1/fd8c3209-dcc0-4d27-a3aa-d0f76ef86f8d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_PRYMMAL-ECE-1B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-ECE-1B-SLERP-V1", - "id": "Lil-R/PRYMMAL-ECE-1B-SLERP-V1", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2874 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3974 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2926 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-7B-SLERP-V8/1a18d49c-ad7b-4823-abbc-7191e9d659cd.json b/data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-7B-SLERP-V8/1a18d49c-ad7b-4823-abbc-7191e9d659cd.json deleted file mode 100644 index 1bab75b49..000000000 --- a/data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-7B-SLERP-V8/1a18d49c-ad7b-4823-abbc-7191e9d659cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_PRYMMAL-ECE-7B-SLERP-V8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-ECE-7B-SLERP-V8", - "id": "Lil-R/PRYMMAL-ECE-7B-SLERP-V8", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1258 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/10PRYMMAL-3B-slerp/9e2c614e-1104-43a6-9e8f-b7851562e01a.json b/data/hfopenllm_v2/LilRg/10PRYMMAL-3B-slerp/9e2c614e-1104-43a6-9e8f-b7851562e01a.json deleted file mode 100644 index 3ab3feb1c..000000000 --- a/data/hfopenllm_v2/LilRg/10PRYMMAL-3B-slerp/9e2c614e-1104-43a6-9e8f-b7851562e01a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_10PRYMMAL-3B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "10PRYMMAL-3B-slerp", - "id": "LilRg/10PRYMMAL-3B-slerp", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1946 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1495 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/ECE-1B-merge-PRYMMAL/7d4b83ab-9c9d-46e5-8cbf-b8afcf781230.json b/data/hfopenllm_v2/LilRg/ECE-1B-merge-PRYMMAL/7d4b83ab-9c9d-46e5-8cbf-b8afcf781230.json deleted file mode 100644 index 93bc8cf86..000000000 --- a/data/hfopenllm_v2/LilRg/ECE-1B-merge-PRYMMAL/7d4b83ab-9c9d-46e5-8cbf-b8afcf781230.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_ECE-1B-merge-PRYMMAL/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-1B-merge-PRYMMAL", - "id": "LilRg/ECE-1B-merge-PRYMMAL", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2712 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3801 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2906 - } - } - ] -} 
\ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/ECE_Finetunning/a42b5d7e-be7f-4cde-aaf0-001e2cf05a44.json b/data/hfopenllm_v2/LilRg/ECE_Finetunning/a42b5d7e-be7f-4cde-aaf0-001e2cf05a44.json deleted file mode 100644 index fd422f5f3..000000000 --- a/data/hfopenllm_v2/LilRg/ECE_Finetunning/a42b5d7e-be7f-4cde-aaf0-001e2cf05a44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_ECE_Finetunning/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE_Finetunning", - "id": "LilRg/ECE_Finetunning", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 16.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4732 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3839 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3191 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/PRYMMAL-6B-slerp/21f6688c-be52-4352-9c95-d37c0a5f6c94.json b/data/hfopenllm_v2/LilRg/PRYMMAL-6B-slerp/21f6688c-be52-4352-9c95-d37c0a5f6c94.json deleted file mode 100644 index ecbfdcf37..000000000 --- 
a/data/hfopenllm_v2/LilRg/PRYMMAL-6B-slerp/21f6688c-be52-4352-9c95-d37c0a5f6c94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_PRYMMAL-6B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-6B-slerp", - "id": "LilRg/PRYMMAL-6B-slerp", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.293 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1153 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2868 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V3/e92ba586-7bee-4a9b-b388-e35efde3d36f.json b/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V3/e92ba586-7bee-4a9b-b388-e35efde3d36f.json deleted file mode 100644 index e89c0598f..000000000 --- a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V3/e92ba586-7bee-4a9b-b388-e35efde3d36f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_PRYMMAL-ECE-7B-SLERP-V3/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-ECE-7B-SLERP-V3", - "id": "LilRg/PRYMMAL-ECE-7B-SLERP-V3", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1243 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2957 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V4/45ed0bb3-efbf-4a32-9735-d814aa08790a.json b/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V4/45ed0bb3-efbf-4a32-9735-d814aa08790a.json deleted file mode 100644 index caaf555af..000000000 --- a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V4/45ed0bb3-efbf-4a32-9735-d814aa08790a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_PRYMMAL-ECE-7B-SLERP-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"PRYMMAL-ECE-7B-SLERP-V4", - "id": "LilRg/PRYMMAL-ECE-7B-SLERP-V4", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2957 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V5/eff28375-89a7-4970-9342-428b07d0c6f4.json b/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V5/eff28375-89a7-4970-9342-428b07d0c6f4.json deleted file mode 100644 index 17c5c7e82..000000000 --- a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V5/eff28375-89a7-4970-9342-428b07d0c6f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_PRYMMAL-ECE-7B-SLERP-V5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-ECE-7B-SLERP-V5", - "id": "LilRg/PRYMMAL-ECE-7B-SLERP-V5", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - 
} - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2957 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V6/23877e30-b8fb-45ea-a803-47df757ea909.json b/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V6/23877e30-b8fb-45ea-a803-47df757ea909.json deleted file mode 100644 index ec37cc975..000000000 --- a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V6/23877e30-b8fb-45ea-a803-47df757ea909.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_PRYMMAL-ECE-7B-SLERP-V6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-ECE-7B-SLERP-V6", - "id": "LilRg/PRYMMAL-ECE-7B-SLERP-V6", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1243 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2957 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V7/8bc25d04-9cc5-4551-a9c5-ce185c7ad974.json b/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V7/8bc25d04-9cc5-4551-a9c5-ce185c7ad974.json deleted file mode 100644 index 06e5be7ee..000000000 --- a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V7/8bc25d04-9cc5-4551-a9c5-ce185c7ad974.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_PRYMMAL-ECE-7B-SLERP-V7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-ECE-7B-SLERP-V7", - "id": "LilRg/PRYMMAL-ECE-7B-SLERP-V7", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2957 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/PRYMMAL-slerp-Merge/d2d4b5a5-109d-4d26-a166-3d97b341584e.json b/data/hfopenllm_v2/LilRg/PRYMMAL-slerp-Merge/d2d4b5a5-109d-4d26-a166-3d97b341584e.json deleted file mode 100644 index 3ff38766a..000000000 --- a/data/hfopenllm_v2/LilRg/PRYMMAL-slerp-Merge/d2d4b5a5-109d-4d26-a166-3d97b341584e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_PRYMMAL-slerp-Merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-slerp-Merge", - "id": "LilRg/PRYMMAL-slerp-Merge", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3044 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.5364 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4635 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3863 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v2-merged/ac404d92-7a06-4758-ab1d-fcf840c2b995.json b/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v2-merged/ac404d92-7a06-4758-ab1d-fcf840c2b995.json deleted file mode 100644 index c0faabec4..000000000 --- a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v2-merged/ac404d92-7a06-4758-ab1d-fcf840c2b995.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LimYeri_CodeMind-Llama3-8B-unsloth_v2-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CodeMind-Llama3-8B-unsloth_v2-merged", - "id": "LimYeri/CodeMind-Llama3-8B-unsloth_v2-merged", - "developer": "LimYeri", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6946 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3506 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v3-merged/95ea7fbf-d3f2-4fc1-ba17-05549f6e4d25.json b/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v3-merged/95ea7fbf-d3f2-4fc1-ba17-05549f6e4d25.json deleted file mode 100644 index 535530ceb..000000000 --- a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v3-merged/95ea7fbf-d3f2-4fc1-ba17-05549f6e4d25.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LimYeri_CodeMind-Llama3-8B-unsloth_v3-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CodeMind-Llama3-8B-unsloth_v3-merged", - "id": "LimYeri/CodeMind-Llama3-8B-unsloth_v3-merged", - "developer": "LimYeri", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged/c101e272-24d2-44db-9b0f-2ed4d17cec41.json b/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged/c101e272-24d2-44db-9b0f-2ed4d17cec41.json deleted file mode 100644 index c1155f027..000000000 --- a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged/c101e272-24d2-44db-9b0f-2ed4d17cec41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LimYeri_CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged", - "id": "LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged", - "developer": "LimYeri", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6492 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4853 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - 
} - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3608 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged/2cb789c7-dddf-42b2-8fdf-4cbd5132946c.json b/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged/2cb789c7-dddf-42b2-8fdf-4cbd5132946c.json deleted file mode 100644 index 14803d71b..000000000 --- a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged/2cb789c7-dddf-42b2-8fdf-4cbd5132946c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LimYeri_CodeMind-Llama3-8B-unsloth_v4-one-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CodeMind-Llama3-8B-unsloth_v4-one-merged", - "id": "LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged", - "developer": "LimYeri", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3211 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4739 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3.1-8B-unsloth-merged/a414aefd-ce24-49a9-b431-0c6014ebfbd8.json b/data/hfopenllm_v2/LimYeri/CodeMind-Llama3.1-8B-unsloth-merged/a414aefd-ce24-49a9-b431-0c6014ebfbd8.json deleted file mode 100644 index 323a4d3be..000000000 --- a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3.1-8B-unsloth-merged/a414aefd-ce24-49a9-b431-0c6014ebfbd8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LimYeri_CodeMind-Llama3.1-8B-unsloth-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CodeMind-Llama3.1-8B-unsloth-merged", - "id": "LimYeri/CodeMind-Llama3.1-8B-unsloth-merged", - "developer": "LimYeri", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4695 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.334 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Locutusque/CollectiveLM-Falcon-3-7B/91fcb6a3-d351-48c8-87e8-e2a06642e925.json b/data/hfopenllm_v2/Locutusque/CollectiveLM-Falcon-3-7B/91fcb6a3-d351-48c8-87e8-e2a06642e925.json deleted file mode 100644 index 95e472469..000000000 --- a/data/hfopenllm_v2/Locutusque/CollectiveLM-Falcon-3-7B/91fcb6a3-d351-48c8-87e8-e2a06642e925.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Locutusque_CollectiveLM-Falcon-3-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CollectiveLM-Falcon-3-7B", - "id": "Locutusque/CollectiveLM-Falcon-3-7B", - "developer": "Locutusque", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3887 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Locutusque/Hercules-6.0-Llama-3.1-8B/3cd90efa-ddf0-43c4-884c-84337ded14b2.json b/data/hfopenllm_v2/Locutusque/Hercules-6.0-Llama-3.1-8B/3cd90efa-ddf0-43c4-884c-84337ded14b2.json deleted file mode 100644 index 200257158..000000000 --- a/data/hfopenllm_v2/Locutusque/Hercules-6.0-Llama-3.1-8B/3cd90efa-ddf0-43c4-884c-84337ded14b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Locutusque_Hercules-6.0-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hercules-6.0-Llama-3.1-8B", - "id": "Locutusque/Hercules-6.0-Llama-3.1-8B", - "developer": "Locutusque", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.663 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4813 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Locutusque/Hercules-6.1-Llama-3.1-8B/c66c21e9-a332-40f9-ae87-bdd78a25d753.json b/data/hfopenllm_v2/Locutusque/Hercules-6.1-Llama-3.1-8B/c66c21e9-a332-40f9-ae87-bdd78a25d753.json deleted file mode 100644 index c70788d79..000000000 --- a/data/hfopenllm_v2/Locutusque/Hercules-6.1-Llama-3.1-8B/c66c21e9-a332-40f9-ae87-bdd78a25d753.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Locutusque_Hercules-6.1-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hercules-6.1-Llama-3.1-8B", - "id": "Locutusque/Hercules-6.1-Llama-3.1-8B", - "developer": "Locutusque", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6007 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4656 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3553 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Locutusque/Llama-3-NeuralHercules-5.0-8B/0b4def91-29df-45d9-8dd4-c4097ec47ba3.json b/data/hfopenllm_v2/Locutusque/Llama-3-NeuralHercules-5.0-8B/0b4def91-29df-45d9-8dd4-c4097ec47ba3.json deleted file mode 100644 index 289875aa0..000000000 --- a/data/hfopenllm_v2/Locutusque/Llama-3-NeuralHercules-5.0-8B/0b4def91-29df-45d9-8dd4-c4097ec47ba3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Locutusque_Llama-3-NeuralHercules-5.0-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-NeuralHercules-5.0-8B", - "id": "Locutusque/Llama-3-NeuralHercules-5.0-8B", - "developer": "Locutusque", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2933 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Locutusque/Llama-3-Yggdrasil-2.0-8B/2cbf258c-369e-4b1c-863f-43cf97c3a7a4.json b/data/hfopenllm_v2/Locutusque/Llama-3-Yggdrasil-2.0-8B/2cbf258c-369e-4b1c-863f-43cf97c3a7a4.json deleted file mode 100644 index 04066b41d..000000000 --- a/data/hfopenllm_v2/Locutusque/Llama-3-Yggdrasil-2.0-8B/2cbf258c-369e-4b1c-863f-43cf97c3a7a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Locutusque_Llama-3-Yggdrasil-2.0-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Yggdrasil-2.0-8B", - "id": "Locutusque/Llama-3-Yggdrasil-2.0-8B", - "developer": "Locutusque", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4772 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Locutusque/TinyMistral-248M-v2.5/8372889e-f9cd-4cf7-aec0-8e18d5c627e3.json 
b/data/hfopenllm_v2/Locutusque/TinyMistral-248M-v2.5/8372889e-f9cd-4cf7-aec0-8e18d5c627e3.json deleted file mode 100644 index f44380bf2..000000000 --- a/data/hfopenllm_v2/Locutusque/TinyMistral-248M-v2.5/8372889e-f9cd-4cf7-aec0-8e18d5c627e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Locutusque_TinyMistral-248M-v2.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyMistral-248M-v2.5", - "id": "Locutusque/TinyMistral-248M-v2.5", - "developer": "Locutusque", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 0.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3039 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1135 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Luni/StarDust-12b-v1/ce4cc270-57da-4d08-9130-62508b409cb2.json b/data/hfopenllm_v2/Luni/StarDust-12b-v1/ce4cc270-57da-4d08-9130-62508b409cb2.json deleted file mode 100644 index cba3a4a90..000000000 --- a/data/hfopenllm_v2/Luni/StarDust-12b-v1/ce4cc270-57da-4d08-9130-62508b409cb2.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Luni_StarDust-12b-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StarDust-12b-v1", - "id": "Luni/StarDust-12b-v1", - "developer": "Luni", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5459 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5366 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4324 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Luni/StarDust-12b-v2/4cfedb8f-0e47-4008-9bc5-fb15e4afa607.json b/data/hfopenllm_v2/Luni/StarDust-12b-v2/4cfedb8f-0e47-4008-9bc5-fb15e4afa607.json deleted file mode 100644 index b3c1d3a2d..000000000 --- a/data/hfopenllm_v2/Luni/StarDust-12b-v2/4cfedb8f-0e47-4008-9bc5-fb15e4afa607.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Luni_StarDust-12b-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StarDust-12b-v2", - "id": "Luni/StarDust-12b-v2", - "developer": "Luni", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5629 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3439 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v3/de3c949d-bab5-4430-bdd1-48e1b7860934.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v3/de3c949d-bab5-4430-bdd1-48e1b7860934.json deleted file mode 100644 index a9ed180e4..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v3/de3c949d-bab5-4430-bdd1-48e1b7860934.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v3", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v3", - "developer": "Lunzima", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7049 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5394 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v4/011e53cd-409f-479b-9c3d-bfce75a1277b.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v4/011e53cd-409f-479b-9c3d-bfce75a1277b.json deleted file mode 100644 index 77effc1ca..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v4/011e53cd-409f-479b-9c3d-bfce75a1277b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v4", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v4", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6943 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3467 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4769 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5252 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v5/1ff40e45-5be4-4625-9f66-5599a829903d.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v5/1ff40e45-5be4-4625-9f66-5599a829903d.json deleted file mode 100644 index 5eaad0555..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v5/1ff40e45-5be4-4625-9f66-5599a829903d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v5", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v5", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7485 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6467 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt/fed97d94-2949-4383-8f25-fa79bd413508.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt/fed97d94-2949-4383-8f25-fa79bd413508.json deleted file mode 100644 index 264dd2b3d..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt/fed97d94-2949-4383-8f25-fa79bd413508.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4663 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4937 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6/f4820bc8-7dfd-4439-af95-21b6cc9367ac.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6/f4820bc8-7dfd-4439-af95-21b6cc9367ac.json deleted file mode 100644 index 37d866ec4..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6/f4820bc8-7dfd-4439-af95-21b6cc9367ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v6", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6458 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3958 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase/36e576bb-de50-49ec-a91f-f134c11bbe38.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase/36e576bb-de50-49ec-a91f-f134c11bbe38.json deleted file mode 100644 index fbc0a8b94..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase/36e576bb-de50-49ec-a91f-f134c11bbe38.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6931 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6423 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4888 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5277 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7/0edd388b-7a1b-4334-9b72-52d84653ff67.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7/0edd388b-7a1b-4334-9b72-52d84653ff67.json deleted file mode 100644 index a93457aeb..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7/0edd388b-7a1b-4334-9b72-52d84653ff67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v7", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6794 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.5/b3199674-328e-41a0-9aa4-bf39aec735bc.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.5/b3199674-328e-41a0-9aa4-bf39aec735bc.json deleted file mode 100644 index 948f8741b..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.5/b3199674-328e-41a0-9aa4-bf39aec735bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.5", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.5", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5929 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6451 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.477 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.6/52db4d79-7040-4525-934e-0f33e4acec63.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.6/52db4d79-7040-4525-934e-0f33e4acec63.json deleted file mode 100644 index 0b2f0fd81..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.6/52db4d79-7040-4525-934e-0f33e4acec63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.6", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.6", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5919 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6457 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": 
{ - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4953 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.7/ee34821e-9182-433f-a8b0-745711e23738.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.7/ee34821e-9182-433f-a8b0-745711e23738.json deleted file mode 100644 index 5a366fbb8..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.7/ee34821e-9182-433f-a8b0-745711e23738.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.7", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.7", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7875 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6483 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5242 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.8/10ef0990-5356-432f-b24c-dd107188ec5f.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.8/10ef0990-5356-432f-b24c-dd107188ec5f.json deleted file mode 100644 index be4b45ebc..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.8/10ef0990-5356-432f-b24c-dd107188ec5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.8", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.8", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7028 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6566 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": 
"MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4912 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5323 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.9/47de680d-33b1-4441-92da-4b97a5fc513f.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.9/47de680d-33b1-4441-92da-4b97a5fc513f.json deleted file mode 100644 index f10ff96d2..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.9/47de680d-33b1-4441-92da-4b97a5fc513f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.9", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.9", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7993 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6483 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5199 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8/96ac0351-2ade-4d76-bcf9-bc0f633f8694.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8/96ac0351-2ade-4d76-bcf9-bc0f633f8694.json deleted file mode 100644 index e0b9d6239..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8/96ac0351-2ade-4d76-bcf9-bc0f633f8694.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7875 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4394 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5206 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9-stock/31aae266-c14b-451f-8bab-62ee7d5d382e.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9-stock/31aae266-c14b-451f-8bab-62ee7d5d382e.json deleted file mode 100644 index 0e46ee73a..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9-stock/31aae266-c14b-451f-8bab-62ee7d5d382e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9-stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9-stock", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9-stock", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6514 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6571 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.1/f6edb102-e867-46d1-afdc-3c45166bd510.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.1/f6edb102-e867-46d1-afdc-3c45166bd510.json deleted file mode 100644 index 45123ecdc..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.1/f6edb102-e867-46d1-afdc-3c45166bd510.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9.1", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.1", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8003 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6555 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5251 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.2/8b7756cc-9af3-4f98-84ac-7fef4c1bdaa0.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.2/8b7756cc-9af3-4f98-84ac-7fef4c1bdaa0.json deleted file mode 100644 index 3c1854b85..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.2/8b7756cc-9af3-4f98-84ac-7fef4c1bdaa0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9.2", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.2", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7862 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6538 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5283 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9/dcf33a22-5e57-4476-a2cb-ebd60407a920.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9/dcf33a22-5e57-4476-a2cb-ebd60407a920.json deleted file mode 100644 index f8f18e3db..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9/dcf33a22-5e57-4476-a2cb-ebd60407a920.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6546 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4806 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-OriginalFusion/15659480-be0b-41c8-a463-873be444b194.json 
b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-OriginalFusion/15659480-be0b-41c8-a463-873be444b194.json deleted file mode 100644 index 179cbfc2b..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-OriginalFusion/15659480-be0b-41c8-a463-873be444b194.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-OriginalFusion/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-OriginalFusion", - "id": "Lunzima/NQLSG-Qwen2.5-14B-OriginalFusion", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6142 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6592 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5122 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5239 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lyte/Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3/0444c1bf-a3d3-4d23-bc6c-0a98c4dc1e9d.json b/data/hfopenllm_v2/Lyte/Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3/0444c1bf-a3d3-4d23-bc6c-0a98c4dc1e9d.json deleted file mode 100644 index 3b3f7cc5e..000000000 --- 
a/data/hfopenllm_v2/Lyte/Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3/0444c1bf-a3d3-4d23-bc6c-0a98c4dc1e9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lyte_Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3", - "id": "Lyte/Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3", - "developer": "Lyte", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7098 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1903 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lyte/Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04/93aa3a13-5069-410f-a1df-6944e0231e0e.json b/data/hfopenllm_v2/Lyte/Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04/93aa3a13-5069-410f-a1df-6944e0231e0e.json deleted file mode 100644 index 1d73f7a54..000000000 --- a/data/hfopenllm_v2/Lyte/Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04/93aa3a13-5069-410f-a1df-6944e0231e0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lyte_Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04", - "id": "Lyte/Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04", - "developer": "Lyte", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5774 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1843 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lyte/Llama-3.2-3B-Overthinker/427ea7d0-c1f1-4cfe-b6a7-555262a7a317.json b/data/hfopenllm_v2/Lyte/Llama-3.2-3B-Overthinker/427ea7d0-c1f1-4cfe-b6a7-555262a7a317.json deleted file mode 100644 index 4e1649e92..000000000 --- a/data/hfopenllm_v2/Lyte/Llama-3.2-3B-Overthinker/427ea7d0-c1f1-4cfe-b6a7-555262a7a317.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lyte_Llama-3.2-3B-Overthinker/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Overthinker", - "id": "Lyte/Llama-3.2-3B-Overthinker", - "developer": "Lyte", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3419 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2985 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/M4-ai/TinyMistral-248M-v3/c6dbe372-7a3c-487c-87c0-fb324c39f8c9.json b/data/hfopenllm_v2/M4-ai/TinyMistral-248M-v3/c6dbe372-7a3c-487c-87c0-fb324c39f8c9.json deleted file mode 100644 index cbe484e9b..000000000 --- a/data/hfopenllm_v2/M4-ai/TinyMistral-248M-v3/c6dbe372-7a3c-487c-87c0-fb324c39f8c9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/M4-ai_TinyMistral-248M-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyMistral-248M-v3", - "id": "M4-ai/TinyMistral-248M-v3", - "developer": 
"M4-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 0.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MEscriva/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/cf8d99c8-8790-4bdf-bfc2-1a6d1fe35916.json b/data/hfopenllm_v2/MEscriva/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/cf8d99c8-8790-4bdf-bfc2-1a6d1fe35916.json deleted file mode 100644 index 6ab96529f..000000000 --- a/data/hfopenllm_v2/MEscriva/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/cf8d99c8-8790-4bdf-bfc2-1a6d1fe35916.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MEscriva_ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", - "id": "MEscriva/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", - "developer": "MEscriva", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 0.63 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0866 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1154 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MLP-KTLim/llama-3-Korean-Bllossom-8B/5b5d42d7-8012-46f1-826f-32d839806048.json b/data/hfopenllm_v2/MLP-KTLim/llama-3-Korean-Bllossom-8B/5b5d42d7-8012-46f1-826f-32d839806048.json deleted file mode 100644 index a8760b3f6..000000000 --- a/data/hfopenllm_v2/MLP-KTLim/llama-3-Korean-Bllossom-8B/5b5d42d7-8012-46f1-826f-32d839806048.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MLP-KTLim_llama-3-Korean-Bllossom-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Korean-Bllossom-8B", - "id": "MLP-KTLim/llama-3-Korean-Bllossom-8B", - "developer": "MLP-KTLim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3675 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3594 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MTSAIR/Cotype-Nano/5e1bf2cb-55c4-4806-89af-cb9953c7c1b1.json b/data/hfopenllm_v2/MTSAIR/Cotype-Nano/5e1bf2cb-55c4-4806-89af-cb9953c7c1b1.json deleted file mode 100644 index fb6c77edb..000000000 --- a/data/hfopenllm_v2/MTSAIR/Cotype-Nano/5e1bf2cb-55c4-4806-89af-cb9953c7c1b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MTSAIR_Cotype-Nano/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cotype-Nano", - "id": "MTSAIR/Cotype-Nano", - "developer": "MTSAIR", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3748 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MTSAIR/MultiVerse_70B/21ee4b33-9829-4cca-9603-c30fd4a1f7ff.json b/data/hfopenllm_v2/MTSAIR/MultiVerse_70B/21ee4b33-9829-4cca-9603-c30fd4a1f7ff.json deleted file mode 100644 index 4f7b2b0b1..000000000 --- a/data/hfopenllm_v2/MTSAIR/MultiVerse_70B/21ee4b33-9829-4cca-9603-c30fd4a1f7ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MTSAIR_MultiVerse_70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiVerse_70B", - "id": "MTSAIR/MultiVerse_70B", - "developer": "MTSAIR", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 72.289 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6183 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.1/c6c14a8b-0e9f-4b97-b9f3-27c7250fb8f2.json b/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.1/c6c14a8b-0e9f-4b97-b9f3-27c7250fb8f2.json deleted file mode 100644 index 115db7b96..000000000 --- a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.1/c6c14a8b-0e9f-4b97-b9f3-27c7250fb8f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_Llama-3-8B-Magpie-Align-SFT-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Magpie-Align-SFT-v0.1", - "id": "Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.1", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4615 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": 
{ - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2863 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.3/6586fa94-9f43-4814-8c8a-8ed244ac94e7.json b/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.3/6586fa94-9f43-4814-8c8a-8ed244ac94e7.json deleted file mode 100644 index 20e5b11c3..000000000 --- a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.3/6586fa94-9f43-4814-8c8a-8ed244ac94e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_Llama-3-8B-Magpie-Align-SFT-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Magpie-Align-SFT-v0.3", - "id": "Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.3", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2902 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/df7d7db2-867e-47f0-9abf-d71b79e97630.json b/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/df7d7db2-867e-47f0-9abf-d71b79e97630.json deleted file mode 100644 index bbdb79de2..000000000 --- a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/df7d7db2-867e-47f0-9abf-d71b79e97630.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_Llama-3-8B-Magpie-Align-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Magpie-Align-v0.1", - "id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.1", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4811 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3047 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3006 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/e2502e7e-3a10-49f3-b5c6-b20496fed998.json b/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/e2502e7e-3a10-49f3-b5c6-b20496fed998.json deleted file mode 100644 index 5f345d139..000000000 --- a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/e2502e7e-3a10-49f3-b5c6-b20496fed998.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_Llama-3-8B-Magpie-Align-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Magpie-Align-v0.1", - "id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.1", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4789 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3001 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.3/51cde18f-09b0-4b66-a962-811ee49e192f.json b/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.3/51cde18f-09b0-4b66-a962-811ee49e192f.json deleted file mode 100644 index f30e78827..000000000 --- a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.3/51cde18f-09b0-4b66-a962-811ee49e192f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_Llama-3-8B-Magpie-Align-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Magpie-Align-v0.3", - "id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.3", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4497 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.457 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.1/4ea48b42-8026-4799-b35d-46757fd2753f.json b/data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.1/4ea48b42-8026-4799-b35d-46757fd2753f.json deleted file mode 100644 index aeb4cabcb..000000000 --- a/data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.1/4ea48b42-8026-4799-b35d-46757fd2753f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_Llama-3.1-8B-Magpie-Align-SFT-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Magpie-Align-SFT-v0.1", - "id": "Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.1", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4782 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4764 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3397 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2943 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-v0.1/52e9b4ae-9119-4f26-87e4-6532d1148ecd.json b/data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-v0.1/52e9b4ae-9119-4f26-87e4-6532d1148ecd.json deleted file mode 100644 index a67ee6b2e..000000000 --- a/data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-v0.1/52e9b4ae-9119-4f26-87e4-6532d1148ecd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_Llama-3.1-8B-Magpie-Align-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Magpie-Align-v0.1", - "id": "Magpie-Align/Llama-3.1-8B-Magpie-Align-v0.1", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4458 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3141 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-Chat-v0.1/4bda68c0-cc09-4945-961b-48776b7b5fc8.json b/data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-Chat-v0.1/4bda68c0-cc09-4945-961b-48776b7b5fc8.json deleted file mode 100644 index 590475654..000000000 --- a/data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-Chat-v0.1/4bda68c0-cc09-4945-961b-48776b7b5fc8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_MagpieLM-8B-Chat-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MagpieLM-8B-Chat-v0.1", - "id": "Magpie-Align/MagpieLM-8B-Chat-v0.1", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-SFT-v0.1/18ea0ad0-a216-4906-a96c-c8b040398dbd.json b/data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-SFT-v0.1/18ea0ad0-a216-4906-a96c-c8b040398dbd.json deleted file mode 100644 index 3ffe20bca..000000000 --- a/data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-SFT-v0.1/18ea0ad0-a216-4906-a96c-c8b040398dbd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_MagpieLM-8B-SFT-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MagpieLM-8B-SFT-v0.1", - "id": "Magpie-Align/MagpieLM-8B-SFT-v0.1", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/MagusCorp/grpo_lora_enem_llama3_7b/1e2321f6-93bd-4acf-9f5b-c82807a40233.json b/data/hfopenllm_v2/MagusCorp/grpo_lora_enem_llama3_7b/1e2321f6-93bd-4acf-9f5b-c82807a40233.json deleted file mode 100644 index 7178d1d47..000000000 --- a/data/hfopenllm_v2/MagusCorp/grpo_lora_enem_llama3_7b/1e2321f6-93bd-4acf-9f5b-c82807a40233.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MagusCorp_grpo_lora_enem_llama3_7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "grpo_lora_enem_llama3_7b", - "id": "MagusCorp/grpo_lora_enem_llama3_7b", - "developer": "MagusCorp", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4724 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4801 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3971 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ManoloPueblo/ContentCuisine_1-7B-slerp/13032961-52a1-43cf-b69d-1802c43e1bcc.json b/data/hfopenllm_v2/ManoloPueblo/ContentCuisine_1-7B-slerp/13032961-52a1-43cf-b69d-1802c43e1bcc.json deleted file mode 
100644 index 6a60d0dd4..000000000 --- a/data/hfopenllm_v2/ManoloPueblo/ContentCuisine_1-7B-slerp/13032961-52a1-43cf-b69d-1802c43e1bcc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ManoloPueblo_ContentCuisine_1-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContentCuisine_1-7B-slerp", - "id": "ManoloPueblo/ContentCuisine_1-7B-slerp", - "developer": "ManoloPueblo", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3907 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5188 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC2/9d444061-2c29-499a-8906-77ef58aba34d.json b/data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC2/9d444061-2c29-499a-8906-77ef58aba34d.json deleted file mode 100644 index 2ebddc2a3..000000000 --- a/data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC2/9d444061-2c29-499a-8906-77ef58aba34d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/ManoloPueblo_LLM_MERGE_CC2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLM_MERGE_CC2", - "id": "ManoloPueblo/LLM_MERGE_CC2", - "developer": "ManoloPueblo", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3032 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC3/1ffdf6b0-b3a3-432a-a0e4-69b4d447bb76.json b/data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC3/1ffdf6b0-b3a3-432a-a0e4-69b4d447bb76.json deleted file mode 100644 index 7e1671f54..000000000 --- a/data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC3/1ffdf6b0-b3a3-432a-a0e4-69b4d447bb76.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ManoloPueblo_LLM_MERGE_CC3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLM_MERGE_CC3", - "id": "ManoloPueblo/LLM_MERGE_CC3", - "developer": "ManoloPueblo", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5246 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MarinaraSpaghetti/NemoReRemix-12B/8ce733ea-e6e9-4f9b-ab28-f93202507265.json b/data/hfopenllm_v2/MarinaraSpaghetti/NemoReRemix-12B/8ce733ea-e6e9-4f9b-ab28-f93202507265.json deleted file mode 100644 index 805297be2..000000000 --- a/data/hfopenllm_v2/MarinaraSpaghetti/NemoReRemix-12B/8ce733ea-e6e9-4f9b-ab28-f93202507265.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MarinaraSpaghetti_NemoReRemix-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NemoReRemix-12B", - "id": "MarinaraSpaghetti/NemoReRemix-12B", - "developer": "MarinaraSpaghetti", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5537 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3598 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MarinaraSpaghetti/Nemomix-v4.0-12B/0e88aa91-609c-4d2d-9296-25b06eeb0342.json b/data/hfopenllm_v2/MarinaraSpaghetti/Nemomix-v4.0-12B/0e88aa91-609c-4d2d-9296-25b06eeb0342.json deleted file mode 100644 index 6824fedbe..000000000 --- a/data/hfopenllm_v2/MarinaraSpaghetti/Nemomix-v4.0-12B/0e88aa91-609c-4d2d-9296-25b06eeb0342.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MarinaraSpaghetti_Nemomix-v4.0-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemomix-v4.0-12B", - "id": "MarinaraSpaghetti/Nemomix-v4.0-12B", - "developer": "MarinaraSpaghetti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3613 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/MiniMathExpert-2_61B-ECE-PRYMMAL-Martial/3e235ea0-3f04-4d99-9db2-7cafcbdbac6f.json b/data/hfopenllm_v2/Marsouuu/MiniMathExpert-2_61B-ECE-PRYMMAL-Martial/3e235ea0-3f04-4d99-9db2-7cafcbdbac6f.json deleted file mode 100644 index a4acc5d5f..000000000 --- a/data/hfopenllm_v2/Marsouuu/MiniMathExpert-2_61B-ECE-PRYMMAL-Martial/3e235ea0-3f04-4d99-9db2-7cafcbdbac6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_MiniMathExpert-2_61B-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniMathExpert-2_61B-ECE-PRYMMAL-Martial", - "id": "Marsouuu/MiniMathExpert-2_61B-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2548 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3953 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4083 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2274 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/MiniQwenMathExpert-ECE-PRYMMAL-Martial/5e31a55c-f222-4192-b031-27bb40ba56fa.json b/data/hfopenllm_v2/Marsouuu/MiniQwenMathExpert-ECE-PRYMMAL-Martial/5e31a55c-f222-4192-b031-27bb40ba56fa.json deleted file mode 100644 index fe44d4d77..000000000 --- a/data/hfopenllm_v2/Marsouuu/MiniQwenMathExpert-ECE-PRYMMAL-Martial/5e31a55c-f222-4192-b031-27bb40ba56fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_MiniQwenMathExpert-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniQwenMathExpert-ECE-PRYMMAL-Martial", - "id": "Marsouuu/MiniQwenMathExpert-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2922 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial/11fd4b70-4ea7-4bee-8caf-8921d4c89f24.json b/data/hfopenllm_v2/Marsouuu/MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial/11fd4b70-4ea7-4bee-8caf-8921d4c89f24.json deleted file mode 100644 index 8cb29fb63..000000000 --- a/data/hfopenllm_v2/Marsouuu/MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial/11fd4b70-4ea7-4bee-8caf-8921d4c89f24.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial", - "id": "Marsouuu/MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.16 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3464 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/general3B-ECE-PRYMMAL-Martial/8e721067-898d-45ca-b4f5-9f523c4ce3d3.json b/data/hfopenllm_v2/Marsouuu/general3B-ECE-PRYMMAL-Martial/8e721067-898d-45ca-b4f5-9f523c4ce3d3.json deleted file mode 100644 index 7e5d5f9c9..000000000 --- a/data/hfopenllm_v2/Marsouuu/general3B-ECE-PRYMMAL-Martial/8e721067-898d-45ca-b4f5-9f523c4ce3d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_general3B-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "general3B-ECE-PRYMMAL-Martial", - "id": "Marsouuu/general3B-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2722 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5394 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/general3Bv2-ECE-PRYMMAL-Martial/be5d5480-ce4c-4ade-8c6a-c08cd2826909.json b/data/hfopenllm_v2/Marsouuu/general3Bv2-ECE-PRYMMAL-Martial/be5d5480-ce4c-4ade-8c6a-c08cd2826909.json deleted file mode 100644 index 2888af590..000000000 --- a/data/hfopenllm_v2/Marsouuu/general3Bv2-ECE-PRYMMAL-Martial/be5d5480-ce4c-4ade-8c6a-c08cd2826909.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_general3Bv2-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "general3Bv2-ECE-PRYMMAL-Martial", - "id": "Marsouuu/general3Bv2-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5693 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4498 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/lareneg1_78B-ECE-PRYMMAL-Martial/54dec074-29f8-4863-be37-2c08f6f2c3cb.json b/data/hfopenllm_v2/Marsouuu/lareneg1_78B-ECE-PRYMMAL-Martial/54dec074-29f8-4863-be37-2c08f6f2c3cb.json deleted file mode 100644 index 21ecaf0b1..000000000 --- a/data/hfopenllm_v2/Marsouuu/lareneg1_78B-ECE-PRYMMAL-Martial/54dec074-29f8-4863-be37-2c08f6f2c3cb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_lareneg1_78B-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lareneg1_78B-ECE-PRYMMAL-Martial", - "id": "Marsouuu/lareneg1_78B-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2922 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/lareneg3B-ECE-PRYMMAL-Martial/88a15025-556b-469d-be77-c773f2c61038.json b/data/hfopenllm_v2/Marsouuu/lareneg3B-ECE-PRYMMAL-Martial/88a15025-556b-469d-be77-c773f2c61038.json deleted file mode 100644 index f634cf074..000000000 --- a/data/hfopenllm_v2/Marsouuu/lareneg3B-ECE-PRYMMAL-Martial/88a15025-556b-469d-be77-c773f2c61038.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_lareneg3B-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lareneg3B-ECE-PRYMMAL-Martial", - "id": "Marsouuu/lareneg3B-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3303 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5453 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/lareneg3Bv2-ECE-PRYMMAL-Martial/b4f4596b-17e5-40bf-ae60-0b17492ba9f8.json b/data/hfopenllm_v2/Marsouuu/lareneg3Bv2-ECE-PRYMMAL-Martial/b4f4596b-17e5-40bf-ae60-0b17492ba9f8.json deleted file mode 100644 index 829e669c7..000000000 --- a/data/hfopenllm_v2/Marsouuu/lareneg3Bv2-ECE-PRYMMAL-Martial/b4f4596b-17e5-40bf-ae60-0b17492ba9f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_lareneg3Bv2-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lareneg3Bv2-ECE-PRYMMAL-Martial", - "id": "Marsouuu/lareneg3Bv2-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5753 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5623 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4511 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.1/97ce858e-a64f-4881-b6d0-0a2c0814336d.json b/data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.1/97ce858e-a64f-4881-b6d0-0a2c0814336d.json deleted file mode 100644 index e5c5e6482..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.1/97ce858e-a64f-4881-b6d0-0a2c0814336d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Calme-4x7B-MoE-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calme-4x7B-MoE-v0.1", - "id": "MaziyarPanahi/Calme-4x7B-MoE-v0.1", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5103 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.2/1becd83e-e9b8-49c1-a137-80c5a8dbdf0d.json b/data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.2/1becd83e-e9b8-49c1-a137-80c5a8dbdf0d.json deleted file mode 100644 index d772d36c9..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.2/1becd83e-e9b8-49c1-a137-80c5a8dbdf0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Calme-4x7B-MoE-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calme-4x7B-MoE-v0.2", - "id": "MaziyarPanahi/Calme-4x7B-MoE-v0.2", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4294 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4318 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3058 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-70B-Instruct-v0.1/337bb321-9c6e-4751-9c9b-d8ba0120dd07.json b/data/hfopenllm_v2/MaziyarPanahi/Llama-3-70B-Instruct-v0.1/337bb321-9c6e-4751-9c9b-d8ba0120dd07.json deleted file mode 100644 index b12b4dd7c..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-70B-Instruct-v0.1/337bb321-9c6e-4751-9c9b-d8ba0120dd07.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Llama-3-70B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-70B-Instruct-v0.1", - "id": "MaziyarPanahi/Llama-3-70B-Instruct-v0.1", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5366 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", 
- "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4618 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.10/cfa95cc9-5bb1-4921-97c7-078f2f929a2f.json b/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.10/cfa95cc9-5bb1-4921-97c7-078f2f929a2f.json deleted file mode 100644 index 2084bb4eb..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.10/cfa95cc9-5bb1-4921-97c7-078f2f929a2f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Llama-3-8B-Instruct-v0.10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-v0.10", - "id": "MaziyarPanahi/Llama-3-8B-Instruct-v0.10", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4214 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3862 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.8/6d5ba3c4-a0c2-40cd-9766-68d36d21c5b6.json b/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.8/6d5ba3c4-a0c2-40cd-9766-68d36d21c5b6.json deleted file mode 100644 index 9ec03b55c..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.8/6d5ba3c4-a0c2-40cd-9766-68d36d21c5b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Llama-3-8B-Instruct-v0.8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-v0.8", - "id": "MaziyarPanahi/Llama-3-8B-Instruct-v0.8", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4963 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.9/6cc4404a-f3e1-47b9-b56b-34e4269e1261.json b/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.9/6cc4404a-f3e1-47b9-b56b-34e4269e1261.json deleted file mode 100644 index 9730bfb53..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.9/6cc4404a-f3e1-47b9-b56b-34e4269e1261.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Llama-3-8B-Instruct-v0.9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-v0.9", - "id": "MaziyarPanahi/Llama-3-8B-Instruct-v0.9", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow/8d820e43-ff42-4247-9ad0-4ed8e70672b4.json 
b/data/hfopenllm_v2/MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow/8d820e43-ff42-4247-9ad0-4ed8e70672b4.json deleted file mode 100644 index 25b553bee..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow/8d820e43-ff42-4247-9ad0-4ed8e70672b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Qwen1.5-MoE-A2.7B-Wikihow/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-MoE-A2.7B-Wikihow", - "id": "MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 14.316 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2954 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.238 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.1/d858ce8e-6a4b-46b1-8d51-03ebc2d8aaec.json b/data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.1/d858ce8e-6a4b-46b1-8d51-03ebc2d8aaec.json deleted file mode 100644 index 1ec344d87..000000000 --- 
a/data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.1/d858ce8e-6a4b-46b1-8d51-03ebc2d8aaec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Qwen2-7B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-7B-Instruct-v0.1", - "id": "MaziyarPanahi/Qwen2-7B-Instruct-v0.1", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5123 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.8/9813dd88-ff70-4d9e-86c5-9b73444275c5.json b/data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.8/9813dd88-ff70-4d9e-86c5-9b73444275c5.json deleted file mode 100644 index 1b9f32279..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.8/9813dd88-ff70-4d9e-86c5-9b73444275c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/MaziyarPanahi_Qwen2-7B-Instruct-v0.8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-7B-Instruct-v0.8", - "id": "MaziyarPanahi/Qwen2-7B-Instruct-v0.8", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2775 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1767 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-llama3.1-70b/ac677432-e7d1-4439-9c05-426059c285ef.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-llama3.1-70b/ac677432-e7d1-4439-9c05-426059c285ef.json deleted file mode 100644 index 3872599b8..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-llama3.1-70b/ac677432-e7d1-4439-9c05-426059c285ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.1-llama3.1-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.1-llama3.1-70b", - "id": "MaziyarPanahi/calme-2.1-llama3.1-70b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8434 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5283 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3-4b/018f270f-3cfe-403c-a236-483038a0b04e.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3-4b/018f270f-3cfe-403c-a236-483038a0b04e.json deleted file mode 100644 index d1bb5a3bb..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3-4b/018f270f-3cfe-403c-a236-483038a0b04e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.1-phi3-4b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.1-phi3-4b", - "id": "MaziyarPanahi/calme-2.1-phi3-4b", - "developer": 
"MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5525 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5595 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3746 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3.5-4b/718a40ea-26b1-4cf4-9584-57be798640ae.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3.5-4b/718a40ea-26b1-4cf4-9584-57be798640ae.json deleted file mode 100644 index 6ca32abe5..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3.5-4b/718a40ea-26b1-4cf4-9584-57be798640ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.1-phi3.5-4b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.1-phi3.5-4b", - "id": "MaziyarPanahi/calme-2.1-phi3.5-4b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5659 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2039 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-72b/207a28a9-ae24-4a31-be95-96296b2e466d.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-72b/207a28a9-ae24-4a31-be95-96296b2e466d.json deleted file mode 100644 index 847508496..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-72b/207a28a9-ae24-4a31-be95-96296b2e466d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.1-qwen2-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.1-qwen2-72b", - "id": "MaziyarPanahi/calme-2.1-qwen2-72b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.699 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8163 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6966 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4732 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-7b/72efedb8-d456-41ed-b1ae-4887cb6c18f8.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-7b/72efedb8-d456-41ed-b1ae-4887cb6c18f8.json deleted file mode 100644 index e64c522fa..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-7b/72efedb8-d456-41ed-b1ae-4887cb6c18f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.1-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.1-qwen2-7b", - "id": "MaziyarPanahi/calme-2.1-qwen2-7b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5046 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2311 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2.5-72b/ac91fb37-5742-4a3d-b93a-86c63b90cad5.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2.5-72b/ac91fb37-5742-4a3d-b93a-86c63b90cad5.json deleted file mode 100644 index d23d575cb..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2.5-72b/ac91fb37-5742-4a3d-b93a-86c63b90cad5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.1-qwen2.5-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.1-qwen2.5-72b", - "id": "MaziyarPanahi/calme-2.1-qwen2.5-72b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.7 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8662 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.7262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5619 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-rys-78b/c71d025d-e954-4420-b397-e07c3644d1f4.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-rys-78b/c71d025d-e954-4420-b397-e07c3644d1f4.json deleted file mode 100644 index d2b0fca22..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-rys-78b/c71d025d-e954-4420-b397-e07c3644d1f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.1-rys-78b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.1-rys-78b", - "id": "MaziyarPanahi/calme-2.1-rys-78b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4693 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5444 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3-70b/968c3759-de5f-4255-ba95-cafc7a3c70a7.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3-70b/968c3759-de5f-4255-ba95-cafc7a3c70a7.json deleted file mode 100644 index 6a8c1b645..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3-70b/968c3759-de5f-4255-ba95-cafc7a3c70a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.2-llama3-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.2-llama3-70b", - "id": "MaziyarPanahi/calme-2.2-llama3-70b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.2394 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3.1-70b/5e23b2f7-33f7-4e49-b73a-a02b8650ee0d.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3.1-70b/5e23b2f7-33f7-4e49-b73a-a02b8650ee0d.json deleted file mode 100644 index 27dee1772..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3.1-70b/5e23b2f7-33f7-4e49-b73a-a02b8650ee0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.2-llama3.1-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.2-llama3.1-70b", - "id": "MaziyarPanahi/calme-2.2-llama3.1-70b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8593 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6793 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4542 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-phi3-4b/1b6c64f6-acf8-4cff-bcae-6e8b3725c6f1.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-phi3-4b/1b6c64f6-acf8-4cff-bcae-6e8b3725c6f1.json deleted file mode 100644 index 76e64261b..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-phi3-4b/1b6c64f6-acf8-4cff-bcae-6e8b3725c6f1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.2-phi3-4b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.2-phi3-4b", - "id": "MaziyarPanahi/calme-2.2-phi3-4b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5069 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3976 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-72b/7908f572-8886-4add-ae84-b4ec0ec17c26.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-72b/7908f572-8886-4add-ae84-b4ec0ec17c26.json deleted file mode 100644 index ac04e14a8..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-72b/7908f572-8886-4add-ae84-b4ec0ec17c26.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.2-qwen2-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.2-qwen2-72b", - "id": "MaziyarPanahi/calme-2.2-qwen2-72b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8008 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4508 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-7b/9e04ec5c-2208-4569-9b63-4768ed4262b9.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-7b/9e04ec5c-2208-4569-9b63-4768ed4262b9.json deleted file mode 100644 index 1a6f153fa..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-7b/9e04ec5c-2208-4569-9b63-4768ed4262b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.2-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.2-qwen2-7b", - "id": "MaziyarPanahi/calme-2.2-qwen2-7b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3597 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3899 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2.5-72b/ee2c8beb-6566-4b19-91d0-8e48c12a3fdf.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2.5-72b/ee2c8beb-6566-4b19-91d0-8e48c12a3fdf.json deleted file mode 100644 index 6b2655c95..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2.5-72b/ee2c8beb-6566-4b19-91d0-8e48c12a3fdf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.2-qwen2.5-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.2-qwen2.5-72b", - "id": "MaziyarPanahi/calme-2.2-qwen2.5-72b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.7 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5618 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-rys-78b/c7579616-0c21-443a-a149-0c51a0ae92ac.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-rys-78b/c7579616-0c21-443a-a149-0c51a0ae92ac.json deleted file mode 100644 index 554ca11d3..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-rys-78b/c7579616-0c21-443a-a149-0c51a0ae92ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.2-rys-78b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.2-rys-78b", - "id": "MaziyarPanahi/calme-2.2-rys-78b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7081 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3-70b/ef7a1429-db2f-433b-a606-339a9d868e7a.json 
b/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3-70b/ef7a1429-db2f-433b-a606-339a9d868e7a.json deleted file mode 100644 index 84b0cfd4e..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3-70b/ef7a1429-db2f-433b-a606-339a9d868e7a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.3-llama3-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.3-llama3-70b", - "id": "MaziyarPanahi/calme-2.3-llama3-70b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2326 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4261 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3.1-70b/f531e13c-79ed-45da-a246-857fd2c884c1.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3.1-70b/f531e13c-79ed-45da-a246-857fd2c884c1.json deleted file mode 100644 index 9e97146e6..000000000 --- 
a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3.1-70b/f531e13c-79ed-45da-a246-857fd2c884c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.3-llama3.1-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.3-llama3.1-70b", - "id": "MaziyarPanahi/calme-2.3-llama3.1-70b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8605 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6872 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4568 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-phi3-4b/0f525d93-663a-442c-9a51-1ad3a5054172.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-phi3-4b/0f525d93-663a-442c-9a51-1ad3a5054172.json deleted file mode 100644 index bc2d4c497..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-phi3-4b/0f525d93-663a-442c-9a51-1ad3a5054172.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/MaziyarPanahi_calme-2.3-phi3-4b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.3-phi3-4b", - "id": "MaziyarPanahi/calme-2.3-phi3-4b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5538 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1473 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3828 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-72b/15af21e1-3193-47fa-a3fc-1f087216d4d9.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-72b/15af21e1-3193-47fa-a3fc-1f087216d4d9.json deleted file mode 100644 index f937b7720..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-72b/15af21e1-3193-47fa-a3fc-1f087216d4d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.3-qwen2-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.3-qwen2-72b", - "id": "MaziyarPanahi/calme-2.3-qwen2-72b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.385 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6576 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4112 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-7b/67b270d9-3422-4770-9957-7bde65acca0a.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-7b/67b270d9-3422-4770-9957-7bde65acca0a.json deleted file mode 100644 index ed4ee9992..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-7b/67b270d9-3422-4770-9957-7bde65acca0a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.3-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.3-qwen2-7b", - "id": "MaziyarPanahi/calme-2.3-qwen2-7b", - "developer": "MaziyarPanahi", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2069 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3611 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-rys-78b/e2d38bcc-9133-4051-82d0-4e4fd66e00f8.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-rys-78b/e2d38bcc-9133-4051-82d0-4e4fd66e00f8.json deleted file mode 100644 index 90960df8c..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-rys-78b/e2d38bcc-9133-4051-82d0-4e4fd66e00f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.3-rys-78b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.3-rys-78b", - "id": "MaziyarPanahi/calme-2.3-rys-78b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8066 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7108 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4044 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4549 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-llama3-70b/4ff256af-73c7-4a5a-96da-19546a786c59.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-llama3-70b/4ff256af-73c7-4a5a-96da-19546a786c59.json deleted file mode 100644 index 5cac03e59..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-llama3-70b/4ff256af-73c7-4a5a-96da-19546a786c59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.4-llama3-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.4-llama3-70b", - "id": "MaziyarPanahi/calme-2.4-llama3-70b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5027 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6418 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-qwen2-7b/225cbeef-1d0d-40fc-949d-4ba6696fb690.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-qwen2-7b/225cbeef-1d0d-40fc-949d-4ba6696fb690.json deleted file mode 100644 index b84becf6e..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-qwen2-7b/225cbeef-1d0d-40fc-949d-4ba6696fb690.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.4-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.4-qwen2-7b", - "id": "MaziyarPanahi/calme-2.4-qwen2-7b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5101 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4453 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-rys-78b/24fcd662-5abb-4bf8-b8df-1c21b048cd92.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-rys-78b/24fcd662-5abb-4bf8-b8df-1c21b048cd92.json deleted file mode 100644 index 6535c029d..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-rys-78b/24fcd662-5abb-4bf8-b8df-1c21b048cd92.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.4-rys-78b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.4-rys-78b", - "id": "MaziyarPanahi/calme-2.4-rys-78b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8011 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.728 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5771 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7002 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.5-qwen2-7b/7badcb45-7826-4fd1-b964-c697fbda76cc.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.5-qwen2-7b/7badcb45-7826-4fd1-b964-c697fbda76cc.json deleted file mode 100644 index 76077262d..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.5-qwen2-7b/7badcb45-7826-4fd1-b964-c697fbda76cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.5-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.5-qwen2-7b", - "id": "MaziyarPanahi/calme-2.5-qwen2-7b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3145 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4887 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3682 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.6-qwen2-7b/bfb532f1-3319-46ff-80ae-0ca783a18bb6.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.6-qwen2-7b/bfb532f1-3319-46ff-80ae-0ca783a18bb6.json deleted file mode 100644 index 901ba7227..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.6-qwen2-7b/bfb532f1-3319-46ff-80ae-0ca783a18bb6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.6-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.6-qwen2-7b", - "id": "MaziyarPanahi/calme-2.6-qwen2-7b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3443 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4586 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3732 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.7-qwen2-7b/ea304515-b41f-4e96-a0ec-78c897ebf9a4.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.7-qwen2-7b/ea304515-b41f-4e96-a0ec-78c897ebf9a4.json deleted file mode 100644 index 4bb85f9fc..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.7-qwen2-7b/ea304515-b41f-4e96-a0ec-78c897ebf9a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.7-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.7-qwen2-7b", - "id": "MaziyarPanahi/calme-2.7-qwen2-7b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3592 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4883 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1382 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4824 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3705 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-baguette-3b/1fe79ea5-1922-4a5e-8857-1c832353b0a6.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-baguette-3b/1fe79ea5-1922-4a5e-8857-1c832353b0a6.json deleted file mode 100644 index 1217267f3..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-baguette-3b/1fe79ea5-1922-4a5e-8857-1c832353b0a6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.1-baguette-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.1-baguette-3b", - "id": "MaziyarPanahi/calme-3.1-baguette-3b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6234 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4683 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.256 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4008 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-3b/9098d70f-cbcd-4f6c-bcba-0b1da743396e.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-3b/9098d70f-cbcd-4f6c-bcba-0b1da743396e.json deleted file mode 100644 index dfeeb7050..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-3b/9098d70f-cbcd-4f6c-bcba-0b1da743396e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.1-instruct-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.1-instruct-3b", - "id": "MaziyarPanahi/calme-3.1-instruct-3b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4813 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3952 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-78b/df4ed9e0-30bc-4a3f-b7a2-8955cbb38d31.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-78b/df4ed9e0-30bc-4a3f-b7a2-8955cbb38d31.json deleted file mode 100644 index 8dfc528f4..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-78b/df4ed9e0-30bc-4a3f-b7a2-8955cbb38d31.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.1-instruct-78b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.1-instruct-78b", - "id": "MaziyarPanahi/calme-3.1-instruct-78b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5891 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7185 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-llamaloi-3b/f68957d5-20a1-438f-9931-6a787aaed467.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-llamaloi-3b/f68957d5-20a1-438f-9931-6a787aaed467.json deleted file mode 100644 index d3b477ce7..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-llamaloi-3b/f68957d5-20a1-438f-9931-6a787aaed467.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.1-llamaloi-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.1-llamaloi-3b", - "id": "MaziyarPanahi/calme-3.1-llamaloi-3b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4587 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-baguette-3b/416e0c04-9119-4230-ba71-b0f47e2d4997.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-baguette-3b/416e0c04-9119-4230-ba71-b0f47e2d4997.json deleted file mode 100644 index 339284c10..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-baguette-3b/416e0c04-9119-4230-ba71-b0f47e2d4997.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.2-baguette-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.2-baguette-3b", - "id": "MaziyarPanahi/calme-3.2-baguette-3b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6338 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4709 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2825 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3338 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-3b/d57780e2-154e-437d-ac2f-0007e1f9140e.json 
b/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-3b/d57780e2-154e-437d-ac2f-0007e1f9140e.json deleted file mode 100644 index b0efacce9..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-3b/d57780e2-154e-437d-ac2f-0007e1f9140e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.2-instruct-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.2-instruct-3b", - "id": "MaziyarPanahi/calme-3.2-instruct-3b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5533 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4866 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2168 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4047 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3653 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-78b/027d464b-1375-4de7-aa57-e1473d16ba89.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-78b/027d464b-1375-4de7-aa57-e1473d16ba89.json deleted file mode 100644 index 41995ee2f..000000000 --- 
a/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-78b/027d464b-1375-4de7-aa57-e1473d16ba89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.2-instruct-78b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.2-instruct-78b", - "id": "MaziyarPanahi/calme-3.2-instruct-78b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8063 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6024 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7303 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.3-baguette-3b/a81f20fa-57e8-498c-a162-6d8a9be09ee6.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.3-baguette-3b/a81f20fa-57e8-498c-a162-6d8a9be09ee6.json deleted file mode 100644 index 3d87a48e4..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.3-baguette-3b/a81f20fa-57e8-498c-a162-6d8a9be09ee6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/MaziyarPanahi_calme-3.3-baguette-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.3-baguette-3b", - "id": "MaziyarPanahi/calme-3.3-baguette-3b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4678 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.3-instruct-3b/d72ddbff-8ff7-446f-a74a-10a46bce6e3e.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.3-instruct-3b/d72ddbff-8ff7-446f-a74a-10a46bce6e3e.json deleted file mode 100644 index 49ca1c4b5..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.3-instruct-3b/d72ddbff-8ff7-446f-a74a-10a46bce6e3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.3-instruct-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.3-instruct-3b", - "id": "MaziyarPanahi/calme-3.3-instruct-3b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4693 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4074 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Minami-su/Amara-o1-7B-Qwen/f681d612-f574-4641-b34e-95b6de97f9e8.json b/data/hfopenllm_v2/Minami-su/Amara-o1-7B-Qwen/f681d612-f574-4641-b34e-95b6de97f9e8.json deleted file mode 100644 index ad3d67280..000000000 --- a/data/hfopenllm_v2/Minami-su/Amara-o1-7B-Qwen/f681d612-f574-4641-b34e-95b6de97f9e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Minami-su_Amara-o1-7B-Qwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amara-o1-7B-Qwen", - "id": "Minami-su/Amara-o1-7B-Qwen", - "developer": "Minami-su", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4083 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Minami-su/Amara-o2-7B-Qwen/cae1adaf-e424-4dcd-943b-5bbb708aca57.json b/data/hfopenllm_v2/Minami-su/Amara-o2-7B-Qwen/cae1adaf-e424-4dcd-943b-5bbb708aca57.json deleted file mode 100644 index d8fef8df3..000000000 --- a/data/hfopenllm_v2/Minami-su/Amara-o2-7B-Qwen/cae1adaf-e424-4dcd-943b-5bbb708aca57.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Minami-su_Amara-o2-7B-Qwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amara-o2-7B-Qwen", - "id": "Minami-su/Amara-o2-7B-Qwen", - "developer": "Minami-su", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7147 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5173 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4165 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Minami-su/test-7B-00/969ac825-92f2-448c-899a-226e69dee377.json b/data/hfopenllm_v2/Minami-su/test-7B-00/969ac825-92f2-448c-899a-226e69dee377.json deleted file mode 100644 index 813aba2e0..000000000 --- a/data/hfopenllm_v2/Minami-su/test-7B-00/969ac825-92f2-448c-899a-226e69dee377.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Minami-su_test-7B-00/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-7B-00", - "id": "Minami-su/test-7B-00", - "developer": "Minami-su", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669 - } - }, - { - "evaluation_name": "BBH", - "source_data": 
{ - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4126 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3588 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Minami-su/test-7B-01/e108ad28-c155-4162-852c-0f588a136bdc.json b/data/hfopenllm_v2/Minami-su/test-7B-01/e108ad28-c155-4162-852c-0f588a136bdc.json deleted file mode 100644 index 878626d0b..000000000 --- a/data/hfopenllm_v2/Minami-su/test-7B-01/e108ad28-c155-4162-852c-0f588a136bdc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Minami-su_test-7B-01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-7B-01", - "id": "Minami-su/test-7B-01", - "developer": "Minami-su", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6736 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4422 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4153 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3536 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Minami-su/test-v2-7B-00/93cfeba9-7d31-45b4-a6e2-99a5f318f5b3.json b/data/hfopenllm_v2/Minami-su/test-v2-7B-00/93cfeba9-7d31-45b4-a6e2-99a5f318f5b3.json deleted file mode 100644 index 0b0afc074..000000000 --- a/data/hfopenllm_v2/Minami-su/test-v2-7B-00/93cfeba9-7d31-45b4-a6e2-99a5f318f5b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Minami-su_test-v2-7B-00/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-v2-7B-00", - "id": "Minami-su/test-v2-7B-00", - "developer": "Minami-su", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6747 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4418 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3472 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1/c1b16b84-9392-48f3-b483-0a9786925506.json b/data/hfopenllm_v2/ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1/c1b16b84-9392-48f3-b483-0a9786925506.json deleted file mode 100644 index 0a18b9f48..000000000 --- a/data/hfopenllm_v2/ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1/c1b16b84-9392-48f3-b483-0a9786925506.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ModelCloud_Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1", - "id": "ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1", - "developer": "ModelCloud", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 5.453 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5269 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3253 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ModelSpace/GemmaX2-28-9B-v0.1/b0c6e08d-b426-49d5-8a66-ee3d70131b62.json b/data/hfopenllm_v2/ModelSpace/GemmaX2-28-9B-v0.1/b0c6e08d-b426-49d5-8a66-ee3d70131b62.json deleted file mode 100644 index 3ce7f3cf6..000000000 --- a/data/hfopenllm_v2/ModelSpace/GemmaX2-28-9B-v0.1/b0c6e08d-b426-49d5-8a66-ee3d70131b62.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ModelSpace_GemmaX2-28-9B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GemmaX2-28-9B-v0.1", - "id": "ModelSpace/GemmaX2-28-9B-v0.1", - "developer": "ModelSpace", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0039 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3537 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2231 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MoonRide/Llama-3.2-3B-Khelavaster/6a6651a3-b34e-404d-ac25-42c151fb9ba3.json b/data/hfopenllm_v2/MoonRide/Llama-3.2-3B-Khelavaster/6a6651a3-b34e-404d-ac25-42c151fb9ba3.json deleted file mode 100644 index 7575cd4c1..000000000 --- a/data/hfopenllm_v2/MoonRide/Llama-3.2-3B-Khelavaster/6a6651a3-b34e-404d-ac25-42c151fb9ba3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MoonRide_Llama-3.2-3B-Khelavaster/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Khelavaster", - "id": "MoonRide/Llama-3.2-3B-Khelavaster", - "developer": "MoonRide", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": 
"MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Mostafa8Mehrabi/llama-3.2-1b-Insomnia-ChatBot-merged/da63b789-5571-4ed8-976e-146d385b18e2.json b/data/hfopenllm_v2/Mostafa8Mehrabi/llama-3.2-1b-Insomnia-ChatBot-merged/da63b789-5571-4ed8-976e-146d385b18e2.json deleted file mode 100644 index 47726cb82..000000000 --- a/data/hfopenllm_v2/Mostafa8Mehrabi/llama-3.2-1b-Insomnia-ChatBot-merged/da63b789-5571-4ed8-976e-146d385b18e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Mostafa8Mehrabi_llama-3.2-1b-Insomnia-ChatBot-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3.2-1b-Insomnia-ChatBot-merged", - "id": "Mostafa8Mehrabi/llama-3.2-1b-Insomnia-ChatBot-merged", - "developer": "Mostafa8Mehrabi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLong-8b-v4i/87b900e7-3bab-4e60-b0ef-349667cb2656.json b/data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLong-8b-v4i/87b900e7-3bab-4e60-b0ef-349667cb2656.json deleted file mode 100644 index 87434c005..000000000 --- a/data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLong-8b-v4i/87b900e7-3bab-4e60-b0ef-349667cb2656.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MrRobotoAI_MrRoboto-ProLong-8b-v4i/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MrRoboto-ProLong-8b-v4i", - "id": "MrRobotoAI/MrRoboto-ProLong-8b-v4i", - "developer": "MrRobotoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3835 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4014 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLongBASE-pt8-unaligned-8b/c9fd4740-4990-4174-b782-9b63c34d6407.json b/data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLongBASE-pt8-unaligned-8b/c9fd4740-4990-4174-b782-9b63c34d6407.json deleted file mode 100644 index 91f97c6f4..000000000 --- a/data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLongBASE-pt8-unaligned-8b/c9fd4740-4990-4174-b782-9b63c34d6407.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MrRobotoAI_MrRoboto-ProLongBASE-pt8-unaligned-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MrRoboto-ProLongBASE-pt8-unaligned-8b", - "id": "MrRobotoAI/MrRoboto-ProLongBASE-pt8-unaligned-8b", - "developer": "MrRobotoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2566 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1211-3B/2582a049-e940-408b-b2d9-7a7bdf470e49.json b/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1211-3B/2582a049-e940-408b-b2d9-7a7bdf470e49.json deleted file mode 100644 index 5001bc314..000000000 --- a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1211-3B/2582a049-e940-408b-b2d9-7a7bdf470e49.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MultivexAI_Gladiator-Mini-Exp-1211-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gladiator-Mini-Exp-1211-3B", - "id": "MultivexAI/Gladiator-Mini-Exp-1211-3B", - "developer": "MultivexAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6876 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct-V2/99310118-d2ec-4647-85db-fcc22aee9161.json b/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct-V2/99310118-d2ec-4647-85db-fcc22aee9161.json deleted file mode 100644 index 22f18ce11..000000000 --- a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct-V2/99310118-d2ec-4647-85db-fcc22aee9161.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MultivexAI_Gladiator-Mini-Exp-1221-3B-Instruct-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gladiator-Mini-Exp-1221-3B-Instruct-V2", - "id": "MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct-V2", - "developer": "MultivexAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6215 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4389 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3008 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3025 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct/bedd12e4-da18-4ca6-ba51-6d13e1c80bae.json b/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct/bedd12e4-da18-4ca6-ba51-6d13e1c80bae.json deleted file mode 100644 index b849a1f53..000000000 --- a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct/bedd12e4-da18-4ca6-ba51-6d13e1c80bae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MultivexAI_Gladiator-Mini-Exp-1221-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gladiator-Mini-Exp-1221-3B-Instruct", - "id": "MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct", - "developer": "MultivexAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6079 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3115 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3049 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1222-3B-Instruct/6767e14a-bbfa-4a0d-8120-1f48a565474e.json 
b/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1222-3B-Instruct/6767e14a-bbfa-4a0d-8120-1f48a565474e.json deleted file mode 100644 index 37852b3f6..000000000 --- a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1222-3B-Instruct/6767e14a-bbfa-4a0d-8120-1f48a565474e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MultivexAI_Gladiator-Mini-Exp-1222-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gladiator-Mini-Exp-1222-3B-Instruct", - "id": "MultivexAI/Gladiator-Mini-Exp-1222-3B-Instruct", - "developer": "MultivexAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6163 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3128 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MultivexAI/Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF/70260aac-1bbf-4913-9dcc-58633d055314.json b/data/hfopenllm_v2/MultivexAI/Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF/70260aac-1bbf-4913-9dcc-58633d055314.json deleted file mode 100644 
index f8ab724bd..000000000 --- a/data/hfopenllm_v2/MultivexAI/Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF/70260aac-1bbf-4913-9dcc-58633d055314.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MultivexAI_Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF", - "id": "MultivexAI/Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF", - "developer": "MultivexAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3642 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1.1/fba6e1a2-c197-4731-91ea-f6d059ba8b16.json b/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1.1/fba6e1a2-c197-4731-91ea-f6d059ba8b16.json deleted file mode 100644 index 9fe318e68..000000000 --- a/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1.1/fba6e1a2-c197-4731-91ea-f6d059ba8b16.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Mxode_NanoLM-0.3B-Instruct-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NanoLM-0.3B-Instruct-v1.1", - "id": "Mxode/NanoLM-0.3B-Instruct-v1.1", - "developer": "Mxode", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.315 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1783 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1/22e74d0c-70d6-43c5-be4d-62842d93fedf.json b/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1/22e74d0c-70d6-43c5-be4d-62842d93fedf.json deleted file mode 100644 index fd4f80330..000000000 --- a/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1/22e74d0c-70d6-43c5-be4d-62842d93fedf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Mxode_NanoLM-0.3B-Instruct-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NanoLM-0.3B-Instruct-v1", - "id": "Mxode/NanoLM-0.3B-Instruct-v1", - "developer": "Mxode", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.315 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1537 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v2/f7c33065-1da1-4da4-81c7-f2c9307b6e9b.json b/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v2/f7c33065-1da1-4da4-81c7-f2c9307b6e9b.json deleted file mode 100644 index f92c110e8..000000000 --- a/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v2/f7c33065-1da1-4da4-81c7-f2c9307b6e9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Mxode_NanoLM-0.3B-Instruct-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NanoLM-0.3B-Instruct-v2", - "id": "Mxode/NanoLM-0.3B-Instruct-v2", - "developer": "Mxode", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.315 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1668 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2921 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3955 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v1.1/ecdb4661-426a-46be-aefc-7e04483cebc0.json b/data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v1.1/ecdb4661-426a-46be-aefc-7e04483cebc0.json deleted file mode 100644 index a4f75b709..000000000 --- a/data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v1.1/ecdb4661-426a-46be-aefc-7e04483cebc0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Mxode_NanoLM-1B-Instruct-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NanoLM-1B-Instruct-v1.1", - "id": "Mxode/NanoLM-1B-Instruct-v1.1", - "developer": "Mxode", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.076 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3184 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1215 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v2/236976b3-af46-45ac-a8a5-f5897e3468a1.json b/data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v2/236976b3-af46-45ac-a8a5-f5897e3468a1.json deleted file mode 100644 index 3c6004de5..000000000 --- a/data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v2/236976b3-af46-45ac-a8a5-f5897e3468a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Mxode_NanoLM-1B-Instruct-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NanoLM-1B-Instruct-v2", - "id": "Mxode/NanoLM-1B-Instruct-v2", - "developer": "Mxode", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.076 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.263 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3123 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3552 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1238 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v-0.1.0/fd175296-a5f6-4914-80e9-b8b75bc659de.json b/data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v-0.1.0/fd175296-a5f6-4914-80e9-b8b75bc659de.json deleted file mode 100644 index 55309810d..000000000 --- a/data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v-0.1.0/fd175296-a5f6-4914-80e9-b8b75bc659de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NAPS-ai_naps-gemma-2-27b-v-0.1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "naps-gemma-2-27b-v-0.1.0", - "id": "NAPS-ai/naps-gemma-2-27b-v-0.1.0", - "developer": "NAPS-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v0.1.0/d910bbaa-d55c-4b00-9320-856a8a6713c0.json b/data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v0.1.0/d910bbaa-d55c-4b00-9320-856a8a6713c0.json deleted file mode 100644 index 8fed04ca8..000000000 --- a/data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v0.1.0/d910bbaa-d55c-4b00-9320-856a8a6713c0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NAPS-ai_naps-gemma-2-27b-v0.1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "naps-gemma-2-27b-v0.1.0", - "id": "NAPS-ai/naps-gemma-2-27b-v0.1.0", - "developer": "NAPS-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.3/99a5f123-5d2e-469b-884e-c9a64c6bc197.json b/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.3/99a5f123-5d2e-469b-884e-c9a64c6bc197.json deleted file mode 100644 index 027340ffc..000000000 --- a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.3/99a5f123-5d2e-469b-884e-c9a64c6bc197.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NAPS-ai_naps-llama-3_1-8b-instruct-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "naps-llama-3_1-8b-instruct-v0.3", - "id": "NAPS-ai/naps-llama-3_1-8b-instruct-v0.3", - "developer": "NAPS-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4901 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1903 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.4/ed17a715-f0ae-461c-9618-ac952c450ec5.json b/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.4/ed17a715-f0ae-461c-9618-ac952c450ec5.json deleted file mode 100644 index 778af4ae2..000000000 --- a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.4/ed17a715-f0ae-461c-9618-ac952c450ec5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NAPS-ai_naps-llama-3_1-8b-instruct-v0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "naps-llama-3_1-8b-instruct-v0.4", - "id": "NAPS-ai/naps-llama-3_1-8b-instruct-v0.4", - "developer": "NAPS-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7344 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4862 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-instruct-v0.5.0/3dd2a474-9ea8-4e26-8986-5bcc67c78c39.json b/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-instruct-v0.5.0/3dd2a474-9ea8-4e26-8986-5bcc67c78c39.json deleted file mode 100644 index 3ae07feca..000000000 --- a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-instruct-v0.5.0/3dd2a474-9ea8-4e26-8986-5bcc67c78c39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NAPS-ai_naps-llama-3_1-instruct-v0.5.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "naps-llama-3_1-instruct-v0.5.0", - "id": "NAPS-ai/naps-llama-3_1-instruct-v0.5.0", - "developer": "NAPS-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2614 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1_instruct-v0.6.0/b39e14a6-c05f-4e88-b2d4-63a199aa61a1.json b/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1_instruct-v0.6.0/b39e14a6-c05f-4e88-b2d4-63a199aa61a1.json deleted file mode 100644 index 5d319d532..000000000 --- a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1_instruct-v0.6.0/b39e14a6-c05f-4e88-b2d4-63a199aa61a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NAPS-ai_naps-llama-3_1_instruct-v0.6.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "naps-llama-3_1_instruct-v0.6.0", - "id": "NAPS-ai/naps-llama-3_1_instruct-v0.6.0", - "developer": "NAPS-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": 
"MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3241 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NAPS-ai/naps-llama3.1-70B-v0.2-fp16/39893637-552a-48d8-9b83-433415eb26c3.json b/data/hfopenllm_v2/NAPS-ai/naps-llama3.1-70B-v0.2-fp16/39893637-552a-48d8-9b83-433415eb26c3.json deleted file mode 100644 index fa9e2a0a2..000000000 --- a/data/hfopenllm_v2/NAPS-ai/naps-llama3.1-70B-v0.2-fp16/39893637-552a-48d8-9b83-433415eb26c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NAPS-ai_naps-llama3.1-70B-v0.2-fp16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "naps-llama3.1-70B-v0.2-fp16", - "id": "NAPS-ai/naps-llama3.1-70B-v0.2-fp16", - "developer": "NAPS-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.761 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1845 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3041 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1099 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NCSOFT/Llama-VARCO-8B-Instruct/f9549713-f487-4e26-bfeb-ec6d394b7014.json b/data/hfopenllm_v2/NCSOFT/Llama-VARCO-8B-Instruct/f9549713-f487-4e26-bfeb-ec6d394b7014.json deleted file mode 100644 index 515a08f92..000000000 --- a/data/hfopenllm_v2/NCSOFT/Llama-VARCO-8B-Instruct/f9549713-f487-4e26-bfeb-ec6d394b7014.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NCSOFT_Llama-VARCO-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-VARCO-8B-Instruct", - "id": "NCSOFT/Llama-VARCO-8B-Instruct", - "developer": "NCSOFT", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.447 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5023 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3841 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.319 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NJS26/NJS_777/02579c41-f117-4412-9c00-ee7db3e9ab97.json b/data/hfopenllm_v2/NJS26/NJS_777/02579c41-f117-4412-9c00-ee7db3e9ab97.json deleted file mode 100644 index bf0dad5d2..000000000 --- a/data/hfopenllm_v2/NJS26/NJS_777/02579c41-f117-4412-9c00-ee7db3e9ab97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NJS26_NJS_777/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NJS_777", - "id": "NJS26/NJS_777", - "developer": "NJS26", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 10.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1881 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2064 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/NLPark/AnFeng_v3.1-Avocet/bfa1d761-00aa-4438-a5de-972d934c63d5.json b/data/hfopenllm_v2/NLPark/AnFeng_v3.1-Avocet/bfa1d761-00aa-4438-a5de-972d934c63d5.json deleted file mode 100644 index d4c3c3a62..000000000 --- a/data/hfopenllm_v2/NLPark/AnFeng_v3.1-Avocet/bfa1d761-00aa-4438-a5de-972d934c63d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NLPark_AnFeng_v3.1-Avocet/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AnFeng_v3.1-Avocet", - "id": "NLPark/AnFeng_v3.1-Avocet", - "developer": "NLPark", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.393 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5096 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5829 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4438 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NLPark/B-and-W_Flycatcher-3AD1E/20a84d88-05c2-4e02-8c84-2afa84cc659f.json b/data/hfopenllm_v2/NLPark/B-and-W_Flycatcher-3AD1E/20a84d88-05c2-4e02-8c84-2afa84cc659f.json deleted file mode 100644 index 17941cc70..000000000 --- 
a/data/hfopenllm_v2/NLPark/B-and-W_Flycatcher-3AD1E/20a84d88-05c2-4e02-8c84-2afa84cc659f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NLPark_B-and-W_Flycatcher-3AD1E/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "B-and-W_Flycatcher-3AD1E", - "id": "NLPark/B-and-W_Flycatcher-3AD1E", - "developer": "NLPark", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4908 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6065 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2379 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4423 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4741 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NLPark/Shi-Ci-Robin-Test_3AD80/84eedce3-3a93-4630-b914-aa281fd2efda.json b/data/hfopenllm_v2/NLPark/Shi-Ci-Robin-Test_3AD80/84eedce3-3a93-4630-b914-aa281fd2efda.json deleted file mode 100644 index b7883ce1e..000000000 --- a/data/hfopenllm_v2/NLPark/Shi-Ci-Robin-Test_3AD80/84eedce3-3a93-4630-b914-aa281fd2efda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/NLPark_Shi-Ci-Robin-Test_3AD80/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Shi-Ci-Robin-Test_3AD80", - "id": "NLPark/Shi-Ci-Robin-Test_3AD80", - "developer": "NLPark", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7227 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6705 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4696 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NTQAI/NxMobileLM-1.5B-SFT/b3b7b62f-ac82-4ef9-9634-afb81645ec19.json b/data/hfopenllm_v2/NTQAI/NxMobileLM-1.5B-SFT/b3b7b62f-ac82-4ef9-9634-afb81645ec19.json deleted file mode 100644 index ddf2ce282..000000000 --- a/data/hfopenllm_v2/NTQAI/NxMobileLM-1.5B-SFT/b3b7b62f-ac82-4ef9-9634-afb81645ec19.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NTQAI_NxMobileLM-1.5B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NxMobileLM-1.5B-SFT", - "id": "NTQAI/NxMobileLM-1.5B-SFT", - "developer": "NTQAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6392 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3957 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NTQAI/Nxcode-CQ-7B-orpo/283c5166-b9c5-4d20-9653-0cd0346d87c1.json b/data/hfopenllm_v2/NTQAI/Nxcode-CQ-7B-orpo/283c5166-b9c5-4d20-9653-0cd0346d87c1.json deleted file mode 100644 index 61343896e..000000000 --- a/data/hfopenllm_v2/NTQAI/Nxcode-CQ-7B-orpo/283c5166-b9c5-4d20-9653-0cd0346d87c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NTQAI_Nxcode-CQ-7B-orpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nxcode-CQ-7B-orpo", - "id": "NTQAI/Nxcode-CQ-7B-orpo", - "developer": "NTQAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - 
"params_billions": 7.25 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1612 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NYTK/PULI-GPTrio/478b54cd-6410-41e5-8a53-4e46bcd9d7af.json b/data/hfopenllm_v2/NYTK/PULI-GPTrio/478b54cd-6410-41e5-8a53-4e46bcd9d7af.json deleted file mode 100644 index 513daf9dc..000000000 --- a/data/hfopenllm_v2/NYTK/PULI-GPTrio/478b54cd-6410-41e5-8a53-4e46bcd9d7af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NYTK_PULI-GPTrio/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PULI-GPTrio", - "id": "NYTK/PULI-GPTrio", - "developer": "NYTK", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 7.673 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.218 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NYTK/PULI-LlumiX-32K/de2ae7a9-93eb-4149-b3ff-b5b7dfba29c4.json b/data/hfopenllm_v2/NYTK/PULI-LlumiX-32K/de2ae7a9-93eb-4149-b3ff-b5b7dfba29c4.json deleted file mode 100644 index 373fe82fa..000000000 --- a/data/hfopenllm_v2/NYTK/PULI-LlumiX-32K/de2ae7a9-93eb-4149-b3ff-b5b7dfba29c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NYTK_PULI-LlumiX-32K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PULI-LlumiX-32K", - "id": "NYTK/PULI-LlumiX-32K", - "developer": "NYTK", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.17 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1681 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Naveenpoliasetty/llama3-8B-V2/ef5aa9db-804b-4a53-9c22-9c99f6c69eeb.json b/data/hfopenllm_v2/Naveenpoliasetty/llama3-8B-V2/ef5aa9db-804b-4a53-9c22-9c99f6c69eeb.json deleted file mode 100644 index 6dbee699d..000000000 --- a/data/hfopenllm_v2/Naveenpoliasetty/llama3-8B-V2/ef5aa9db-804b-4a53-9c22-9c99f6c69eeb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Naveenpoliasetty_llama3-8B-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3-8B-V2", - "id": "Naveenpoliasetty/llama3-8B-V2", - "developer": "Naveenpoliasetty", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": 
"MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-Instruct/553fd36d-08dd-46a3-ab04-77b9039e7921.json b/data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-Instruct/553fd36d-08dd-46a3-ab04-77b9039e7921.json deleted file mode 100644 index b291ca255..000000000 --- a/data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-Instruct/553fd36d-08dd-46a3-ab04-77b9039e7921.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NbAiLab_nb-llama-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nb-llama-3.1-8B-Instruct", - "id": "NbAiLab/nb-llama-3.1-8B-Instruct", - "developer": "NbAiLab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-sft/e2bae853-cc0f-456a-a635-98d5f87ac47c.json b/data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-sft/e2bae853-cc0f-456a-a635-98d5f87ac47c.json deleted file mode 100644 index c5be98399..000000000 --- a/data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-sft/e2bae853-cc0f-456a-a635-98d5f87ac47c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NbAiLab_nb-llama-3.1-8B-sft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nb-llama-3.1-8B-sft", - "id": "NbAiLab/nb-llama-3.1-8B-sft", - "developer": "NbAiLab", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1222 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nekochu/Llama-3.1-8B-German-ORPO/d6c5f196-c97b-4a0a-81b0-59143ec4b10e.json b/data/hfopenllm_v2/Nekochu/Llama-3.1-8B-German-ORPO/d6c5f196-c97b-4a0a-81b0-59143ec4b10e.json deleted file mode 100644 index 95ea21ef4..000000000 --- a/data/hfopenllm_v2/Nekochu/Llama-3.1-8B-German-ORPO/d6c5f196-c97b-4a0a-81b0-59143ec4b10e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nekochu_Llama-3.1-8B-German-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-German-ORPO", - "id": "Nekochu/Llama-3.1-8B-German-ORPO", - "developer": "Nekochu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nekochu/Llama-3.1-8B-french-DPO/5d92e02f-b590-4b6b-8c64-30690f79e916.json b/data/hfopenllm_v2/Nekochu/Llama-3.1-8B-french-DPO/5d92e02f-b590-4b6b-8c64-30690f79e916.json deleted file mode 100644 index ccbbfa608..000000000 --- a/data/hfopenllm_v2/Nekochu/Llama-3.1-8B-french-DPO/5d92e02f-b590-4b6b-8c64-30690f79e916.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nekochu_Llama-3.1-8B-french-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-french-DPO", - "id": "Nekochu/Llama-3.1-8B-french-DPO", - "developer": "Nekochu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4656 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4216 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nekochu/Luminia-13B-v3/e10f38df-b5d5-47c6-924f-563c6f8a6616.json b/data/hfopenllm_v2/Nekochu/Luminia-13B-v3/e10f38df-b5d5-47c6-924f-563c6f8a6616.json deleted file mode 100644 index 5012f1af5..000000000 --- a/data/hfopenllm_v2/Nekochu/Luminia-13B-v3/e10f38df-b5d5-47c6-924f-563c6f8a6616.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nekochu_Luminia-13B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminia-13B-v3", - "id": "Nekochu/Luminia-13B-v3", - "developer": "Nekochu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2523 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3983 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2215 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nekochu/Luminia-8B-RP/27257dc9-750c-4673-8865-986434bc5c0e.json b/data/hfopenllm_v2/Nekochu/Luminia-8B-RP/27257dc9-750c-4673-8865-986434bc5c0e.json deleted file mode 100644 index 1ed86080e..000000000 --- a/data/hfopenllm_v2/Nekochu/Luminia-8B-RP/27257dc9-750c-4673-8865-986434bc5c0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nekochu_Luminia-8B-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminia-8B-RP", - "id": "Nekochu/Luminia-8B-RP", - "developer": "Nekochu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5574 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5218 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-12B/e599f3f8-e5eb-4bfe-a102-efc5a967434d.json b/data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-12B/e599f3f8-e5eb-4bfe-a102-efc5a967434d.json deleted file mode 100644 index 69ad8d773..000000000 --- a/data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-12B/e599f3f8-e5eb-4bfe-a102-efc5a967434d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NeverSleep_Lumimaid-v0.2-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lumimaid-v0.2-12B", - "id": "NeverSleep/Lumimaid-v0.2-12B", - "developer": "NeverSleep", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1099 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4821 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3511 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-8B/8e56f2dd-49d0-4eff-beea-53d01cd96f0e.json b/data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-8B/8e56f2dd-49d0-4eff-beea-53d01cd96f0e.json deleted file mode 100644 index e40a35f70..000000000 --- 
a/data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-8B/8e56f2dd-49d0-4eff-beea-53d01cd96f0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NeverSleep_Lumimaid-v0.2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lumimaid-v0.2-8B", - "id": "NeverSleep/Lumimaid-v0.2-8B", - "developer": "NeverSleep", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5038 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5238 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3636 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Dolphin3.0-Llama3.1-1B-abliterated/f1a2b5d0-2c8a-4bbc-8bc5-0484485c2dad.json b/data/hfopenllm_v2/Nexesenex/Dolphin3.0-Llama3.1-1B-abliterated/f1a2b5d0-2c8a-4bbc-8bc5-0484485c2dad.json deleted file mode 100644 index f3573a1fa..000000000 --- a/data/hfopenllm_v2/Nexesenex/Dolphin3.0-Llama3.1-1B-abliterated/f1a2b5d0-2c8a-4bbc-8bc5-0484485c2dad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Nexesenex_Dolphin3.0-Llama3.1-1B-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dolphin3.0-Llama3.1-1B-abliterated", - "id": "Nexesenex/Dolphin3.0-Llama3.1-1B-abliterated", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5312 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3237 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_Prev_v1.0/2c12ee67-0c77-4cb2-9e88-1c731ed55c3f.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_Prev_v1.0/2c12ee67-0c77-4cb2-9e88-1c731ed55c3f.json deleted file mode 100644 index cd74ed8ee..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_Prev_v1.0/2c12ee67-0c77-4cb2-9e88-1c731ed55c3f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DeepDive_3_Prev_v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DeepDive_3_Prev_v1.0", - "id": "Nexesenex/Llama_3.1_8b_DeepDive_3_Prev_v1.0", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6809 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5155 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1866 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0/567f8f54-225f-4d9b-be06-f24091adc1e6.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0/567f8f54-225f-4d9b-be06-f24091adc1e6.json deleted file mode 100644 index 7e171d5ad..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0/567f8f54-225f-4d9b-be06-f24091adc1e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0", - "id": "Nexesenex/Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7101 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3441 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DobHerWild_R1_v1.1R/ebb59730-9522-4c45-8f42-c0d941fd728c.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DobHerWild_R1_v1.1R/ebb59730-9522-4c45-8f42-c0d941fd728c.json deleted file mode 100644 index 2ed4134e3..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DobHerWild_R1_v1.1R/ebb59730-9522-4c45-8f42-c0d941fd728c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DobHerWild_R1_v1.1R/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DobHerWild_R1_v1.1R", - "id": "Nexesenex/Llama_3.1_8b_DobHerWild_R1_v1.1R", - "developer": 
"Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5257 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2319 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.01/2c44fa8c-ebd3-4ea6-8578-61da38965c09.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.01/2c44fa8c-ebd3-4ea6-8578-61da38965c09.json deleted file mode 100644 index d6626e33b..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.01/2c44fa8c-ebd3-4ea6-8578-61da38965c09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DoberWild_v2.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DoberWild_v2.01", - "id": "Nexesenex/Llama_3.1_8b_DoberWild_v2.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7996 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5251 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2002 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4012 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.02/3ef26b8c-6bfb-457b-a160-a65c3cc8b0c6.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.02/3ef26b8c-6bfb-457b-a160-a65c3cc8b0c6.json deleted file mode 100644 index f57ee9701..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.02/3ef26b8c-6bfb-457b-a160-a65c3cc8b0c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DoberWild_v2.02/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DoberWild_v2.02", - "id": "Nexesenex/Llama_3.1_8b_DoberWild_v2.02", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7746 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5313 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1994 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.03/0ab721ba-fbda-44ca-a349-1d3abfaabe62.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.03/0ab721ba-fbda-44ca-a349-1d3abfaabe62.json deleted file mode 100644 index 7306b4374..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.03/0ab721ba-fbda-44ca-a349-1d3abfaabe62.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DoberWild_v2.03/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DoberWild_v2.03", - "id": "Nexesenex/Llama_3.1_8b_DoberWild_v2.03", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7764 - } - 
}, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5294 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3906 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3722 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.01/2fea1128-4f0c-40d8-be87-72c42c0648fb.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.01/2fea1128-4f0c-40d8-be87-72c42c0648fb.json deleted file mode 100644 index a603278e4..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.01/2fea1128-4f0c-40d8-be87-72c42c0648fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DodoWild_v2.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DodoWild_v2.01", - "id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7978 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5253 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1986 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.02/db9dc9d2-4aa2-43d0-9f2e-15fbd05af62c.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.02/db9dc9d2-4aa2-43d0-9f2e-15fbd05af62c.json deleted file mode 100644 index afa87e2bb..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.02/db9dc9d2-4aa2-43d0-9f2e-15fbd05af62c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DodoWild_v2.02/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DodoWild_v2.02", - "id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.02", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - }, - { - "evaluation_name": "MATH 
Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3971 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.03/28399fd0-840c-49d3-8179-407ed83d3bfc.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.03/28399fd0-840c-49d3-8179-407ed83d3bfc.json deleted file mode 100644 index 7503f6ebc..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.03/28399fd0-840c-49d3-8179-407ed83d3bfc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DodoWild_v2.03/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DodoWild_v2.03", - "id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.03", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2221 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3786 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.10/d7108c13-e14a-4366-9a39-204f853b1bee.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.10/d7108c13-e14a-4366-9a39-204f853b1bee.json deleted file mode 100644 index cda2e5b7a..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.10/d7108c13-e14a-4366-9a39-204f853b1bee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DodoWild_v2.10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DodoWild_v2.10", - "id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.10", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8054 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1971 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4157 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3855 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.01/56152d05-9273-4701-8c0a-723e2cab618d.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.01/56152d05-9273-4701-8c0a-723e2cab618d.json deleted file mode 100644 index 9d16585a7..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.01/56152d05-9273-4701-8c0a-723e2cab618d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Dolermed_R1_V1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Dolermed_R1_V1.01", - "id": "Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7534 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5312 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3747 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.03/55d2f23d-cb6c-42d2-8b57-837451d3c6df.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.03/55d2f23d-cb6c-42d2-8b57-837451d3c6df.json deleted file mode 100644 index d30924818..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.03/55d2f23d-cb6c-42d2-8b57-837451d3c6df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Dolermed_R1_V1.03/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Dolermed_R1_V1.03", - "id": "Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.03", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2092 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_V1.01/7479ae87-e795-4e20-848a-291614176def.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_V1.01/7479ae87-e795-4e20-848a-291614176def.json deleted file mode 100644 index 3796fa305..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_V1.01/7479ae87-e795-4e20-848a-291614176def.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Dolermed_V1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Dolermed_V1.01", - "id": "Nexesenex/Llama_3.1_8b_Dolermed_V1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5087 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3945 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolerstormed_V1.04/04ceb40e-bde8-487b-9d29-dc8f681af9be.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolerstormed_V1.04/04ceb40e-bde8-487b-9d29-dc8f681af9be.json deleted file mode 100644 index baf40f42c..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolerstormed_V1.04/04ceb40e-bde8-487b-9d29-dc8f681af9be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Dolerstormed_V1.04/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Dolerstormed_V1.04", - "id": "Nexesenex/Llama_3.1_8b_Dolerstormed_V1.04", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5195 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.403 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3889 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedash_R1_V1.04/e26b00b0-d9df-4ce2-a649-b19f8957b8ce.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedash_R1_V1.04/e26b00b0-d9df-4ce2-a649-b19f8957b8ce.json deleted file mode 100644 index 59b222af2..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedash_R1_V1.04/e26b00b0-d9df-4ce2-a649-b19f8957b8ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Hermedash_R1_V1.04/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Hermedash_R1_V1.04", - "id": "Nexesenex/Llama_3.1_8b_Hermedash_R1_V1.04", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7872 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5192 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1866 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.01/9954194c-69b5-4eb4-8b32-859845548cb0.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.01/9954194c-69b5-4eb4-8b32-859845548cb0.json deleted file mode 100644 index db5b6578a..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.01/9954194c-69b5-4eb4-8b32-859845548cb0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Hermedive_R1_V1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Hermedive_R1_V1.01", - "id": "Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5001 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5171 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4008 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.3427 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.03/2afbc279-242a-4276-85f0-facd29c2d89b.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.03/2afbc279-242a-4276-85f0-facd29c2d89b.json deleted file mode 100644 index 18a502512..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.03/2afbc279-242a-4276-85f0-facd29c2d89b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Hermedive_R1_V1.03/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Hermedive_R1_V1.03", - "id": "Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.03", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6648 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3613 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_V1.01/ba307ad4-3647-4785-9bf1-cd4dacf3c71f.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_V1.01/ba307ad4-3647-4785-9bf1-cd4dacf3c71f.json deleted file mode 100644 index fba2698e9..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_V1.01/ba307ad4-3647-4785-9bf1-cd4dacf3c71f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Hermedive_V1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Hermedive_V1.01", - "id": "Nexesenex/Llama_3.1_8b_Hermedive_V1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4918 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Mediver_V1.01/d03c73ca-7364-4517-aea4-f0ac564c49df.json 
b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Mediver_V1.01/d03c73ca-7364-4517-aea4-f0ac564c49df.json deleted file mode 100644 index 9a356ff72..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Mediver_V1.01/d03c73ca-7364-4517-aea4-f0ac564c49df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Mediver_V1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Mediver_V1.01", - "id": "Nexesenex/Llama_3.1_8b_Mediver_V1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1885 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3898 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2994 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Medusa_v1.01/1dd4b82a-ca80-4c9c-8800-f97ab2b9cbe7.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Medusa_v1.01/1dd4b82a-ca80-4c9c-8800-f97ab2b9cbe7.json deleted file mode 100644 index 08ab1831d..000000000 --- 
a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Medusa_v1.01/1dd4b82a-ca80-4c9c-8800-f97ab2b9cbe7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Medusa_v1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Medusa_v1.01", - "id": "Nexesenex/Llama_3.1_8b_Medusa_v1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5018 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1465 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4067 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3531 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_0.2_R1/f2363099-c39a-4874-bf77-ccc0fa087680.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_0.2_R1/f2363099-c39a-4874-bf77-ccc0fa087680.json deleted file mode 100644 index 8f98cf96c..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_0.2_R1/f2363099-c39a-4874-bf77-ccc0fa087680.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Nexesenex_Llama_3.1_8b_Smarteaz_0.2_R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Smarteaz_0.2_R1", - "id": "Nexesenex/Llama_3.1_8b_Smarteaz_0.2_R1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6346 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5113 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2606 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3645 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_V1.01/596eeee8-3600-4f8a-8888-978b610eb2ca.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_V1.01/596eeee8-3600-4f8a-8888-978b610eb2ca.json deleted file mode 100644 index c7e67d274..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_V1.01/596eeee8-3600-4f8a-8888-978b610eb2ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Smarteaz_V1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Smarteaz_V1.01", - "id": "Nexesenex/Llama_3.1_8b_Smarteaz_V1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8151 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2341 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3736 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Stormeder_v1.04/595ddba1-c450-4b69-85b7-0e3118c8c6c7.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Stormeder_v1.04/595ddba1-c450-4b69-85b7-0e3118c8c6c7.json deleted file mode 100644 index 0c3c8b8b1..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Stormeder_v1.04/595ddba1-c450-4b69-85b7-0e3118c8c6c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Stormeder_v1.04/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Stormeder_v1.04", - "id": 
"Nexesenex/Llama_3.1_8b_Stormeder_v1.04", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7853 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.185 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3949 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Typhoon_v1.03/64890314-bba0-4fb2-8c21-38b413cff4c8.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Typhoon_v1.03/64890314-bba0-4fb2-8c21-38b413cff4c8.json deleted file mode 100644 index 30c9c3431..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Typhoon_v1.03/64890314-bba0-4fb2-8c21-38b413cff4c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Typhoon_v1.03/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Typhoon_v1.03", - "id": "Nexesenex/Llama_3.1_8b_Typhoon_v1.03", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8078 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3815 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.1/470b8b0d-fbaf-408c-a28e-57d1b294f8a8.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.1/470b8b0d-fbaf-408c-a28e-57d1b294f8a8.json deleted file mode 100644 index 421f9e1fb..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.1/470b8b0d-fbaf-408c-a28e-57d1b294f8a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_AquaSyn_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_AquaSyn_0.1", - "id": "Nexesenex/Llama_3.2_1b_AquaSyn_0.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2741 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3284 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.11/00a1579e-8636-4eca-9a63-c0b067a5f3dc.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.11/00a1579e-8636-4eca-9a63-c0b067a5f3dc.json deleted file mode 100644 index 9ccdfaea8..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.11/00a1579e-8636-4eca-9a63-c0b067a5f3dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_AquaSyn_0.11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_AquaSyn_0.11", - "id": "Nexesenex/Llama_3.2_1b_AquaSyn_0.11", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2431 - 
} - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Dolto_0.1/a52cc4c9-6d60-4083-ac77-591e247d86c9.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Dolto_0.1/a52cc4c9-6d60-4083-ac77-591e247d86c9.json deleted file mode 100644 index 8e04a3f3b..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Dolto_0.1/a52cc4c9-6d60-4083-ac77-591e247d86c9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_Dolto_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_Dolto_0.1", - "id": "Nexesenex/Llama_3.2_1b_Dolto_0.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5434 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.335 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2374 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1.01/ac5c321a-d35a-4e0f-a1be-bcc0b7109f91.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1.01/ac5c321a-d35a-4e0f-a1be-bcc0b7109f91.json deleted file mode 100644 index d44a7f415..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1.01/ac5c321a-d35a-4e0f-a1be-bcc0b7109f91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_Odyssea_V1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_Odyssea_V1.01", - "id": "Nexesenex/Llama_3.2_1b_Odyssea_V1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2495 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { 
- "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1/c4d11b01-ae5b-4198-b102-07160f100a41.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1/c4d11b01-ae5b-4198-b102-07160f100a41.json deleted file mode 100644 index 4efe276e5..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1/c4d11b01-ae5b-4198-b102-07160f100a41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_Odyssea_V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_Odyssea_V1", - "id": "Nexesenex/Llama_3.2_1b_Odyssea_V1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2553 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.301 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1153 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OpenTree_R1_0.1/19405ead-2263-4613-8053-43beeafb4bfc.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OpenTree_R1_0.1/19405ead-2263-4613-8053-43beeafb4bfc.json deleted file mode 100644 index d058404ed..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OpenTree_R1_0.1/19405ead-2263-4613-8053-43beeafb4bfc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_OpenTree_R1_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_OpenTree_R1_0.1", - "id": "Nexesenex/Llama_3.2_1b_OpenTree_R1_0.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3131 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1675 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OrcaSun_V1/6c698a60-a813-4be7-b55f-b684029b492d.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OrcaSun_V1/6c698a60-a813-4be7-b55f-b684029b492d.json deleted file mode 100644 index 377ee7131..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OrcaSun_V1/6c698a60-a813-4be7-b55f-b684029b492d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_OrcaSun_V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_OrcaSun_V1", - "id": "Nexesenex/Llama_3.2_1b_OrcaSun_V1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.355 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1904 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_RandomLego_RP_R1_0.1/b67c4a44-7787-45e2-b88c-5d7e8e496fa3.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_RandomLego_RP_R1_0.1/b67c4a44-7787-45e2-b88c-5d7e8e496fa3.json deleted file mode 100644 index 16d96b077..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_RandomLego_RP_R1_0.1/b67c4a44-7787-45e2-b88c-5d7e8e496fa3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_RandomLego_RP_R1_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_RandomLego_RP_R1_0.1", - "id": "Nexesenex/Llama_3.2_1b_RandomLego_RP_R1_0.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5543 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3428 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_SunOrca_V1/a20a529e-c52e-41b7-a8ee-909167048bfb.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_SunOrca_V1/a20a529e-c52e-41b7-a8ee-909167048bfb.json deleted file mode 100644 index 2c9dd24ef..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_SunOrca_V1/a20a529e-c52e-41b7-a8ee-909167048bfb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_SunOrca_V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_SunOrca_V1", - "id": "Nexesenex/Llama_3.2_1b_SunOrca_V1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.543 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1884 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Sydonia_0.1/2735e6f4-839f-4ab1-8ede-3447891b1b26.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Sydonia_0.1/2735e6f4-839f-4ab1-8ede-3447891b1b26.json deleted file mode 100644 index 794afd099..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Sydonia_0.1/2735e6f4-839f-4ab1-8ede-3447891b1b26.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_Sydonia_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_Sydonia_0.1", - "id": "Nexesenex/Llama_3.2_1b_Sydonia_0.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2197 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2282 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Syneridol_0.2/e74e7e7f-8550-4cba-97cd-2626c82d6b29.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Syneridol_0.2/e74e7e7f-8550-4cba-97cd-2626c82d6b29.json deleted file mode 100644 index b39cc7c79..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Syneridol_0.2/e74e7e7f-8550-4cba-97cd-2626c82d6b29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_Syneridol_0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_Syneridol_0.2", - "id": "Nexesenex/Llama_3.2_1b_Syneridol_0.2", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2157 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3139 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1227 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.1/14f4c00d-8915-413d-8e85-79f395127682.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.1/14f4c00d-8915-413d-8e85-79f395127682.json deleted file mode 100644 index 0b4dbadaf..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.1/14f4c00d-8915-413d-8e85-79f395127682.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_Synopsys_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_Synopsys_0.1", - "id": "Nexesenex/Llama_3.2_1b_Synopsys_0.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1764 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3162 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.11/9119b586-d3b2-4ce0-a243-d584e2087184.json 
b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.11/9119b586-d3b2-4ce0-a243-d584e2087184.json deleted file mode 100644 index ff16ce048..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.11/9119b586-d3b2-4ce0-a243-d584e2087184.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_Synopsys_0.11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_Synopsys_0.11", - "id": "Nexesenex/Llama_3.2_1b_Synopsys_0.11", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2842 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3102 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3513 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v1/629f3f1a-f8ee-4d1b-b604-7bbd35c6517b.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v1/629f3f1a-f8ee-4d1b-b604-7bbd35c6517b.json deleted file mode 100644 index 66193208d..000000000 --- 
a/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v1/629f3f1a-f8ee-4d1b-b604-7bbd35c6517b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_3b_Kermes_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_3b_Kermes_v1", - "id": "Nexesenex/Llama_3.2_3b_Kermes_v1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4852 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2547 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2.1/a6ac828c-904b-413a-a5fa-a5ed06a28143.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2.1/a6ac828c-904b-413a-a5fa-a5ed06a28143.json deleted file mode 100644 index feb5d98bb..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2.1/a6ac828c-904b-413a-a5fa-a5ed06a28143.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Nexesenex_Llama_3.2_3b_Kermes_v2.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_3b_Kermes_v2.1", - "id": "Nexesenex/Llama_3.2_3b_Kermes_v2.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5584 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4464 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2692 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2/251a3ef9-c7ae-4d79-8a60-4bc021a3f001.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2/251a3ef9-c7ae-4d79-8a60-4bc021a3f001.json deleted file mode 100644 index 294b9bac9..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2/251a3ef9-c7ae-4d79-8a60-4bc021a3f001.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_3b_Kermes_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_3b_Kermes_v2", - "id": "Nexesenex/Llama_3.2_3b_Kermes_v2", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4455 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_Halo_0.1/962b48a3-23d7-4104-b34d-4e5c2af31d58.json b/data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_Halo_0.1/962b48a3-23d7-4104-b34d-4e5c2af31d58.json deleted file mode 100644 index d9a752984..000000000 --- a/data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_Halo_0.1/962b48a3-23d7-4104-b34d-4e5c2af31d58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Nemotron_W_4b_Halo_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemotron_W_4b_Halo_0.1", - "id": "Nexesenex/Nemotron_W_4b_Halo_0.1", - "developer": "Nexesenex", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.513 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3627 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4165 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_MagLight_0.1/e4b0be31-6f9a-4a57-b433-e561da9bd827.json b/data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_MagLight_0.1/e4b0be31-6f9a-4a57-b433-e561da9bd827.json deleted file mode 100644 index 7186f9001..000000000 --- a/data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_MagLight_0.1/e4b0be31-6f9a-4a57-b433-e561da9bd827.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Nemotron_W_4b_MagLight_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemotron_W_4b_MagLight_0.1", - "id": "Nexesenex/Nemotron_W_4b_MagLight_0.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.513 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4112 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2545 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Qwen_2.5_3b_Smarteaz_0.01a/9a31f208-b7d8-4baa-b96e-99926ecb35af.json b/data/hfopenllm_v2/Nexesenex/Qwen_2.5_3b_Smarteaz_0.01a/9a31f208-b7d8-4baa-b96e-99926ecb35af.json deleted file mode 100644 index 8ea2f8444..000000000 --- a/data/hfopenllm_v2/Nexesenex/Qwen_2.5_3b_Smarteaz_0.01a/9a31f208-b7d8-4baa-b96e-99926ecb35af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Qwen_2.5_3b_Smarteaz_0.01a/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_2.5_3b_Smarteaz_0.01a", - "id": "Nexesenex/Qwen_2.5_3b_Smarteaz_0.01a", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4012 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.286 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL/8d933df1-60cb-471d-bfc3-b11c93150203.json b/data/hfopenllm_v2/Nexesenex/pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL/8d933df1-60cb-471d-bfc3-b11c93150203.json deleted file mode 100644 index 494bbaf5c..000000000 --- a/data/hfopenllm_v2/Nexesenex/pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL/8d933df1-60cb-471d-bfc3-b11c93150203.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL", - "id": "Nexesenex/pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3562 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1803 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexusflow/NexusRaven-V2-13B/35315c3a-ec06-433a-b3fa-ae7a4a59b7ea.json b/data/hfopenllm_v2/Nexusflow/NexusRaven-V2-13B/35315c3a-ec06-433a-b3fa-ae7a4a59b7ea.json deleted file mode 100644 index f72a2ef92..000000000 --- a/data/hfopenllm_v2/Nexusflow/NexusRaven-V2-13B/35315c3a-ec06-433a-b3fa-ae7a4a59b7ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexusflow_NexusRaven-V2-13B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NexusRaven-V2-13B", - "id": "Nexusflow/NexusRaven-V2-13B", - "developer": "Nexusflow", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3949 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1872 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-1epoch/3530db9a-0d61-4cf8-9fff-b15f6488c845.json b/data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-1epoch/3530db9a-0d61-4cf8-9fff-b15f6488c845.json deleted file mode 100644 index 24a1eb210..000000000 --- a/data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-1epoch/3530db9a-0d61-4cf8-9fff-b15f6488c845.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NikolaSigmoid_AceMath-1.5B-Instruct-1epoch/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceMath-1.5B-Instruct-1epoch", - "id": "NikolaSigmoid/AceMath-1.5B-Instruct-1epoch", - "developer": "NikolaSigmoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.791 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2849 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.4263 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3925 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-dolphin-r1-200/7d9901e0-eafe-4d49-a5bb-fab059708bcb.json b/data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-dolphin-r1-200/7d9901e0-eafe-4d49-a5bb-fab059708bcb.json deleted file mode 100644 index 797f0d872..000000000 --- a/data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-dolphin-r1-200/7d9901e0-eafe-4d49-a5bb-fab059708bcb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NikolaSigmoid_AceMath-1.5B-Instruct-dolphin-r1-200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceMath-1.5B-Instruct-dolphin-r1-200", - "id": "NikolaSigmoid/AceMath-1.5B-Instruct-dolphin-r1-200", - "developer": "NikolaSigmoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.928 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1808 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2815 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NikolaSigmoid/DeepSeek-R1-Distill-Qwen-1.5B-500/ee7f9025-bb2c-4902-b8e2-bfac2b63d2fd.json b/data/hfopenllm_v2/NikolaSigmoid/DeepSeek-R1-Distill-Qwen-1.5B-500/ee7f9025-bb2c-4902-b8e2-bfac2b63d2fd.json deleted file mode 100644 index 149206bb9..000000000 --- a/data/hfopenllm_v2/NikolaSigmoid/DeepSeek-R1-Distill-Qwen-1.5B-500/ee7f9025-bb2c-4902-b8e2-bfac2b63d2fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NikolaSigmoid_DeepSeek-R1-Distill-Qwen-1.5B-500/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-1.5B-500", - "id": "NikolaSigmoid/DeepSeek-R1-Distill-Qwen-1.5B-500", - "developer": "NikolaSigmoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.157 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1749 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2602 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NikolaSigmoid/acemath-200/6157f79e-2673-4ad6-99d7-e5cf5e4e1db2.json b/data/hfopenllm_v2/NikolaSigmoid/acemath-200/6157f79e-2673-4ad6-99d7-e5cf5e4e1db2.json deleted file mode 100644 index 947e95059..000000000 --- a/data/hfopenllm_v2/NikolaSigmoid/acemath-200/6157f79e-2673-4ad6-99d7-e5cf5e4e1db2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NikolaSigmoid_acemath-200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "acemath-200", - "id": "NikolaSigmoid/acemath-200", - "developer": "NikolaSigmoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.791 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2849 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4263 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3925 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NikolaSigmoid/phi-4-14b/0aa7572c-1aa6-4997-a2a2-3b557fbde639.json b/data/hfopenllm_v2/NikolaSigmoid/phi-4-14b/0aa7572c-1aa6-4997-a2a2-3b557fbde639.json deleted file mode 100644 index 61e7f3fc3..000000000 --- a/data/hfopenllm_v2/NikolaSigmoid/phi-4-14b/0aa7572c-1aa6-4997-a2a2-3b557fbde639.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NikolaSigmoid_phi-4-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-14b", - "id": "NikolaSigmoid/phi-4-14b", - "developer": "NikolaSigmoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "", - "params_billions": 14.704 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6695 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2938 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4035 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NikolaSigmoid/phi-4-1steps/6f5df760-2d3e-47b1-b55e-4031a5f11d41.json b/data/hfopenllm_v2/NikolaSigmoid/phi-4-1steps/6f5df760-2d3e-47b1-b55e-4031a5f11d41.json deleted file mode 100644 index eba7dc221..000000000 --- a/data/hfopenllm_v2/NikolaSigmoid/phi-4-1steps/6f5df760-2d3e-47b1-b55e-4031a5f11d41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NikolaSigmoid_phi-4-1steps/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-1steps", - "id": "NikolaSigmoid/phi-4-1steps", - "developer": "NikolaSigmoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "", - "params_billions": 14.704 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6707 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2983 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5273 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NikolaSigmoid/phi-4-300steps/ac676b03-c3ce-4ff1-83fc-5c8db82f1497.json b/data/hfopenllm_v2/NikolaSigmoid/phi-4-300steps/ac676b03-c3ce-4ff1-83fc-5c8db82f1497.json deleted file mode 100644 index d245536d7..000000000 --- a/data/hfopenllm_v2/NikolaSigmoid/phi-4-300steps/ac676b03-c3ce-4ff1-83fc-5c8db82f1497.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NikolaSigmoid_phi-4-300steps/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-300steps", - "id": "NikolaSigmoid/phi-4-300steps", - "developer": "NikolaSigmoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "", - "params_billions": 14.704 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6701 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2946 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nitral-AI/Captain-Eris-BMO_Violent-GRPO-v0.420/2229cdf8-3ecb-4f11-8824-9c3bfbf6f968.json b/data/hfopenllm_v2/Nitral-AI/Captain-Eris-BMO_Violent-GRPO-v0.420/2229cdf8-3ecb-4f11-8824-9c3bfbf6f968.json deleted file mode 100644 index 021208aeb..000000000 --- a/data/hfopenllm_v2/Nitral-AI/Captain-Eris-BMO_Violent-GRPO-v0.420/2229cdf8-3ecb-4f11-8824-9c3bfbf6f968.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Captain-Eris-BMO_Violent-GRPO-v0.420/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Captain-Eris-BMO_Violent-GRPO-v0.420", - "id": "Nitral-AI/Captain-Eris-BMO_Violent-GRPO-v0.420", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6313 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5079 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nitral-AI/Captain-Eris_BMO-Violent-12B/95ebc5b8-a541-4fca-9e7c-692720e73362.json b/data/hfopenllm_v2/Nitral-AI/Captain-Eris_BMO-Violent-12B/95ebc5b8-a541-4fca-9e7c-692720e73362.json deleted file mode 100644 index 52a63c6ba..000000000 --- a/data/hfopenllm_v2/Nitral-AI/Captain-Eris_BMO-Violent-12B/95ebc5b8-a541-4fca-9e7c-692720e73362.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Captain-Eris_BMO-Violent-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Captain-Eris_BMO-Violent-12B", - "id": "Nitral-AI/Captain-Eris_BMO-Violent-12B", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6152 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5104 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4255 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3571 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-GRPO-v0.420/09a2508d-a171-493f-9ff2-e7f375815c91.json b/data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-GRPO-v0.420/09a2508d-a171-493f-9ff2-e7f375815c91.json deleted file mode 100644 index 4d63cdc2b..000000000 --- a/data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-GRPO-v0.420/09a2508d-a171-493f-9ff2-e7f375815c91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Captain-Eris_Violet-GRPO-v0.420/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Captain-Eris_Violet-GRPO-v0.420", - "id": "Nitral-AI/Captain-Eris_Violet-GRPO-v0.420", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6262 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5159 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-V0.420-12B/12a4a921-5859-4fd6-9d64-677a7d8ef696.json 
b/data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-V0.420-12B/12a4a921-5859-4fd6-9d64-677a7d8ef696.json deleted file mode 100644 index 8679f45f6..000000000 --- a/data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-V0.420-12B/12a4a921-5859-4fd6-9d64-677a7d8ef696.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Captain-Eris_Violet-V0.420-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Captain-Eris_Violet-V0.420-12B", - "id": "Nitral-AI/Captain-Eris_Violet-V0.420-12B", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4331 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nitral-AI/Captain_BMO-12B/b79f12d0-cdfc-4c9d-a88b-40612dcbf64d.json b/data/hfopenllm_v2/Nitral-AI/Captain_BMO-12B/b79f12d0-cdfc-4c9d-a88b-40612dcbf64d.json deleted file mode 100644 index ab08d38f3..000000000 --- 
a/data/hfopenllm_v2/Nitral-AI/Captain_BMO-12B/b79f12d0-cdfc-4c9d-a88b-40612dcbf64d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Captain_BMO-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Captain_BMO-12B", - "id": "Nitral-AI/Captain_BMO-12B", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4751 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5286 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3748 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3569 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nitral-AI/Hathor_Stable-v0.2-L3-8B/d162cf7c-3ef4-420f-aab4-789a98b1195a.json b/data/hfopenllm_v2/Nitral-AI/Hathor_Stable-v0.2-L3-8B/d162cf7c-3ef4-420f-aab4-789a98b1195a.json deleted file mode 100644 index 74bd3f7c0..000000000 --- a/data/hfopenllm_v2/Nitral-AI/Hathor_Stable-v0.2-L3-8B/d162cf7c-3ef4-420f-aab4-789a98b1195a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Hathor_Stable-v0.2-L3-8B/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hathor_Stable-v0.2-L3-8B", - "id": "Nitral-AI/Hathor_Stable-v0.2-L3-8B", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7175 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5286 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3696 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nitral-AI/Hathor_Tahsin-L3-8B-v0.85/7e49018e-5e2d-4cdb-be5b-2ac04ec84bf5.json b/data/hfopenllm_v2/Nitral-AI/Hathor_Tahsin-L3-8B-v0.85/7e49018e-5e2d-4cdb-be5b-2ac04ec84bf5.json deleted file mode 100644 index b03d999be..000000000 --- a/data/hfopenllm_v2/Nitral-AI/Hathor_Tahsin-L3-8B-v0.85/7e49018e-5e2d-4cdb-be5b-2ac04ec84bf5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Hathor_Tahsin-L3-8B-v0.85/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Hathor_Tahsin-L3-8B-v0.85", - "id": "Nitral-AI/Hathor_Tahsin-L3-8B-v0.85", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5279 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1005 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nitral-AI/Nera_Noctis-12B/24677f2a-ea89-4289-bcb6-13699de9782f.json b/data/hfopenllm_v2/Nitral-AI/Nera_Noctis-12B/24677f2a-ea89-4289-bcb6-13699de9782f.json deleted file mode 100644 index 2cbdf842f..000000000 --- a/data/hfopenllm_v2/Nitral-AI/Nera_Noctis-12B/24677f2a-ea89-4289-bcb6-13699de9782f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Nera_Noctis-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nera_Noctis-12B", - "id": "Nitral-AI/Nera_Noctis-12B", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3468 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.1/3e09df3c-2224-4a29-8e55-18a485db2b25.json b/data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.1/3e09df3c-2224-4a29-8e55-18a485db2b25.json deleted file mode 100644 index a6a01b8ab..000000000 --- a/data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.1/3e09df3c-2224-4a29-8e55-18a485db2b25.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nohobby_MS-Schisandra-22B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MS-Schisandra-22B-v0.1", - "id": "Nohobby/MS-Schisandra-22B-v0.1", - "developer": "Nohobby", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.2/cc0bd236-8fc4-43d3-a18f-4b2afb112946.json b/data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.2/cc0bd236-8fc4-43d3-a18f-4b2afb112946.json deleted file mode 100644 index c97104188..000000000 --- a/data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.2/cc0bd236-8fc4-43d3-a18f-4b2afb112946.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nohobby_MS-Schisandra-22B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MS-Schisandra-22B-v0.2", - "id": "Nohobby/MS-Schisandra-22B-v0.2", - "developer": "Nohobby", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6383 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5841 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4075 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Norquinal/Alpha/5afd4c0f-b61d-452f-8c48-d298780d91d5.json b/data/hfopenllm_v2/Norquinal/Alpha/5afd4c0f-b61d-452f-8c48-d298780d91d5.json deleted file mode 100644 index a242e53cd..000000000 --- a/data/hfopenllm_v2/Norquinal/Alpha/5afd4c0f-b61d-452f-8c48-d298780d91d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Alpha", - "id": "Norquinal/Alpha", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2803 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3374 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Norquinal/Bravo/eac52141-4fd8-4e21-9c78-920ab8933e5a.json b/data/hfopenllm_v2/Norquinal/Bravo/eac52141-4fd8-4e21-9c78-920ab8933e5a.json deleted file mode 100644 index 4a140030a..000000000 --- a/data/hfopenllm_v2/Norquinal/Bravo/eac52141-4fd8-4e21-9c78-920ab8933e5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Bravo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bravo", - "id": "Norquinal/Bravo", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3025 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3558 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Norquinal/Charlie/8449837f-64ac-4293-b1f8-210e62779202.json b/data/hfopenllm_v2/Norquinal/Charlie/8449837f-64ac-4293-b1f8-210e62779202.json deleted file mode 100644 index eb7553fa4..000000000 --- a/data/hfopenllm_v2/Norquinal/Charlie/8449837f-64ac-4293-b1f8-210e62779202.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Charlie/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Charlie", - "id": "Norquinal/Charlie", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3061 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Norquinal/Delta/ab8a665c-8234-484f-a8a9-8ee79d73edff.json b/data/hfopenllm_v2/Norquinal/Delta/ab8a665c-8234-484f-a8a9-8ee79d73edff.json deleted file mode 100644 index bbcdf56d1..000000000 --- a/data/hfopenllm_v2/Norquinal/Delta/ab8a665c-8234-484f-a8a9-8ee79d73edff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Delta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Delta", - "id": "Norquinal/Delta", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2538 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2959 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Norquinal/Echo/a954242f-41a6-49d7-a71d-3bfe940cdb92.json b/data/hfopenllm_v2/Norquinal/Echo/a954242f-41a6-49d7-a71d-3bfe940cdb92.json deleted file mode 100644 index b0482210d..000000000 --- a/data/hfopenllm_v2/Norquinal/Echo/a954242f-41a6-49d7-a71d-3bfe940cdb92.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Echo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Echo", - "id": "Norquinal/Echo", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3158 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.353 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Norquinal/Foxtrot/6d1c518f-3f42-49eb-9208-b30e27e7e87e.json b/data/hfopenllm_v2/Norquinal/Foxtrot/6d1c518f-3f42-49eb-9208-b30e27e7e87e.json deleted file mode 100644 index eb075ffce..000000000 --- a/data/hfopenllm_v2/Norquinal/Foxtrot/6d1c518f-3f42-49eb-9208-b30e27e7e87e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Foxtrot/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Foxtrot", - "id": "Norquinal/Foxtrot", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3558 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.305 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Norquinal/Golf/87931db7-42a4-48df-b5a5-8bd934061dbe.json b/data/hfopenllm_v2/Norquinal/Golf/87931db7-42a4-48df-b5a5-8bd934061dbe.json deleted file mode 100644 index a262d2157..000000000 --- a/data/hfopenllm_v2/Norquinal/Golf/87931db7-42a4-48df-b5a5-8bd934061dbe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Golf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Golf", - "id": "Norquinal/Golf", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3534 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3533 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3056 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Norquinal/Hotel/54088dbc-04cc-4b35-b4e1-e495b7cfd47f.json b/data/hfopenllm_v2/Norquinal/Hotel/54088dbc-04cc-4b35-b4e1-e495b7cfd47f.json deleted file mode 100644 index 21264522c..000000000 --- a/data/hfopenllm_v2/Norquinal/Hotel/54088dbc-04cc-4b35-b4e1-e495b7cfd47f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Hotel/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hotel", - "id": "Norquinal/Hotel", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3215 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NotASI/FineTome-Llama3.2-1B-0929/7129efad-8ab2-4f7a-b6ed-055989b3e131.json b/data/hfopenllm_v2/NotASI/FineTome-Llama3.2-1B-0929/7129efad-8ab2-4f7a-b6ed-055989b3e131.json deleted file mode 100644 index 0d3997068..000000000 --- a/data/hfopenllm_v2/NotASI/FineTome-Llama3.2-1B-0929/7129efad-8ab2-4f7a-b6ed-055989b3e131.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NotASI_FineTome-Llama3.2-1B-0929/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineTome-Llama3.2-1B-0929", - "id": "NotASI/FineTome-Llama3.2-1B-0929", - "developer": "NotASI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3246 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1429 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NotASI/FineTome-Llama3.2-3B-1002/cfc6f85f-e4b6-4164-b7eb-4efb888e1ba5.json b/data/hfopenllm_v2/NotASI/FineTome-Llama3.2-3B-1002/cfc6f85f-e4b6-4164-b7eb-4efb888e1ba5.json deleted file mode 100644 index acaee36c4..000000000 --- a/data/hfopenllm_v2/NotASI/FineTome-Llama3.2-3B-1002/cfc6f85f-e4b6-4164-b7eb-4efb888e1ba5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NotASI_FineTome-Llama3.2-3B-1002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineTome-Llama3.2-3B-1002", - "id": "NotASI/FineTome-Llama3.2-3B-1002", - "developer": "NotASI", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5474 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3685 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2437 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-1B-1007/0f053a45-cd79-4e51-9b4c-ae5c51006c17.json b/data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-1B-1007/0f053a45-cd79-4e51-9b4c-ae5c51006c17.json deleted file mode 100644 index eb26e5748..000000000 --- a/data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-1B-1007/0f053a45-cd79-4e51-9b4c-ae5c51006c17.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NotASI_FineTome-v1.5-Llama3.2-1B-1007/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineTome-v1.5-Llama3.2-1B-1007", - "id": "NotASI/FineTome-v1.5-Llama3.2-1B-1007", - "developer": "NotASI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": 
{ - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1427 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-3B-1007/d8002b35-1454-4635-a31e-b419c7000b53.json b/data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-3B-1007/d8002b35-1454-4635-a31e-b419c7000b53.json deleted file mode 100644 index d99453b0a..000000000 --- a/data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-3B-1007/d8002b35-1454-4635-a31e-b419c7000b53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NotASI_FineTome-v1.5-Llama3.2-3B-1007/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineTome-v1.5-Llama3.2-3B-1007", - "id": "NotASI/FineTome-v1.5-Llama3.2-3B-1007", - "developer": "NotASI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5508 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4312 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3645 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2448 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/DeepHermes-3-Mistral-24B-Preview/4c08530e-d529-49a1-a3fe-2351c422981a.json b/data/hfopenllm_v2/NousResearch/DeepHermes-3-Mistral-24B-Preview/4c08530e-d529-49a1-a3fe-2351c422981a.json deleted file mode 100644 index df49d3613..000000000 --- a/data/hfopenllm_v2/NousResearch/DeepHermes-3-Mistral-24B-Preview/4c08530e-d529-49a1-a3fe-2351c422981a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_DeepHermes-3-Mistral-24B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepHermes-3-Mistral-24B-Preview", - "id": "NousResearch/DeepHermes-3-Mistral-24B-Preview", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4503 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Llama-3-8B/d16879dc-7ed7-49c4-aca6-4c9cd3b3a350.json b/data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Llama-3-8B/d16879dc-7ed7-49c4-aca6-4c9cd3b3a350.json deleted file mode 100644 index bbff9c46a..000000000 --- a/data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Llama-3-8B/d16879dc-7ed7-49c4-aca6-4c9cd3b3a350.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Hermes-2-Pro-Llama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-2-Pro-Llama-3-8B", - "id": "NousResearch/Hermes-2-Pro-Llama-3-8B", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5362 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3052 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Mistral-7B/70656b13-e0a2-4ef4-af43-0d9995d57af6.json b/data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Mistral-7B/70656b13-e0a2-4ef4-af43-0d9995d57af6.json deleted file mode 100644 index ec8e4de73..000000000 --- a/data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Mistral-7B/70656b13-e0a2-4ef4-af43-0d9995d57af6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Hermes-2-Pro-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-2-Pro-Mistral-7B", - "id": "NousResearch/Hermes-2-Pro-Mistral-7B", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5668 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4995 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Hermes-2-Theta-Llama-3-8B/6544f1ca-02a6-4e58-98f0-e19cc6082682.json b/data/hfopenllm_v2/NousResearch/Hermes-2-Theta-Llama-3-8B/6544f1ca-02a6-4e58-98f0-e19cc6082682.json deleted file mode 100644 index b3a330363..000000000 --- a/data/hfopenllm_v2/NousResearch/Hermes-2-Theta-Llama-3-8B/6544f1ca-02a6-4e58-98f0-e19cc6082682.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Hermes-2-Theta-Llama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-2-Theta-Llama-3-8B", - "id": "NousResearch/Hermes-2-Theta-Llama-3-8B", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 
5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3949 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-70B/5cd3796f-fb31-49c1-a974-019c5c5b20ae.json b/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-70B/5cd3796f-fb31-49c1-a974-019c5c5b20ae.json deleted file mode 100644 index 75935fc92..000000000 --- a/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-70B/5cd3796f-fb31-49c1-a974-019c5c5b20ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Hermes-3-Llama-3.1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-3-Llama-3.1-70B", - "id": "NousResearch/Hermes-3-Llama-3.1-70B", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7661 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6756 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4949 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-8B/49eff9ad-90c9-43b1-a1f5-cf371ac4b39b.json b/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-8B/49eff9ad-90c9-43b1-a1f5-cf371ac4b39b.json deleted file mode 100644 index 0ebe2c0ab..000000000 --- a/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-8B/49eff9ad-90c9-43b1-a1f5-cf371ac4b39b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Hermes-3-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-3-Llama-3.1-8B", - "id": "NousResearch/Hermes-3-Llama-3.1-8B", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.617 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.2-3B/59720f7e-7e09-483f-8332-8dc7aa19ae78.json b/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.2-3B/59720f7e-7e09-483f-8332-8dc7aa19ae78.json deleted file mode 100644 index 248fd1575..000000000 --- a/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.2-3B/59720f7e-7e09-483f-8332-8dc7aa19ae78.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Hermes-3-Llama-3.2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-3-Llama-3.2-3B", - "id": "NousResearch/Hermes-3-Llama-3.2-3B", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4352 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2544 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/a3a89e4a-0589-4776-a1da-227552482e94.json b/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/a3a89e4a-0589-4776-a1da-227552482e94.json deleted file mode 100644 index c982b95bd..000000000 --- a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/a3a89e4a-0589-4776-a1da-227552482e94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Nous-Hermes-2-Mistral-7B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nous-Hermes-2-Mistral-7B-DPO", - "id": "NousResearch/Nous-Hermes-2-Mistral-7B-DPO", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4853 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3015 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/b3c04d1f-80e3-4d86-9779-c5e4bbce6f35.json b/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/b3c04d1f-80e3-4d86-9779-c5e4bbce6f35.json deleted file mode 100644 index 44ac522b4..000000000 --- a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/b3c04d1f-80e3-4d86-9779-c5e4bbce6f35.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nous-Hermes-2-Mixtral-8x7B-DPO", - "id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5897 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT/448fda35-bfdc-42ae-90f9-d44383e0a454.json b/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT/448fda35-bfdc-42ae-90f9-d44383e0a454.json deleted file mode 100644 index d14264647..000000000 --- a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT/448fda35-bfdc-42ae-90f9-d44383e0a454.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Nous-Hermes-2-Mixtral-8x7B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nous-Hermes-2-Mixtral-8x7B-SFT", - "id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5731 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5058 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4214 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-SOLAR-10.7B/0d97542e-82b6-4f27-9822-62b67e7690c2.json b/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-SOLAR-10.7B/0d97542e-82b6-4f27-9822-62b67e7690c2.json deleted file mode 100644 index d0f799596..000000000 --- a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-SOLAR-10.7B/0d97542e-82b6-4f27-9822-62b67e7690c2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Nous-Hermes-2-SOLAR-10.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nous-Hermes-2-SOLAR-10.7B", - "id": "NousResearch/Nous-Hermes-2-SOLAR-10.7B", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5279 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5414 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3458 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/NousResearch/Nous-Hermes-llama-2-7b/2725bd69-839d-4427-8e05-0e289fff70de.json b/data/hfopenllm_v2/NousResearch/Nous-Hermes-llama-2-7b/2725bd69-839d-4427-8e05-0e289fff70de.json deleted file mode 100644 index 07b68eec5..000000000 --- a/data/hfopenllm_v2/NousResearch/Nous-Hermes-llama-2-7b/2725bd69-839d-4427-8e05-0e289fff70de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Nous-Hermes-llama-2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nous-Hermes-llama-2-7b", - "id": "NousResearch/Nous-Hermes-llama-2-7b", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3824 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4257 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.194 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-13b-128k/adb71488-adb8-4848-bf1d-aecd04cb6718.json b/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-13b-128k/adb71488-adb8-4848-bf1d-aecd04cb6718.json deleted file mode 
100644 index d612c4d8c..000000000 --- a/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-13b-128k/adb71488-adb8-4848-bf1d-aecd04cb6718.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Yarn-Llama-2-13b-128k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yarn-Llama-2-13b-128k", - "id": "NousResearch/Yarn-Llama-2-13b-128k", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1655 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3827 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3458 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.232 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-128k/c7736577-c4c3-4233-9308-a4bb9b2dbb89.json b/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-128k/c7736577-c4c3-4233-9308-a4bb9b2dbb89.json deleted file mode 100644 index ca3a9ae2e..000000000 --- a/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-128k/c7736577-c4c3-4233-9308-a4bb9b2dbb89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/NousResearch_Yarn-Llama-2-7b-128k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yarn-Llama-2-7b-128k", - "id": "NousResearch/Yarn-Llama-2-7b-128k", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1485 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3967 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1791 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-64k/76fe52f4-9fa5-4ccb-8c92-7bd9eb9886ee.json b/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-64k/76fe52f4-9fa5-4ccb-8c92-7bd9eb9886ee.json deleted file mode 100644 index a38d9e7b9..000000000 --- a/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-64k/76fe52f4-9fa5-4ccb-8c92-7bd9eb9886ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Yarn-Llama-2-7b-64k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yarn-Llama-2-7b-64k", - "id": "NousResearch/Yarn-Llama-2-7b-64k", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.17 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3326 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1799 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-128k/1d92e45f-c5a5-4dd6-a61f-8e0f7246117a.json b/data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-128k/1d92e45f-c5a5-4dd6-a61f-8e0f7246117a.json deleted file mode 100644 index 8725d08a6..000000000 --- a/data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-128k/1d92e45f-c5a5-4dd6-a61f-8e0f7246117a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Yarn-Mistral-7b-128k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yarn-Mistral-7b-128k", - "id": "NousResearch/Yarn-Mistral-7b-128k", - "developer": "NousResearch", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-64k/5e1513f1-4375-4380-85fa-b96a419c013b.json b/data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-64k/5e1513f1-4375-4380-85fa-b96a419c013b.json deleted file mode 100644 index 6002e3e56..000000000 --- a/data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-64k/5e1513f1-4375-4380-85fa-b96a419c013b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Yarn-Mistral-7b-64k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yarn-Mistral-7b-64k", - "id": "NousResearch/Yarn-Mistral-7b-64k", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2914 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-32k/fadbf3b2-283a-4f8e-9acf-463d75924b97.json b/data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-32k/fadbf3b2-283a-4f8e-9acf-463d75924b97.json deleted file mode 100644 index a3f6cc8a8..000000000 --- a/data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-32k/fadbf3b2-283a-4f8e-9acf-463d75924b97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Yarn-Solar-10b-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yarn-Solar-10b-32k", - "id": "NousResearch/Yarn-Solar-10b-32k", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1942 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4987 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-64k/c04ffe5b-c313-4249-83bb-bbe07ad6fc69.json b/data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-64k/c04ffe5b-c313-4249-83bb-bbe07ad6fc69.json deleted file mode 100644 index 6b8acf7a6..000000000 --- a/data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-64k/c04ffe5b-c313-4249-83bb-bbe07ad6fc69.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Yarn-Solar-10b-64k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yarn-Solar-10b-64k", - "id": "NousResearch/Yarn-Solar-10b-64k", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1989 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4922 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4014 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3148 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/ASTAROTH-3.2-1B/a9aa164e-386b-4987-9f49-2dde64ade45c.json b/data/hfopenllm_v2/Novaciano/ASTAROTH-3.2-1B/a9aa164e-386b-4987-9f49-2dde64ade45c.json deleted file mode 100644 index da6642839..000000000 --- a/data/hfopenllm_v2/Novaciano/ASTAROTH-3.2-1B/a9aa164e-386b-4987-9f49-2dde64ade45c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_ASTAROTH-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ASTAROTH-3.2-1B", - "id": "Novaciano/ASTAROTH-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5613 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3543 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1909 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/BLAST_PROCESSING-3.2-1B/e4c1b3ef-e1db-4eca-b818-f3b1680cc5f0.json b/data/hfopenllm_v2/Novaciano/BLAST_PROCESSING-3.2-1B/e4c1b3ef-e1db-4eca-b818-f3b1680cc5f0.json deleted file mode 100644 index 3f01feccd..000000000 --- a/data/hfopenllm_v2/Novaciano/BLAST_PROCESSING-3.2-1B/e4c1b3ef-e1db-4eca-b818-f3b1680cc5f0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_BLAST_PROCESSING-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BLAST_PROCESSING-3.2-1B", - "id": "Novaciano/BLAST_PROCESSING-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1941 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/Cerberus-3.2-1B/1ab95edc-ea3c-4d3f-9f59-dc7f7468adb9.json b/data/hfopenllm_v2/Novaciano/Cerberus-3.2-1B/1ab95edc-ea3c-4d3f-9f59-dc7f7468adb9.json deleted file mode 100644 index 850ea6a63..000000000 --- a/data/hfopenllm_v2/Novaciano/Cerberus-3.2-1B/1ab95edc-ea3c-4d3f-9f59-dc7f7468adb9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_Cerberus-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cerberus-3.2-1B", - "id": "Novaciano/Cerberus-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4165 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1663 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/Cultist-3.2-1B/80a81bbc-6edf-48b9-afb7-e4e0a03753d8.json b/data/hfopenllm_v2/Novaciano/Cultist-3.2-1B/80a81bbc-6edf-48b9-afb7-e4e0a03753d8.json deleted file mode 100644 index 25acd6b95..000000000 --- a/data/hfopenllm_v2/Novaciano/Cultist-3.2-1B/80a81bbc-6edf-48b9-afb7-e4e0a03753d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_Cultist-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cultist-3.2-1B", - "id": "Novaciano/Cultist-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1714 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/FuseChat-3.2-1B-GRPO_Creative_RP/afb24bf8-3c47-4278-9b84-19b05017745b.json b/data/hfopenllm_v2/Novaciano/FuseChat-3.2-1B-GRPO_Creative_RP/afb24bf8-3c47-4278-9b84-19b05017745b.json deleted file mode 100644 index cfdd2f4d5..000000000 --- a/data/hfopenllm_v2/Novaciano/FuseChat-3.2-1B-GRPO_Creative_RP/afb24bf8-3c47-4278-9b84-19b05017745b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_FuseChat-3.2-1B-GRPO_Creative_RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseChat-3.2-1B-GRPO_Creative_RP", - "id": "Novaciano/FuseChat-3.2-1B-GRPO_Creative_RP", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5598 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3329 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1735 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/Fusetrix-3.2-1B-GRPO_RP_Creative/4f8cda4d-959b-41ab-a79d-d2b35968eb89.json b/data/hfopenllm_v2/Novaciano/Fusetrix-3.2-1B-GRPO_RP_Creative/4f8cda4d-959b-41ab-a79d-d2b35968eb89.json deleted file mode 100644 index 7acb2bdcc..000000000 --- a/data/hfopenllm_v2/Novaciano/Fusetrix-3.2-1B-GRPO_RP_Creative/4f8cda4d-959b-41ab-a79d-d2b35968eb89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_Fusetrix-3.2-1B-GRPO_RP_Creative/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fusetrix-3.2-1B-GRPO_RP_Creative", - "id": "Novaciano/Fusetrix-3.2-1B-GRPO_RP_Creative", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1758 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP/2818aa8c-5c73-4de9-bcbe-fd8f68e8bc6b.json b/data/hfopenllm_v2/Novaciano/Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP/2818aa8c-5c73-4de9-bcbe-fd8f68e8bc6b.json deleted file mode 100644 index 4b1fc2436..000000000 --- a/data/hfopenllm_v2/Novaciano/Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP/2818aa8c-5c73-4de9-bcbe-fd8f68e8bc6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP", - "id": "Novaciano/Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3183 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1823 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/HarmfulProject-3.2-1B/6a683ead-0f3e-449b-9ae1-8afc9f1ab33d.json b/data/hfopenllm_v2/Novaciano/HarmfulProject-3.2-1B/6a683ead-0f3e-449b-9ae1-8afc9f1ab33d.json deleted file mode 100644 index 2b69d3c05..000000000 --- a/data/hfopenllm_v2/Novaciano/HarmfulProject-3.2-1B/6a683ead-0f3e-449b-9ae1-8afc9f1ab33d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_HarmfulProject-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HarmfulProject-3.2-1B", - "id": "Novaciano/HarmfulProject-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3274 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3419 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1823 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/LEWD-Mental-Cultist-3.2-1B/38cb02a8-862d-40e1-922a-e65f537df87e.json b/data/hfopenllm_v2/Novaciano/LEWD-Mental-Cultist-3.2-1B/38cb02a8-862d-40e1-922a-e65f537df87e.json deleted file mode 100644 index c5ba6cc4d..000000000 --- a/data/hfopenllm_v2/Novaciano/LEWD-Mental-Cultist-3.2-1B/38cb02a8-862d-40e1-922a-e65f537df87e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_LEWD-Mental-Cultist-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LEWD-Mental-Cultist-3.2-1B", - "id": "Novaciano/LEWD-Mental-Cultist-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5309 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3513 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3223 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1769 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/La_Mejor_Mezcla-3.2-1B/f816e2a7-2629-4abe-9ed0-3d1299e95194.json 
b/data/hfopenllm_v2/Novaciano/La_Mejor_Mezcla-3.2-1B/f816e2a7-2629-4abe-9ed0-3d1299e95194.json deleted file mode 100644 index 6d6061a90..000000000 --- a/data/hfopenllm_v2/Novaciano/La_Mejor_Mezcla-3.2-1B/f816e2a7-2629-4abe-9ed0-3d1299e95194.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_La_Mejor_Mezcla-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "La_Mejor_Mezcla-3.2-1B", - "id": "Novaciano/La_Mejor_Mezcla-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.551 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1829 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/Sigil-Of-Satan-3.2-1B/286fae5b-544a-4033-9092-d633fc80f47b.json b/data/hfopenllm_v2/Novaciano/Sigil-Of-Satan-3.2-1B/286fae5b-544a-4033-9092-d633fc80f47b.json deleted file mode 100644 index b801b00b1..000000000 --- 
a/data/hfopenllm_v2/Novaciano/Sigil-Of-Satan-3.2-1B/286fae5b-544a-4033-9092-d633fc80f47b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_Sigil-Of-Satan-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sigil-Of-Satan-3.2-1B", - "id": "Novaciano/Sigil-Of-Satan-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5494 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3546 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3276 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1855 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NucleusAI/nucleus-22B-token-500B/93477bf6-ea00-418b-8a2f-975a9554263e.json b/data/hfopenllm_v2/NucleusAI/nucleus-22B-token-500B/93477bf6-ea00-418b-8a2f-975a9554263e.json deleted file mode 100644 index 9e4f3e608..000000000 --- a/data/hfopenllm_v2/NucleusAI/nucleus-22B-token-500B/93477bf6-ea00-418b-8a2f-975a9554263e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/NucleusAI_nucleus-22B-token-500B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nucleus-22B-token-500B", - "id": "NucleusAI/nucleus-22B-token-500B", - "developer": "NucleusAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.828 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3511 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1162 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NyxKrage/Microsoft_Phi-4/3d7c6576-f99c-4bb3-94fa-4f713e2898f6.json b/data/hfopenllm_v2/NyxKrage/Microsoft_Phi-4/3d7c6576-f99c-4bb3-94fa-4f713e2898f6.json deleted file mode 100644 index a36e190d6..000000000 --- a/data/hfopenllm_v2/NyxKrage/Microsoft_Phi-4/3d7c6576-f99c-4bb3-94fa-4f713e2898f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NyxKrage_Microsoft_Phi-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Microsoft_Phi-4", - "id": "NyxKrage/Microsoft_Phi-4", - "developer": "NyxKrage", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6691 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2991 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OEvortex/Emotional-llama-8B/d1e9a242-941f-4461-b75b-7043c2c01ef7.json b/data/hfopenllm_v2/OEvortex/Emotional-llama-8B/d1e9a242-941f-4461-b75b-7043c2c01ef7.json deleted file mode 100644 index 66b1bbf40..000000000 --- a/data/hfopenllm_v2/OEvortex/Emotional-llama-8B/d1e9a242-941f-4461-b75b-7043c2c01ef7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OEvortex_Emotional-llama-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Emotional-llama-8B", - "id": "OEvortex/Emotional-llama-8B", - "developer": "OEvortex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4839 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3659 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OEvortex/HelpingAI-15B/e39661af-ad93-41d7-8892-1230064f1a1c.json b/data/hfopenllm_v2/OEvortex/HelpingAI-15B/e39661af-ad93-41d7-8892-1230064f1a1c.json deleted file mode 100644 index 58003624d..000000000 --- a/data/hfopenllm_v2/OEvortex/HelpingAI-15B/e39661af-ad93-41d7-8892-1230064f1a1c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OEvortex_HelpingAI-15B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HelpingAI-15B", - "id": "OEvortex/HelpingAI-15B", - "developer": "OEvortex", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 15.323 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OEvortex/HelpingAI-3B-reloaded/595b61b2-5220-48f6-91a0-3aa0d37c63d8.json b/data/hfopenllm_v2/OEvortex/HelpingAI-3B-reloaded/595b61b2-5220-48f6-91a0-3aa0d37c63d8.json deleted file mode 100644 index c50a519bb..000000000 --- a/data/hfopenllm_v2/OEvortex/HelpingAI-3B-reloaded/595b61b2-5220-48f6-91a0-3aa0d37c63d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OEvortex_HelpingAI-3B-reloaded/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HelpingAI-3B-reloaded", - "id": "OEvortex/HelpingAI-3B-reloaded", - "developer": "OEvortex", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.81 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2595 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OEvortex/HelpingAI2-9B/3173263e-2a42-4e8d-956e-8175ef464e76.json b/data/hfopenllm_v2/OEvortex/HelpingAI2-9B/3173263e-2a42-4e8d-956e-8175ef464e76.json deleted file mode 100644 index daf836ff0..000000000 --- a/data/hfopenllm_v2/OEvortex/HelpingAI2-9B/3173263e-2a42-4e8d-956e-8175ef464e76.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OEvortex_HelpingAI2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HelpingAI2-9B", - "id": "OEvortex/HelpingAI2-9B", - "developer": "OEvortex", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.903 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4845 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OEvortex/HelpingAI2.5-10B/f77f8291-1573-4fb6-a984-1cc099c09621.json b/data/hfopenllm_v2/OEvortex/HelpingAI2.5-10B/f77f8291-1573-4fb6-a984-1cc099c09621.json deleted file mode 100644 index e0efa3c20..000000000 --- a/data/hfopenllm_v2/OEvortex/HelpingAI2.5-10B/f77f8291-1573-4fb6-a984-1cc099c09621.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OEvortex_HelpingAI2.5-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HelpingAI2.5-10B", - "id": "OEvortex/HelpingAI2.5-10B", - "developer": "OEvortex", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.211 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4496 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2575 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OliveiraJLT/Sagui-7B-Instruct-v0.1/c4681e14-513c-4e5e-af8c-88ca11849176.json b/data/hfopenllm_v2/OliveiraJLT/Sagui-7B-Instruct-v0.1/c4681e14-513c-4e5e-af8c-88ca11849176.json deleted file mode 100644 index 1f0b31152..000000000 --- a/data/hfopenllm_v2/OliveiraJLT/Sagui-7B-Instruct-v0.1/c4681e14-513c-4e5e-af8c-88ca11849176.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OliveiraJLT_Sagui-7B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sagui-7B-Instruct-v0.1", - "id": "OliveiraJLT/Sagui-7B-Instruct-v0.1", - "developer": "OliveiraJLT", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2892 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1485 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Omkar1102/code-yi/0c220edd-2563-4fec-99a4-ef8c210ca5ce.json b/data/hfopenllm_v2/Omkar1102/code-yi/0c220edd-2563-4fec-99a4-ef8c210ca5ce.json deleted file mode 100644 index 032bf1e35..000000000 --- a/data/hfopenllm_v2/Omkar1102/code-yi/0c220edd-2563-4fec-99a4-ef8c210ca5ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Omkar1102_code-yi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "code-yi", - "id": "Omkar1102/code-yi", - "developer": "Omkar1102", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.084 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2254 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": 
"MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Omkar1102/code-yi/bd7ef5a7-aa75-4eb4-8860-aec63f8bf9d1.json b/data/hfopenllm_v2/Omkar1102/code-yi/bd7ef5a7-aa75-4eb4-8860-aec63f8bf9d1.json deleted file mode 100644 index aa34a77d8..000000000 --- a/data/hfopenllm_v2/Omkar1102/code-yi/bd7ef5a7-aa75-4eb4-8860-aec63f8bf9d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Omkar1102_code-yi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "code-yi", - "id": "Omkar1102/code-yi", - "developer": "Omkar1102", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.084 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2148 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3802 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OmnicromsBrain/NeuralStar_FusionWriter_4x7b/85c20522-03c0-4dac-a1c8-2945e4bf0e0e.json b/data/hfopenllm_v2/OmnicromsBrain/NeuralStar_FusionWriter_4x7b/85c20522-03c0-4dac-a1c8-2945e4bf0e0e.json deleted file mode 100644 index 0f861c536..000000000 --- a/data/hfopenllm_v2/OmnicromsBrain/NeuralStar_FusionWriter_4x7b/85c20522-03c0-4dac-a1c8-2945e4bf0e0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OmnicromsBrain_NeuralStar_FusionWriter_4x7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralStar_FusionWriter_4x7b", - "id": "OmnicromsBrain/NeuralStar_FusionWriter_4x7b", - "developer": "OmnicromsBrain", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4776 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2606 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OnlyCheeini/greesychat-turbo/f180fddd-077f-43f9-b2d9-38c5f33be44d.json b/data/hfopenllm_v2/OnlyCheeini/greesychat-turbo/f180fddd-077f-43f9-b2d9-38c5f33be44d.json deleted file mode 100644 index f63187b2d..000000000 --- a/data/hfopenllm_v2/OnlyCheeini/greesychat-turbo/f180fddd-077f-43f9-b2d9-38c5f33be44d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OnlyCheeini_greesychat-turbo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "greesychat-turbo", - "id": "OnlyCheeini/greesychat-turbo", - "developer": "OnlyCheeini", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0233 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1138 - } - } - ] -} \ 
No newline at end of file diff --git a/data/hfopenllm_v2/Open-Orca/Mistral-7B-OpenOrca/ef384329-8406-4767-ac1a-3eba3131f726.json b/data/hfopenllm_v2/Open-Orca/Mistral-7B-OpenOrca/ef384329-8406-4767-ac1a-3eba3131f726.json deleted file mode 100644 index db0c0f414..000000000 --- a/data/hfopenllm_v2/Open-Orca/Mistral-7B-OpenOrca/ef384329-8406-4767-ac1a-3eba3131f726.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Open-Orca_Mistral-7B-OpenOrca/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-OpenOrca", - "id": "Open-Orca/Mistral-7B-OpenOrca", - "developer": "Open-Orca", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3858 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2653 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenAssistant/oasst-sft-1-pythia-12b/2ddeae27-77d3-413c-a6e1-9de0f3980c4e.json b/data/hfopenllm_v2/OpenAssistant/oasst-sft-1-pythia-12b/2ddeae27-77d3-413c-a6e1-9de0f3980c4e.json deleted file 
mode 100644 index db24eb0bb..000000000 --- a/data/hfopenllm_v2/OpenAssistant/oasst-sft-1-pythia-12b/2ddeae27-77d3-413c-a6e1-9de0f3980c4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenAssistant_oasst-sft-1-pythia-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "oasst-sft-1-pythia-12b", - "id": "OpenAssistant/oasst-sft-1-pythia-12b", - "developer": "OpenAssistant", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 12.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1055 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3147 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3327 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-falcon3-10b-v24.2-131k/38b2dbbe-be86-4ef0-a39b-89841f662141.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-falcon3-10b-v24.2-131k/38b2dbbe-be86-4ef0-a39b-89841f662141.json deleted file mode 100644 index 133f23b3d..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-falcon3-10b-v24.2-131k/38b2dbbe-be86-4ef0-a39b-89841f662141.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-falcon3-10b-v24.2-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-falcon3-10b-v24.2-131k", - "id": "OpenBuddy/openbuddy-falcon3-10b-v24.2-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.34 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5086 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6004 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-70b-v21.2-32k/999a8091-22bd-4c08-bee1-772202e7edde.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-70b-v21.2-32k/999a8091-22bd-4c08-bee1-772202e7edde.json deleted file mode 100644 index e76e849ca..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-70b-v21.2-32k/999a8091-22bd-4c08-bee1-772202e7edde.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3-70b-v21.2-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3-70b-v21.2-32k", - "id": "OpenBuddy/openbuddy-llama3-70b-v21.2-32k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.458 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4832 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.1-8k/fda91d98-d259-430c-929b-78852cab64ec.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.1-8k/fda91d98-d259-430c-929b-78852cab64ec.json deleted file mode 100644 index 38eedcf18..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.1-8k/fda91d98-d259-430c-929b-78852cab64ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3-8b-v21.1-8k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "openbuddy-llama3-8b-v21.1-8k", - "id": "OpenBuddy/openbuddy-llama3-8b-v21.1-8k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.557 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4788 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2955 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.2-32k/535bfa4f-ab63-4832-9f17-7b245ff2b2af.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.2-32k/535bfa4f-ab63-4832-9f17-7b245ff2b2af.json deleted file mode 100644 index abbc1f40c..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.2-32k/535bfa4f-ab63-4832-9f17-7b245ff2b2af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3-8b-v21.2-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3-8b-v21.2-32k", - "id": "OpenBuddy/openbuddy-llama3-8b-v21.2-32k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6192 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4856 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-70b-v22.1-131k/681a6cc5-5519-4b13-8b50-93adcab4a3f7.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-70b-v22.1-131k/681a6cc5-5519-4b13-8b50-93adcab4a3f7.json deleted file mode 100644 index e126a1c30..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-70b-v22.1-131k/681a6cc5-5519-4b13-8b50-93adcab4a3f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3.1-70b-v22.1-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3.1-70b-v22.1-131k", - "id": "OpenBuddy/openbuddy-llama3.1-70b-v22.1-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7333 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6698 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5304 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.2-131k/141dd12c-6901-4a96-a051-f35647ddcc73.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.2-131k/141dd12c-6901-4a96-a051-f35647ddcc73.json deleted file mode 100644 index c11010c3b..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.2-131k/141dd12c-6901-4a96-a051-f35647ddcc73.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3.1-8b-v22.2-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3.1-8b-v22.2-131k", - "id": "OpenBuddy/openbuddy-llama3.1-8b-v22.2-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6657 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.3-131k/5b095779-aacc-41f3-9a3f-83f64a1c0d4c.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.3-131k/5b095779-aacc-41f3-9a3f-83f64a1c0d4c.json deleted file mode 100644 index e33abbdfc..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.3-131k/5b095779-aacc-41f3-9a3f-83f64a1c0d4c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3.1-8b-v22.3-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3.1-8b-v22.3-131k", - "id": "OpenBuddy/openbuddy-llama3.1-8b-v22.3-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.5997 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3277 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-1b-v23.1-131k/7a88c95a-b253-4f36-8fde-1b0158bbf0b6.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-1b-v23.1-131k/7a88c95a-b253-4f36-8fde-1b0158bbf0b6.json deleted file mode 100644 index 55a086e2b..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-1b-v23.1-131k/7a88c95a-b253-4f36-8fde-1b0158bbf0b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3.2-1b-v23.1-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3.2-1b-v23.1-131k", - "id": "OpenBuddy/openbuddy-llama3.2-1b-v23.1-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.359 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3267 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-3b-v23.2-131k/7938a00e-4e11-4223-a900-fa53df168ab7.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-3b-v23.2-131k/7938a00e-4e11-4223-a900-fa53df168ab7.json deleted file mode 100644 index c20150989..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-3b-v23.2-131k/7938a00e-4e11-4223-a900-fa53df168ab7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3.2-3b-v23.2-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3.2-3b-v23.2-131k", - "id": "OpenBuddy/openbuddy-llama3.2-3b-v23.2-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2479 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.3-70b-v24.1-131k/8f966b4e-1baf-445f-9f10-4ba6b47aaf9b.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.3-70b-v24.1-131k/8f966b4e-1baf-445f-9f10-4ba6b47aaf9b.json deleted file mode 100644 index 97d54e843..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.3-70b-v24.1-131k/8f966b4e-1baf-445f-9f10-4ba6b47aaf9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3.3-70b-v24.1-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3.3-70b-v24.1-131k", - "id": "OpenBuddy/openbuddy-llama3.3-70b-v24.1-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8121 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6858 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4411 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5327 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/a334d998-21a5-4108-96e3-9935507a9f8f.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/a334d998-21a5-4108-96e3-9935507a9f8f.json deleted file mode 100644 index 25e129f68..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/a334d998-21a5-4108-96e3-9935507a9f8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-mixtral-7bx8-v18.1-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-mixtral-7bx8-v18.1-32k", - "id": "OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.741 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5493 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4656 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.1-131k/941e27c6-81da-4ce1-b1c8-544c1426cd11.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.1-131k/941e27c6-81da-4ce1-b1c8-544c1426cd11.json deleted file mode 100644 index 83c193063..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.1-131k/941e27c6-81da-4ce1-b1c8-544c1426cd11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-nemotron-70b-v23.1-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-nemotron-70b-v23.1-131k", - "id": "OpenBuddy/openbuddy-nemotron-70b-v23.1-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6749 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5175 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.2-131k/e409a374-685b-482d-82e4-2436dca37309.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.2-131k/e409a374-685b-482d-82e4-2436dca37309.json deleted file mode 100644 index d6fb1a342..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.2-131k/e409a374-685b-482d-82e4-2436dca37309.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-nemotron-70b-v23.2-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-nemotron-70b-v23.2-131k", - "id": "OpenBuddy/openbuddy-nemotron-70b-v23.2-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7227 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6705 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4696 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.1-200k/84713625-97b6-4fad-982d-41b5c500d73a.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.1-200k/84713625-97b6-4fad-982d-41b5c500d73a.json deleted file mode 100644 index d8cf89afa..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.1-200k/84713625-97b6-4fad-982d-41b5c500d73a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-qwen2.5llamaify-14b-v23.1-200k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-qwen2.5llamaify-14b-v23.1-200k", - "id": "OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.1-200k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6309 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6013 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2538 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.3-200k/b7edd9ab-a018-4b2f-9b01-b56cbe98abda.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.3-200k/b7edd9ab-a018-4b2f-9b01-b56cbe98abda.json deleted file mode 100644 index e24023046..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.3-200k/b7edd9ab-a018-4b2f-9b01-b56cbe98abda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-qwen2.5llamaify-14b-v23.3-200k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-qwen2.5llamaify-14b-v23.3-200k", - "id": "OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.3-200k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6131 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6081 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2311 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4795 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-7b-v23.1-200k/ec896115-21ef-4337-9fdd-32a04c574a05.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-7b-v23.1-200k/ec896115-21ef-4337-9fdd-32a04c574a05.json deleted file mode 100644 index 56c75d371..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-7b-v23.1-200k/ec896115-21ef-4337-9fdd-32a04c574a05.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-qwen2.5llamaify-7b-v23.1-200k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-qwen2.5llamaify-7b-v23.1-200k", - "id": "OpenBuddy/openbuddy-qwen2.5llamaify-7b-v23.1-200k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.615 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3948 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.1-200k/d8e5f49b-7bf3-41d4-a91e-c566219609f6.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.1-200k/d8e5f49b-7bf3-41d4-a91e-c566219609f6.json deleted file mode 100644 index 45a21bb0f..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.1-200k/d8e5f49b-7bf3-41d4-a91e-c566219609f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-qwq-32b-v24.1-200k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-qwq-32b-v24.1-200k", - "id": "OpenBuddy/openbuddy-qwq-32b-v24.1-200k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5937 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6798 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4849 - 
} - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.2-200k/ce1a92a3-6bec-410f-ab42-c567c5d23856.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.2-200k/ce1a92a3-6bec-410f-ab42-c567c5d23856.json deleted file mode 100644 index 74da68126..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.2-200k/ce1a92a3-6bec-410f-ab42-c567c5d23856.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-qwq-32b-v24.2-200k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-qwq-32b-v24.2-200k", - "id": "OpenBuddy/openbuddy-qwq-32b-v24.2-200k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.597 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6772 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4718 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5446 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k/0a125470-b50f-4ca0-90dc-1f6b69c3ccd4.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k/0a125470-b50f-4ca0-90dc-1f6b69c3ccd4.json deleted file mode 100644 index ee9bc7580..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k/0a125470-b50f-4ca0-90dc-1f6b69c3ccd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-yi1.5-34b-v21.3-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-yi1.5-34b-v21.3-32k", - "id": "OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.407 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6163 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4439 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-14b-v22.3-32k/aeee0165-ac7e-4da6-8102-ba60f43587de.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-14b-v22.3-32k/aeee0165-ac7e-4da6-8102-ba60f43587de.json deleted file mode 100644 index fe9fb2d74..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-14b-v22.3-32k/aeee0165-ac7e-4da6-8102-ba60f43587de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-zero-14b-v22.3-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-zero-14b-v22.3-32k", - "id": "OpenBuddy/openbuddy-zero-14b-v22.3-32k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.022 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3187 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-3b-v21.2-32k/b47b8666-2556-45df-ba5b-9a5e94186784.json 
b/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-3b-v21.2-32k/b47b8666-2556-45df-ba5b-9a5e94186784.json deleted file mode 100644 index 8ba48e7a5..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-3b-v21.2-32k/b47b8666-2556-45df-ba5b-9a5e94186784.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-zero-3b-v21.2-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-zero-3b-v21.2-32k", - "id": "OpenBuddy/openbuddy-zero-3b-v21.2-32k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.769 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3802 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2034 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-56b-v21.2-32k/0bde5d57-39be-4497-a2a8-d08d3c8d65f4.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-56b-v21.2-32k/0bde5d57-39be-4497-a2a8-d08d3c8d65f4.json deleted file mode 100644 index bcede16a0..000000000 --- 
a/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-56b-v21.2-32k/0bde5d57-39be-4497-a2a8-d08d3c8d65f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-zero-56b-v21.2-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-zero-56b-v21.2-32k", - "id": "OpenBuddy/openbuddy-zero-56b-v21.2-32k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 56.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5057 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6128 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1624 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4305 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenGenerativeAI/Bifrost-14B/86599961-3ec2-4837-89a4-809f1dd7226c.json b/data/hfopenllm_v2/OpenGenerativeAI/Bifrost-14B/86599961-3ec2-4837-89a4-809f1dd7226c.json deleted file mode 100644 index e930a3e87..000000000 --- a/data/hfopenllm_v2/OpenGenerativeAI/Bifrost-14B/86599961-3ec2-4837-89a4-809f1dd7226c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/OpenGenerativeAI_Bifrost-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bifrost-14B", - "id": "OpenGenerativeAI/Bifrost-14B", - "developer": "OpenGenerativeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6615 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6845 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2356 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5074 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenGenerativeAI/Bifrost/dc3ca25e-41b2-4206-afaa-7d2d10fd27a7.json b/data/hfopenllm_v2/OpenGenerativeAI/Bifrost/dc3ca25e-41b2-4206-afaa-7d2d10fd27a7.json deleted file mode 100644 index 9269d76e2..000000000 --- a/data/hfopenllm_v2/OpenGenerativeAI/Bifrost/dc3ca25e-41b2-4206-afaa-7d2d10fd27a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenGenerativeAI_Bifrost/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bifrost", - "id": "OpenGenerativeAI/Bifrost", - "developer": "OpenGenerativeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6849 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2545 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4598 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.516 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-human-data/cd77d407-3be3-4b84-8a73-34a15744de93.json b/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-human-data/cd77d407-3be3-4b84-8a73-34a15744de93.json deleted file mode 100644 index 1f47bfd03..000000000 --- a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-human-data/cd77d407-3be3-4b84-8a73-34a15744de93.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenLLM-France_Lucie-7B-Instruct-human-data/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B-Instruct-human-data", - "id": "OpenLLM-France/Lucie-7B-Instruct-human-data", - "developer": "OpenLLM-France", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2946 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3284 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3729 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-v1.1/1cd20db5-0225-4724-b1f9-7c32eae456e1.json b/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-v1.1/1cd20db5-0225-4724-b1f9-7c32eae456e1.json deleted file mode 100644 index 0a5903536..000000000 --- a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-v1.1/1cd20db5-0225-4724-b1f9-7c32eae456e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenLLM-France_Lucie-7B-Instruct-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B-Instruct-v1.1", - "id": "OpenLLM-France/Lucie-7B-Instruct-v1.1", - "developer": "OpenLLM-France", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3039 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1864 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct/dfc45dc3-51e6-454b-aee9-ea6b0714f0ca.json b/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct/dfc45dc3-51e6-454b-aee9-ea6b0714f0ca.json deleted file mode 100644 index 16a3cc3c6..000000000 --- a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct/dfc45dc3-51e6-454b-aee9-ea6b0714f0ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenLLM-France_Lucie-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B-Instruct", - "id": "OpenLLM-France/Lucie-7B-Instruct", - "developer": "OpenLLM-France", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3662 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B/3da2a408-672c-47b8-be32-61f56a15e9f3.json b/data/hfopenllm_v2/OpenLLM-France/Lucie-7B/3da2a408-672c-47b8-be32-61f56a15e9f3.json deleted file mode 100644 index be1189595..000000000 --- a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B/3da2a408-672c-47b8-be32-61f56a15e9f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenLLM-France_Lucie-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B", - "id": "OpenLLM-France/Lucie-7B", - "developer": "OpenLLM-France", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3923 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1498 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenLeecher/llama3-8b-lima/94700c3c-f18d-4f96-a794-65bcf483fca9.json b/data/hfopenllm_v2/OpenLeecher/llama3-8b-lima/94700c3c-f18d-4f96-a794-65bcf483fca9.json deleted file mode 100644 index 63e1ec99a..000000000 --- a/data/hfopenllm_v2/OpenLeecher/llama3-8b-lima/94700c3c-f18d-4f96-a794-65bcf483fca9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenLeecher_llama3-8b-lima/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3-8b-lima", - "id": "OpenLeecher/llama3-8b-lima", - "developer": "OpenLeecher", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4296 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenScholar/Llama-3.1_OpenScholar-8B/6f3481d4-076f-45bd-8564-d485109c7a63.json b/data/hfopenllm_v2/OpenScholar/Llama-3.1_OpenScholar-8B/6f3481d4-076f-45bd-8564-d485109c7a63.json deleted file mode 100644 index 38d1da240..000000000 --- a/data/hfopenllm_v2/OpenScholar/Llama-3.1_OpenScholar-8B/6f3481d4-076f-45bd-8564-d485109c7a63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenScholar_Llama-3.1_OpenScholar-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1_OpenScholar-8B", - "id": "OpenScholar/Llama-3.1_OpenScholar-8B", - "developer": "OpenScholar", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5208 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1654 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2/9f5ca3b2-747a-4fd0-b382-bf7ef503ba25.json b/data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2/9f5ca3b2-747a-4fd0-b382-bf7ef503ba25.json deleted file mode 100644 index 3461ecd6d..000000000 --- a/data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2/9f5ca3b2-747a-4fd0-b382-bf7ef503ba25.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Orenguteng_Llama-3.1-8B-Lexi-Uncensored-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Lexi-Uncensored-V2", - "id": "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2", - "developer": "Orenguteng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7792 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.1971 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored/f1932041-263a-4841-9c8b-c6cc9fa50c21.json b/data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored/f1932041-263a-4841-9c8b-c6cc9fa50c21.json deleted file mode 100644 index e8f2ea31c..000000000 --- a/data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored/f1932041-263a-4841-9c8b-c6cc9fa50c21.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Orenguteng_Llama-3.1-8B-Lexi-Uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Lexi-Uncensored", - "id": "Orenguteng/Llama-3.1-8B-Lexi-Uncensored", - "developer": "Orenguteng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7777 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5057 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1571 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Orion-zhen/Qwen2.5-7B-Instruct-Uncensored/691bef38-bc9e-4f8d-b774-9d7c62eec72b.json b/data/hfopenllm_v2/Orion-zhen/Qwen2.5-7B-Instruct-Uncensored/691bef38-bc9e-4f8d-b774-9d7c62eec72b.json deleted file mode 100644 index 1eff33056..000000000 --- a/data/hfopenllm_v2/Orion-zhen/Qwen2.5-7B-Instruct-Uncensored/691bef38-bc9e-4f8d-b774-9d7c62eec72b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Orion-zhen_Qwen2.5-7B-Instruct-Uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-Uncensored", - "id": "Orion-zhen/Qwen2.5-7B-Instruct-Uncensored", - "developer": "Orion-zhen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7204 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5474 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4773 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4427 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Orion-zhen/phi-4-abliterated/5795f693-9ebc-47c6-9d2c-185dd0d32044.json b/data/hfopenllm_v2/Orion-zhen/phi-4-abliterated/5795f693-9ebc-47c6-9d2c-185dd0d32044.json deleted file mode 100644 index 93b48af5e..000000000 --- a/data/hfopenllm_v2/Orion-zhen/phi-4-abliterated/5795f693-9ebc-47c6-9d2c-185dd0d32044.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Orion-zhen_phi-4-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-abliterated", - "id": "Orion-zhen/phi-4-abliterated", - "developer": "Orion-zhen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0576 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6698 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3021 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4044 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5006 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5292 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/P0x0/Astra-v1-12B/eb83f474-0d3d-488c-bc0f-93e5d1dfb2f3.json b/data/hfopenllm_v2/P0x0/Astra-v1-12B/eb83f474-0d3d-488c-bc0f-93e5d1dfb2f3.json deleted file mode 100644 index 0751e8835..000000000 --- a/data/hfopenllm_v2/P0x0/Astra-v1-12B/eb83f474-0d3d-488c-bc0f-93e5d1dfb2f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/P0x0_Astra-v1-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Astra-v1-12B", - "id": "P0x0/Astra-v1-12B", - "developer": "P0x0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2806 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/L3.2-Instruct-Thinking-v0.1-1B/f93b2053-11c4-4868-860f-90fbfe8288fc.json b/data/hfopenllm_v2/PJMixers-Dev/L3.2-Instruct-Thinking-v0.1-1B/f93b2053-11c4-4868-860f-90fbfe8288fc.json deleted file mode 100644 index b26fc54c0..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/L3.2-Instruct-Thinking-v0.1-1B/f93b2053-11c4-4868-860f-90fbfe8288fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers-Dev_L3.2-Instruct-Thinking-v0.1-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.2-Instruct-Thinking-v0.1-1B", - "id": "PJMixers-Dev/L3.2-Instruct-Thinking-v0.1-1B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4628 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.1483 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-Instruct-Interleaved-Zeroed-13B/8984fe95-9fd3-48ff-aa5f-18df63ecd6bb.json b/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-Instruct-Interleaved-Zeroed-13B/8984fe95-9fd3-48ff-aa5f-18df63ecd6bb.json deleted file mode 100644 index a3d8dd0e7..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-Instruct-Interleaved-Zeroed-13B/8984fe95-9fd3-48ff-aa5f-18df63ecd6bb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers-Dev_LLaMa-3.1-Instruct-Interleaved-Zeroed-13B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa-3.1-Instruct-Interleaved-Zeroed-13B", - "id": "PJMixers-Dev/LLaMa-3.1-Instruct-Interleaved-Zeroed-13B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.047 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7871 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2002 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest-8B/a0f6f5de-578c-4290-85b5-c51aed985074.json b/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest-8B/a0f6f5de-578c-4290-85b5-c51aed985074.json deleted file mode 100644 index 1df7617e1..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest-8B/a0f6f5de-578c-4290-85b5-c51aed985074.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers-Dev_LLaMa-3.1-RomboTiesTest-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa-3.1-RomboTiesTest-8B", - "id": "PJMixers-Dev/LLaMa-3.1-RomboTiesTest-8B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2002 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest2-8B/8ccc76ff-25c9-4706-b6a8-31b49f8be813.json 
b/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest2-8B/8ccc76ff-25c9-4706-b6a8-31b49f8be813.json deleted file mode 100644 index 3d4794050..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest2-8B/8ccc76ff-25c9-4706-b6a8-31b49f8be813.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers-Dev_LLaMa-3.1-RomboTiesTest2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa-3.1-RomboTiesTest2-8B", - "id": "PJMixers-Dev/LLaMa-3.1-RomboTiesTest2-8B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2002 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B/924f8b31-506d-4df2-8a7b-d0cd66d55f6d.json b/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B/924f8b31-506d-4df2-8a7b-d0cd66d55f6d.json deleted file mode 100644 index e62bf8d2d..000000000 --- 
a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B/924f8b31-506d-4df2-8a7b-d0cd66d55f6d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B", - "id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6931 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4556 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B/8e7dfd9f-350d-406c-811d-453f1744dd53.json b/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B/8e7dfd9f-350d-406c-811d-453f1744dd53.json deleted file mode 100644 index ddc4092b0..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B/8e7dfd9f-350d-406c-811d-453f1744dd53.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B", - "id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6292 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3659 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B/b713d1d2-351f-43a1-b77d-27723e1d4267.json b/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B/b713d1d2-351f-43a1-b77d-27723e1d4267.json deleted file mode 100644 index 68fb66922..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B/b713d1d2-351f-43a1-b77d-27723e1d4267.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B", - "id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6504 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4511 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMixBread-v0.1-3B/322a9442-174f-4223-b839-6f8f9664d5e5.json b/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMixBread-v0.1-3B/322a9442-174f-4223-b839-6f8f9664d5e5.json deleted file mode 100644 index b3614926d..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMixBread-v0.1-3B/322a9442-174f-4223-b839-6f8f9664d5e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/PJMixers-Dev_LLaMa-3.2-Instruct-JankMixBread-v0.1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa-3.2-Instruct-JankMixBread-v0.1-3B", - "id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMixBread-v0.1-3B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5041 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4483 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3083 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/Qwen2.5-RomboTiesTest-7B/b12e71d1-c435-4172-a28f-38e26791dadb.json b/data/hfopenllm_v2/PJMixers-Dev/Qwen2.5-RomboTiesTest-7B/b12e71d1-c435-4172-a28f-38e26791dadb.json deleted file mode 100644 index 16e94042b..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/Qwen2.5-RomboTiesTest-7B/b12e71d1-c435-4172-a28f-38e26791dadb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers-Dev_Qwen2.5-RomboTiesTest-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-RomboTiesTest-7B", - "id": "PJMixers-Dev/Qwen2.5-RomboTiesTest-7B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.808 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7558 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4285 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers/LLaMa-3-CursedStock-v2.0-8B/ad33b0e8-39c8-4118-81bd-bc86b482f122.json b/data/hfopenllm_v2/PJMixers/LLaMa-3-CursedStock-v2.0-8B/ad33b0e8-39c8-4118-81bd-bc86b482f122.json deleted file mode 100644 index 4d593a418..000000000 --- a/data/hfopenllm_v2/PJMixers/LLaMa-3-CursedStock-v2.0-8B/ad33b0e8-39c8-4118-81bd-bc86b482f122.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers_LLaMa-3-CursedStock-v2.0-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"LLaMa-3-CursedStock-v2.0-8B", - "id": "PJMixers/LLaMa-3-CursedStock-v2.0-8B", - "developer": "PJMixers", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3856 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3556 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Parissa3/test-model/db8a7864-293b-45e9-995b-5301071c902d.json b/data/hfopenllm_v2/Parissa3/test-model/db8a7864-293b-45e9-995b-5301071c902d.json deleted file mode 100644 index f98fab749..000000000 --- a/data/hfopenllm_v2/Parissa3/test-model/db8a7864-293b-45e9-995b-5301071c902d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Parissa3_test-model/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-model", - "id": "Parissa3/test-model", - "developer": "Parissa3", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4685 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pinkstack/PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B/31e3beea-28dc-4b47-a5e9-5fafc89226db.json b/data/hfopenllm_v2/Pinkstack/PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B/31e3beea-28dc-4b47-a5e9-5fafc89226db.json deleted file mode 100644 index d4a800079..000000000 --- a/data/hfopenllm_v2/Pinkstack/PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B/31e3beea-28dc-4b47-a5e9-5fafc89226db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pinkstack_PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B", - "id": "Pinkstack/PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B", - "developer": "Pinkstack", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5085 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4711 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1692 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3511 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pinkstack/SuperThoughts-CoT-14B-16k-o1-QwQ/49315a95-394f-4508-8e6c-7c1d5547c257.json b/data/hfopenllm_v2/Pinkstack/SuperThoughts-CoT-14B-16k-o1-QwQ/49315a95-394f-4508-8e6c-7c1d5547c257.json deleted file mode 100644 index 13d84a54a..000000000 --- a/data/hfopenllm_v2/Pinkstack/SuperThoughts-CoT-14B-16k-o1-QwQ/49315a95-394f-4508-8e6c-7c1d5547c257.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pinkstack_SuperThoughts-CoT-14B-16k-o1-QwQ/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SuperThoughts-CoT-14B-16k-o1-QwQ", - "id": "Pinkstack/SuperThoughts-CoT-14B-16k-o1-QwQ", - "developer": "Pinkstack", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.0515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5268 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pinkstack/Superthoughts-lite-1.8B-experimental-o1/375d3a94-97af-47ef-82af-afd7581663d4.json b/data/hfopenllm_v2/Pinkstack/Superthoughts-lite-1.8B-experimental-o1/375d3a94-97af-47ef-82af-afd7581663d4.json deleted file mode 100644 index c0e931e91..000000000 --- a/data/hfopenllm_v2/Pinkstack/Superthoughts-lite-1.8B-experimental-o1/375d3a94-97af-47ef-82af-afd7581663d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pinkstack_Superthoughts-lite-1.8B-experimental-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Superthoughts-lite-1.8B-experimental-o1", - "id": "Pinkstack/Superthoughts-lite-1.8B-experimental-o1", - "developer": "Pinkstack", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.812 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1851 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pinkstack/Superthoughts-lite-v1/77cfe896-4aa1-4bcd-a39a-f437c3f7e738.json b/data/hfopenllm_v2/Pinkstack/Superthoughts-lite-v1/77cfe896-4aa1-4bcd-a39a-f437c3f7e738.json deleted file mode 100644 index 7b240a43f..000000000 --- a/data/hfopenllm_v2/Pinkstack/Superthoughts-lite-v1/77cfe896-4aa1-4bcd-a39a-f437c3f7e738.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pinkstack_Superthoughts-lite-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Superthoughts-lite-v1", - "id": "Pinkstack/Superthoughts-lite-v1", - "developer": "Pinkstack", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1659 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1755 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PocketDoc/Dans-Instruct-CoreCurriculum-12b/3d69ec7d-9999-4e16-8dc9-99fad35e156e.json b/data/hfopenllm_v2/PocketDoc/Dans-Instruct-CoreCurriculum-12b/3d69ec7d-9999-4e16-8dc9-99fad35e156e.json deleted file mode 100644 index cb170495e..000000000 --- a/data/hfopenllm_v2/PocketDoc/Dans-Instruct-CoreCurriculum-12b/3d69ec7d-9999-4e16-8dc9-99fad35e156e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PocketDoc_Dans-Instruct-CoreCurriculum-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-Instruct-CoreCurriculum-12b", - "id": "PocketDoc/Dans-Instruct-CoreCurriculum-12b", - "developer": "PocketDoc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2191 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1219 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.1.0-12b/d2a7459b-8a12-4529-b978-c7237979f16b.json b/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.1.0-12b/d2a7459b-8a12-4529-b978-c7237979f16b.json deleted file mode 100644 index 54790f749..000000000 --- a/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.1.0-12b/d2a7459b-8a12-4529-b978-c7237979f16b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PocketDoc_Dans-PersonalityEngine-V1.1.0-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-PersonalityEngine-V1.1.0-12b", - "id": "PocketDoc/Dans-PersonalityEngine-V1.1.0-12b", - "developer": "PocketDoc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7075 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5361 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4587 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.2.0-24b/e7a228ad-69de-471a-9f31-6bdc7221999c.json b/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.2.0-24b/e7a228ad-69de-471a-9f31-6bdc7221999c.json deleted file mode 100644 index 55de50f7e..000000000 --- a/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.2.0-24b/e7a228ad-69de-471a-9f31-6bdc7221999c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PocketDoc_Dans-PersonalityEngine-V1.2.0-24b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-PersonalityEngine-V1.2.0-24b", - "id": "PocketDoc/Dans-PersonalityEngine-V1.2.0-24b", - "developer": "PocketDoc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7886 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2455 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5026 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-v1.0.0-8b/9196ae39-adb0-4d53-8399-0ccd4d628065.json b/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-v1.0.0-8b/9196ae39-adb0-4d53-8399-0ccd4d628065.json deleted file mode 100644 index 28534a7f1..000000000 --- a/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-v1.0.0-8b/9196ae39-adb0-4d53-8399-0ccd4d628065.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PocketDoc_Dans-PersonalityEngine-v1.0.0-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-PersonalityEngine-v1.0.0-8b", - "id": "PocketDoc/Dans-PersonalityEngine-v1.0.0-8b", - "developer": "PocketDoc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4982 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4733 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3065 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PocketDoc/Dans-SakuraKaze-V1.0.0-12b/ea318f99-a1ab-41ed-ae5d-39c62ac40e1b.json b/data/hfopenllm_v2/PocketDoc/Dans-SakuraKaze-V1.0.0-12b/ea318f99-a1ab-41ed-ae5d-39c62ac40e1b.json deleted file mode 100644 index 4fbe90b2e..000000000 --- a/data/hfopenllm_v2/PocketDoc/Dans-SakuraKaze-V1.0.0-12b/ea318f99-a1ab-41ed-ae5d-39c62ac40e1b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PocketDoc_Dans-SakuraKaze-V1.0.0-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-SakuraKaze-V1.0.0-12b", - "id": "PocketDoc/Dans-SakuraKaze-V1.0.0-12b", - "developer": "PocketDoc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5405 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4745 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.356 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PowerInfer/SmallThinker-3B-Preview/05f69fd6-a77e-478d-ad86-3e83e615e892.json b/data/hfopenllm_v2/PowerInfer/SmallThinker-3B-Preview/05f69fd6-a77e-478d-ad86-3e83e615e892.json deleted file mode 100644 index d49ad061e..000000000 --- a/data/hfopenllm_v2/PowerInfer/SmallThinker-3B-Preview/05f69fd6-a77e-478d-ad86-3e83e615e892.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PowerInfer_SmallThinker-3B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmallThinker-3B-Preview", - "id": "PowerInfer/SmallThinker-3B-Preview", - "developer": "PowerInfer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2779 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3525 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3018 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PranavHarshan/LaMistral-V4/5b8e9508-befb-4674-bd84-9c722a0864ce.json b/data/hfopenllm_v2/PranavHarshan/LaMistral-V4/5b8e9508-befb-4674-bd84-9c722a0864ce.json deleted file mode 100644 index 8281de999..000000000 --- a/data/hfopenllm_v2/PranavHarshan/LaMistral-V4/5b8e9508-befb-4674-bd84-9c722a0864ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PranavHarshan_LaMistral-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LaMistral-V4", - "id": "PranavHarshan/LaMistral-V4", - "developer": "PranavHarshan", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6239 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5184 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3643 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PranavHarshan/MedNarra-X1/8beb3730-23e8-4b89-933d-2d3f1a1d1365.json b/data/hfopenllm_v2/PranavHarshan/MedNarra-X1/8beb3730-23e8-4b89-933d-2d3f1a1d1365.json deleted file mode 100644 index 3cf872d63..000000000 --- a/data/hfopenllm_v2/PranavHarshan/MedNarra-X1/8beb3730-23e8-4b89-933d-2d3f1a1d1365.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PranavHarshan_MedNarra-X1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MedNarra-X1", - "id": "PranavHarshan/MedNarra-X1", - "developer": "PranavHarshan", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4338 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Appended/07417712-1933-4920-8964-67ba74bf6d01.json b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Appended/07417712-1933-4920-8964-67ba74bf6d01.json deleted file mode 100644 index d98817118..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Appended/07417712-1933-4920-8964-67ba74bf6d01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_10.7B_48Layers-Appended/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_10.7B_48Layers-Appended", - "id": "Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Appended", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Interleaved/ae4cc05d-a65a-4f18-a99c-f133603686d1.json b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Interleaved/ae4cc05d-a65a-4f18-a99c-f133603686d1.json deleted file mode 100644 index 80b91addb..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Interleaved/ae4cc05d-a65a-4f18-a99c-f133603686d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_10.7B_48Layers-Interleaved/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_10.7B_48Layers-Interleaved", - "id": "Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Interleaved", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_32K-PoSE/54df4d3e-0ef0-4e30-aa46-b47a4589a34c.json b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_32K-PoSE/54df4d3e-0ef0-4e30-aa46-b47a4589a34c.json deleted file mode 100644 index f50ca0914..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_32K-PoSE/54df4d3e-0ef0-4e30-aa46-b47a4589a34c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_32K-PoSE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_32K-PoSE", - "id": "Pretergeek/OpenChat-3.5-0106_32K-PoSE", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3969 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3471 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4205 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2031 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Appended/a717d466-9157-4991-8459-f39847d914a2.json 
b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Appended/a717d466-9157-4991-8459-f39847d914a2.json deleted file mode 100644 index a32aae183..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Appended/a717d466-9157-4991-8459-f39847d914a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_8.11B_36Layers-Appended/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_8.11B_36Layers-Appended", - "id": "Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Appended", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.114 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5976 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Interleaved/15a8789b-27de-49d1-b3e5-9b1fc9b5694e.json 
b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Interleaved/15a8789b-27de-49d1-b3e5-9b1fc9b5694e.json deleted file mode 100644 index eea5d5d83..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Interleaved/15a8789b-27de-49d1-b3e5-9b1fc9b5694e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_8.11B_36Layers-Interleaved/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_8.11B_36Layers-Interleaved", - "id": "Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Interleaved", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.114 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Appended/921562fe-cc21-4ff3-93de-a62e1d4bf7e7.json 
b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Appended/921562fe-cc21-4ff3-93de-a62e1d4bf7e7.json deleted file mode 100644 index 8ecc037ff..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Appended/921562fe-cc21-4ff3-93de-a62e1d4bf7e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_8.99B_40Layers-Appended/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_8.99B_40Layers-Appended", - "id": "Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Appended", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.987 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Interleaved/863969d9-e567-43cc-a0a9-7f80eaba374a.json 
b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Interleaved/863969d9-e567-43cc-a0a9-7f80eaba374a.json deleted file mode 100644 index dde7d977d..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Interleaved/863969d9-e567-43cc-a0a9-7f80eaba374a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_8.99B_40Layers-Interleaved/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_8.99B_40Layers-Interleaved", - "id": "Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Interleaved", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.987 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5976 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_9.86B_44Layers-Appended/2987fa45-363e-4a07-8e9f-db01586a135b.json 
b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_9.86B_44Layers-Appended/2987fa45-363e-4a07-8e9f-db01586a135b.json deleted file mode 100644 index 465c9e212..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_9.86B_44Layers-Appended/2987fa45-363e-4a07-8e9f-db01586a135b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_9.86B_44Layers-Appended/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_9.86B_44Layers-Appended", - "id": "Pretergeek/OpenChat-3.5-0106_9.86B_44Layers-Appended", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 9.859 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pretergeek/openchat-3.5-0106_Rebased_Mistral-7B-v0.2/3488de21-d9a6-49e8-ba8f-d9beee9bdabe.json 
b/data/hfopenllm_v2/Pretergeek/openchat-3.5-0106_Rebased_Mistral-7B-v0.2/3488de21-d9a6-49e8-ba8f-d9beee9bdabe.json deleted file mode 100644 index cf3bb48e6..000000000 --- a/data/hfopenllm_v2/Pretergeek/openchat-3.5-0106_Rebased_Mistral-7B-v0.2/3488de21-d9a6-49e8-ba8f-d9beee9bdabe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_openchat-3.5-0106_Rebased_Mistral-7B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openchat-3.5-0106_Rebased_Mistral-7B-v0.2", - "id": "Pretergeek/openchat-3.5-0106_Rebased_Mistral-7B-v0.2", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3627 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.484 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.283 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1-Instruct/0cacf042-6b62-4b67-8821-97cd703788d0.json b/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1-Instruct/0cacf042-6b62-4b67-8821-97cd703788d0.json deleted file mode 100644 
index 10fb00e0e..000000000 --- a/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1-Instruct/0cacf042-6b62-4b67-8821-97cd703788d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PrimeIntellect_INTELLECT-1-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "INTELLECT-1-Instruct", - "id": "PrimeIntellect/INTELLECT-1-Instruct", - "developer": "PrimeIntellect", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.211 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3577 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1064 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/9f0dfceb-1332-447a-bf6f-6c6c40686a6f.json b/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/9f0dfceb-1332-447a-bf6f-6c6c40686a6f.json deleted file mode 100644 index 978dc1bf9..000000000 --- a/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/9f0dfceb-1332-447a-bf6f-6c6c40686a6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/PrimeIntellect_INTELLECT-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "INTELLECT-1", - "id": "PrimeIntellect/INTELLECT-1", - "developer": "PrimeIntellect", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.211 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/c1308f95-6d55-4ff6-b14e-1bd09b467d99.json b/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/c1308f95-6d55-4ff6-b14e-1bd09b467d99.json deleted file mode 100644 index 5993f7b41..000000000 --- a/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/c1308f95-6d55-4ff6-b14e-1bd09b467d99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PrimeIntellect_INTELLECT-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "INTELLECT-1", - "id": "PrimeIntellect/INTELLECT-1", - "developer": "PrimeIntellect", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.211 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.274 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PuxAI/LUA_model/4ab16120-8d39-4dea-aa76-5c249506848d.json b/data/hfopenllm_v2/PuxAI/LUA_model/4ab16120-8d39-4dea-aa76-5c249506848d.json deleted file mode 100644 index 31a3fa054..000000000 --- a/data/hfopenllm_v2/PuxAI/LUA_model/4ab16120-8d39-4dea-aa76-5c249506848d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PuxAI_LUA_model/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LUA_model", - "id": "PuxAI/LUA_model", - "developer": "PuxAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.386 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2282 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2877 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3484 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PygmalionAI/pygmalion-6b/f9647ea0-6464-4aa0-b1ea-a994a7bcca3c.json b/data/hfopenllm_v2/PygmalionAI/pygmalion-6b/f9647ea0-6464-4aa0-b1ea-a994a7bcca3c.json deleted file mode 100644 index e8e7d6daf..000000000 --- a/data/hfopenllm_v2/PygmalionAI/pygmalion-6b/f9647ea0-6464-4aa0-b1ea-a994a7bcca3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PygmalionAI_pygmalion-6b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pygmalion-6b", - "id": "PygmalionAI/pygmalion-6b", - "developer": "PygmalionAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTJForCausalLM", - "params_billions": 6.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2091 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Q-bert/MetaMath-1B/c5ef47ab-2e73-43d6-b9ea-1ee7e50d9df8.json b/data/hfopenllm_v2/Q-bert/MetaMath-1B/c5ef47ab-2e73-43d6-b9ea-1ee7e50d9df8.json deleted file mode 100644 index c76a87d90..000000000 --- a/data/hfopenllm_v2/Q-bert/MetaMath-1B/c5ef47ab-2e73-43d6-b9ea-1ee7e50d9df8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Q-bert_MetaMath-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MetaMath-1B", - "id": "Q-bert/MetaMath-1B", - "developer": "Q-bert", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3451 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1495 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/1up-14b/9ef7a4a0-b751-45ff-ab1f-d50687a3f4c3.json b/data/hfopenllm_v2/Quazim0t0/1up-14b/9ef7a4a0-b751-45ff-ab1f-d50687a3f4c3.json deleted file mode 100644 index 3622e1d5d..000000000 --- a/data/hfopenllm_v2/Quazim0t0/1up-14b/9ef7a4a0-b751-45ff-ab1f-d50687a3f4c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_1up-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "1up-14b", - "id": "Quazim0t0/1up-14b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6888 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6921 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4583 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Adamant-14B-sce/8b303795-557b-4fa1-bbc6-d36bd77ee739.json b/data/hfopenllm_v2/Quazim0t0/Adamant-14B-sce/8b303795-557b-4fa1-bbc6-d36bd77ee739.json deleted file mode 100644 index 620dc27f3..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Adamant-14B-sce/8b303795-557b-4fa1-bbc6-d36bd77ee739.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Adamant-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Adamant-14B-sce", - "id": "Quazim0t0/Adamant-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6858 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6859 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, 
- { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4558 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Alice-14B/7fec288e-0b0d-45c0-b0e6-17b905cd7ea3.json b/data/hfopenllm_v2/Quazim0t0/Alice-14B/7fec288e-0b0d-45c0-b0e6-17b905cd7ea3.json deleted file mode 100644 index bb5caa0bb..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Alice-14B/7fec288e-0b0d-45c0-b0e6-17b905cd7ea3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Alice-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Alice-14B", - "id": "Quazim0t0/Alice-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6836 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6938 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Alien-CoT-14B-sce/5a09783b-82da-43ae-a607-2cfea550d931.json b/data/hfopenllm_v2/Quazim0t0/Alien-CoT-14B-sce/5a09783b-82da-43ae-a607-2cfea550d931.json deleted file mode 100644 index e5578e0bc..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Alien-CoT-14B-sce/5a09783b-82da-43ae-a607-2cfea550d931.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Alien-CoT-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Alien-CoT-14B-sce", - "id": "Quazim0t0/Alien-CoT-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0749 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4785 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Aura-8B-Linear/6c2d191a-a2d1-459c-b2e2-5766bec62ce7.json b/data/hfopenllm_v2/Quazim0t0/Aura-8B-Linear/6c2d191a-a2d1-459c-b2e2-5766bec62ce7.json deleted file mode 100644 index 3a532220d..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Aura-8B-Linear/6c2d191a-a2d1-459c-b2e2-5766bec62ce7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Aura-8B-Linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aura-8B-Linear", - "id": "Quazim0t0/Aura-8B-Linear", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7948 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5074 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3801 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/121cb5fc-2fa2-4718-b325-c40014802e40.json b/data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/121cb5fc-2fa2-4718-b325-c40014802e40.json deleted file mode 100644 index 90b774872..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/121cb5fc-2fa2-4718-b325-c40014802e40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Casa-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Casa-14b-sce", - "id": "Quazim0t0/Casa-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6718 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6891 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4985 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - } - } - ] -} \ No newline at end of 
file diff --git a/data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/8bbfa040-b16e-4116-ad3e-b3e4e58a7de6.json b/data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/8bbfa040-b16e-4116-ad3e-b3e4e58a7de6.json deleted file mode 100644 index 1cd19c932..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/8bbfa040-b16e-4116-ad3e-b3e4e58a7de6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Casa-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Casa-14b-sce", - "id": "Quazim0t0/Casa-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6654 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6901 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4698 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5426 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Charlie-8B-Linear/c8891914-c9fb-4b4d-9592-826f04520e7b.json b/data/hfopenllm_v2/Quazim0t0/Charlie-8B-Linear/c8891914-c9fb-4b4d-9592-826f04520e7b.json deleted file mode 100644 index 4db9f876c..000000000 --- 
a/data/hfopenllm_v2/Quazim0t0/Charlie-8B-Linear/c8891914-c9fb-4b4d-9592-826f04520e7b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Charlie-8B-Linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Charlie-8B-Linear", - "id": "Quazim0t0/Charlie-8B-Linear", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3485 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3573 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Chromatic-8b-sce/e77ffcb3-c7d8-4700-b4ea-fe4e5ba94223.json b/data/hfopenllm_v2/Quazim0t0/Chromatic-8b-sce/e77ffcb3-c7d8-4700-b4ea-fe4e5ba94223.json deleted file mode 100644 index 3005f0449..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Chromatic-8b-sce/e77ffcb3-c7d8-4700-b4ea-fe4e5ba94223.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Chromatic-8b-sce/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chromatic-8b-sce", - "id": "Quazim0t0/Chromatic-8b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5085 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5063 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4051 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3755 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/CoT_Phi/da237415-f34e-4cbb-9a94-3ff621f3df8d.json b/data/hfopenllm_v2/Quazim0t0/CoT_Phi/da237415-f34e-4cbb-9a94-3ff621f3df8d.json deleted file mode 100644 index c2af11b27..000000000 --- a/data/hfopenllm_v2/Quazim0t0/CoT_Phi/da237415-f34e-4cbb-9a94-3ff621f3df8d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_CoT_Phi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CoT_Phi", - "id": "Quazim0t0/CoT_Phi", - "developer": "Quazim0t0", 
- "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6159 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6751 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3308 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4901 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Dyson-14b/479f3bfa-d614-46a9-88c7-9891852b0d8c.json b/data/hfopenllm_v2/Quazim0t0/Dyson-14b/479f3bfa-d614-46a9-88c7-9891852b0d8c.json deleted file mode 100644 index 05cb9e800..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Dyson-14b/479f3bfa-d614-46a9-88c7-9891852b0d8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Dyson-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dyson-14b", - "id": "Quazim0t0/Dyson-14b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5857 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6863 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Edu-14B-Linear/f5f0c7da-fb03-4023-81a7-801b0729a19d.json b/data/hfopenllm_v2/Quazim0t0/Edu-14B-Linear/f5f0c7da-fb03-4023-81a7-801b0729a19d.json deleted file mode 100644 index ce336e83e..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Edu-14B-Linear/f5f0c7da-fb03-4023-81a7-801b0729a19d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Edu-14B-Linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Edu-14B-Linear", - "id": "Quazim0t0/Edu-14B-Linear", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6158 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6758 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5086 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Fugazi14b/40f51424-2922-498d-bbbc-d500667a8554.json b/data/hfopenllm_v2/Quazim0t0/Fugazi14b/40f51424-2922-498d-bbbc-d500667a8554.json deleted file mode 100644 index d3ba7ceca..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Fugazi14b/40f51424-2922-498d-bbbc-d500667a8554.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Fugazi14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fugazi14b", - "id": "Quazim0t0/Fugazi14b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6998 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.6941 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4653 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4546 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5417 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/GZA-14B-sce/4f25d177-6bcf-4864-87a4-1beb21a7373d.json b/data/hfopenllm_v2/Quazim0t0/GZA-14B-sce/4f25d177-6bcf-4864-87a4-1beb21a7373d.json deleted file mode 100644 index f440931a3..000000000 --- a/data/hfopenllm_v2/Quazim0t0/GZA-14B-sce/4f25d177-6bcf-4864-87a4-1beb21a7373d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_GZA-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GZA-14B-sce", - "id": "Quazim0t0/GZA-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6274 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6687 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4285 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5232 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Geedorah-14B/b160ab1f-be6b-4dfa-8fa9-36fc65a64782.json b/data/hfopenllm_v2/Quazim0t0/Geedorah-14B/b160ab1f-be6b-4dfa-8fa9-36fc65a64782.json deleted file mode 100644 index 36ede1fd0..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Geedorah-14B/b160ab1f-be6b-4dfa-8fa9-36fc65a64782.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Geedorah-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Geedorah-14B", - "id": "Quazim0t0/Geedorah-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6873 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6964 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4449 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4547 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/GivingTree-8b-sce/d497a7e3-11c2-4e0c-8788-091caabede56.json b/data/hfopenllm_v2/Quazim0t0/GivingTree-8b-sce/d497a7e3-11c2-4e0c-8788-091caabede56.json deleted file mode 100644 index cab765743..000000000 --- a/data/hfopenllm_v2/Quazim0t0/GivingTree-8b-sce/d497a7e3-11c2-4e0c-8788-091caabede56.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_GivingTree-8b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GivingTree-8b-sce", - "id": "Quazim0t0/GivingTree-8b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5006 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.504 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1526 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4051 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/GuiltySpark-14B-ties/4a55bcf2-e1c1-4fce-8f79-472dae869b26.json b/data/hfopenllm_v2/Quazim0t0/GuiltySpark-14B-ties/4a55bcf2-e1c1-4fce-8f79-472dae869b26.json deleted file mode 100644 index 00575bbe7..000000000 --- a/data/hfopenllm_v2/Quazim0t0/GuiltySpark-14B-ties/4a55bcf2-e1c1-4fce-8f79-472dae869b26.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_GuiltySpark-14B-ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GuiltySpark-14B-ties", - "id": "Quazim0t0/GuiltySpark-14B-ties", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6854 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6914 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3837 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4557 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Halo-14B-sce/5b00dd5e-0ad3-4ea0-aa0d-2327d610e6a6.json b/data/hfopenllm_v2/Quazim0t0/Halo-14B-sce/5b00dd5e-0ad3-4ea0-aa0d-2327d610e6a6.json deleted file mode 100644 index 8957c0b18..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Halo-14B-sce/5b00dd5e-0ad3-4ea0-aa0d-2327d610e6a6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Halo-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Halo-14B-sce", - "id": "Quazim0t0/Halo-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6876 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Heretic1.5b/1c80d383-1ccb-4f32-a63d-dd3954fe5f6b.json b/data/hfopenllm_v2/Quazim0t0/Heretic1.5b/1c80d383-1ccb-4f32-a63d-dd3954fe5f6b.json deleted file mode 100644 index 171487262..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Heretic1.5b/1c80d383-1ccb-4f32-a63d-dd3954fe5f6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Heretic1.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Heretic1.5b", - "id": "Quazim0t0/Heretic1.5b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.73 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.244 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3511 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1728 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Quazim0t0/Hyde-14b-sce/75065074-7ef6-41ac-be7c-496cc458640a.json b/data/hfopenllm_v2/Quazim0t0/Hyde-14b-sce/75065074-7ef6-41ac-be7c-496cc458640a.json deleted file mode 100644 index 305107133..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Hyde-14b-sce/75065074-7ef6-41ac-be7c-496cc458640a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Hyde-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hyde-14b-sce", - "id": "Quazim0t0/Hyde-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6715 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2734 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Imagine-v0.5-16bit/49a0287b-48d7-44db-bf20-a084919d332f.json b/data/hfopenllm_v2/Quazim0t0/Imagine-v0.5-16bit/49a0287b-48d7-44db-bf20-a084919d332f.json deleted file mode 100644 index f52a4474a..000000000 --- 
a/data/hfopenllm_v2/Quazim0t0/Imagine-v0.5-16bit/49a0287b-48d7-44db-bf20-a084919d332f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Imagine-v0.5-16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Imagine-v0.5-16bit", - "id": "Quazim0t0/Imagine-v0.5-16bit", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2759 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6769 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4349 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Imbue-14b/7b2861ee-58f9-4ac9-99ee-2ec663e1b157.json b/data/hfopenllm_v2/Quazim0t0/Imbue-14b/7b2861ee-58f9-4ac9-99ee-2ec663e1b157.json deleted file mode 100644 index 8fbf439a2..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Imbue-14b/7b2861ee-58f9-4ac9-99ee-2ec663e1b157.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Imbue-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Imbue-14b", - "id": "Quazim0t0/Imbue-14b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6845 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4167 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Insom/628542f9-fac6-42a7-8ec5-5cd93f977a7e.json b/data/hfopenllm_v2/Quazim0t0/Insom/628542f9-fac6-42a7-8ec5-5cd93f977a7e.json deleted file mode 100644 index b3398ce09..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Insom/628542f9-fac6-42a7-8ec5-5cd93f977a7e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Insom/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Insom", - "id": "Quazim0t0/Insom", - "developer": "Quazim0t0", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6818 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6881 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4311 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/InspectorDeck-14B-sce/5b0924ae-cf52-4245-a687-91e4b1742c16.json b/data/hfopenllm_v2/Quazim0t0/InspectorDeck-14B-sce/5b0924ae-cf52-4245-a687-91e4b1742c16.json deleted file mode 100644 index 6da062e91..000000000 --- a/data/hfopenllm_v2/Quazim0t0/InspectorDeck-14B-sce/5b0924ae-cf52-4245-a687-91e4b1742c16.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_InspectorDeck-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "InspectorDeck-14B-sce", - "id": "Quazim0t0/InspectorDeck-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3241 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6668 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3165 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3982 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5261 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Jekyl-8b-sce/459c2b98-c3af-4334-a4bc-13334efe49b8.json b/data/hfopenllm_v2/Quazim0t0/Jekyl-8b-sce/459c2b98-c3af-4334-a4bc-13334efe49b8.json deleted file mode 100644 index 6119a139c..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Jekyl-8b-sce/459c2b98-c3af-4334-a4bc-13334efe49b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Jekyl-8b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jekyl-8b-sce", - "id": "Quazim0t0/Jekyl-8b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4697 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4994 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Jigsaw-14B-Linear/b2780aa3-d299-4180-8441-dd54e94255cb.json b/data/hfopenllm_v2/Quazim0t0/Jigsaw-14B-Linear/b2780aa3-d299-4180-8441-dd54e94255cb.json deleted file mode 100644 index ab44c9d65..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Jigsaw-14B-Linear/b2780aa3-d299-4180-8441-dd54e94255cb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Jigsaw-14B-Linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jigsaw-14B-Linear", - "id": "Quazim0t0/Jigsaw-14B-Linear", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.648 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4483 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5234 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Katana-8b-sce/f55d398d-0555-4e89-a37c-def04741a0dd.json b/data/hfopenllm_v2/Quazim0t0/Katana-8b-sce/f55d398d-0555-4e89-a37c-def04741a0dd.json deleted file mode 100644 index 3136f0083..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Katana-8b-sce/f55d398d-0555-4e89-a37c-def04741a0dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Katana-8b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Katana-8b-sce", - "id": "Quazim0t0/Katana-8b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - 
}, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1511 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4038 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3771 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Knot-CoT-14B-sce/63caf8f8-9e55-4ef6-ae76-ee7184a50675.json b/data/hfopenllm_v2/Quazim0t0/Knot-CoT-14B-sce/63caf8f8-9e55-4ef6-ae76-ee7184a50675.json deleted file mode 100644 index 12f493aa0..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Knot-CoT-14B-sce/63caf8f8-9e55-4ef6-ae76-ee7184a50675.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Knot-CoT-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Knot-CoT-14B-sce", - "id": "Quazim0t0/Knot-CoT-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4832 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6616 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - }, - { - "evaluation_name": "GPQA", 
- "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Lineage-14B/f82ccde3-bd3b-499c-8b8c-182822392cea.json b/data/hfopenllm_v2/Quazim0t0/Lineage-14B/f82ccde3-bd3b-499c-8b8c-182822392cea.json deleted file mode 100644 index 2aa939b3a..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Lineage-14B/f82ccde3-bd3b-499c-8b8c-182822392cea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Lineage-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lineage-14B", - "id": "Quazim0t0/Lineage-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.707 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6934 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3599 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4597 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Lo-Phi-14b/8a52fb4a-d6ae-4c8d-aed0-2137e0a83ea1.json b/data/hfopenllm_v2/Quazim0t0/Lo-Phi-14b/8a52fb4a-d6ae-4c8d-aed0-2137e0a83ea1.json deleted file mode 100644 index 26d2ee23e..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Lo-Phi-14b/8a52fb4a-d6ae-4c8d-aed0-2137e0a83ea1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Lo-Phi-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lo-Phi-14b", - "id": "Quazim0t0/Lo-Phi-14b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6852 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Loke-14B-sce/b7cbc2fb-2c52-4c13-9266-52103421f2ee.json b/data/hfopenllm_v2/Quazim0t0/Loke-14B-sce/b7cbc2fb-2c52-4c13-9266-52103421f2ee.json deleted file mode 100644 index 65be87e3f..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Loke-14B-sce/b7cbc2fb-2c52-4c13-9266-52103421f2ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Loke-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Loke-14B-sce", - "id": "Quazim0t0/Loke-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6848 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6924 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3905 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/MFDOOM-14B/f4474361-e897-4dbb-a89e-5451a4724474.json b/data/hfopenllm_v2/Quazim0t0/MFDOOM-14B/f4474361-e897-4dbb-a89e-5451a4724474.json deleted file mode 100644 index dc38a1ad9..000000000 --- a/data/hfopenllm_v2/Quazim0t0/MFDOOM-14B/f4474361-e897-4dbb-a89e-5451a4724474.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_MFDOOM-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFDOOM-14B", - "id": "Quazim0t0/MFDOOM-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6736 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6916 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4377 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5426 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Quazim0t0/MFGRIMM-14B/de257b5e-4629-4f8a-b08d-d2ca372593e2.json b/data/hfopenllm_v2/Quazim0t0/MFGRIMM-14B/de257b5e-4629-4f8a-b08d-d2ca372593e2.json deleted file mode 100644 index ed96fac2b..000000000 --- a/data/hfopenllm_v2/Quazim0t0/MFGRIMM-14B/de257b5e-4629-4f8a-b08d-d2ca372593e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_MFGRIMM-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFGRIMM-14B", - "id": "Quazim0t0/MFGRIMM-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6894 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6909 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Math_Phi4_Reason/a37aada3-104a-488a-898f-245ff257de46.json b/data/hfopenllm_v2/Quazim0t0/Math_Phi4_Reason/a37aada3-104a-488a-898f-245ff257de46.json deleted file mode 100644 index 79be95de5..000000000 --- 
a/data/hfopenllm_v2/Quazim0t0/Math_Phi4_Reason/a37aada3-104a-488a-898f-245ff257de46.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Math_Phi4_Reason/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Math_Phi4_Reason", - "id": "Quazim0t0/Math_Phi4_Reason", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.322 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3278 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Mithril-14B-sce/d9d655d1-d94c-483a-a3a2-ca196e1391d1.json b/data/hfopenllm_v2/Quazim0t0/Mithril-14B-sce/d9d655d1-d94c-483a-a3a2-ca196e1391d1.json deleted file mode 100644 index 0346b1864..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Mithril-14B-sce/d9d655d1-d94c-483a-a3a2-ca196e1391d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Mithril-14B-sce/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mithril-14B-sce", - "id": "Quazim0t0/Mithril-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6958 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6926 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3822 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3691 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5403 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Mononoke-14B-sce/77bf7126-0cb9-43ef-8d23-5f1395f91642.json b/data/hfopenllm_v2/Quazim0t0/Mononoke-14B-sce/77bf7126-0cb9-43ef-8d23-5f1395f91642.json deleted file mode 100644 index 24ae15f0e..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Mononoke-14B-sce/77bf7126-0cb9-43ef-8d23-5f1395f91642.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Mononoke-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mononoke-14B-sce", - "id": 
"Quazim0t0/Mononoke-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6744 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4698 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5298 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Motion-8B-Linear/73f410be-3084-4994-8406-f8ac70880626.json b/data/hfopenllm_v2/Quazim0t0/Motion-8B-Linear/73f410be-3084-4994-8406-f8ac70880626.json deleted file mode 100644 index 6934a4cda..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Motion-8B-Linear/73f410be-3084-4994-8406-f8ac70880626.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Motion-8B-Linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Motion-8B-Linear", - "id": "Quazim0t0/Motion-8B-Linear", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7686 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3606 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3785 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Mouse-9B/24caad7a-15fa-4820-91cc-0f544a34d173.json b/data/hfopenllm_v2/Quazim0t0/Mouse-9B/24caad7a-15fa-4820-91cc-0f544a34d173.json deleted file mode 100644 index cd04f17a6..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Mouse-9B/24caad7a-15fa-4820-91cc-0f544a34d173.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Mouse-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mouse-9B", - "id": "Quazim0t0/Mouse-9B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 9.207 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.1325 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2979 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.347 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Nova-14b-sce/e087b221-f813-4688-8d98-17980f98ac5b.json b/data/hfopenllm_v2/Quazim0t0/Nova-14b-sce/e087b221-f813-4688-8d98-17980f98ac5b.json deleted file mode 100644 index 5316de52d..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Nova-14b-sce/e087b221-f813-4688-8d98-17980f98ac5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Nova-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nova-14b-sce", - "id": "Quazim0t0/Nova-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4571 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/NovaScotia-14b-stock/f4d03bff-3b34-497f-a17f-0379bc562f11.json b/data/hfopenllm_v2/Quazim0t0/NovaScotia-14b-stock/f4d03bff-3b34-497f-a17f-0379bc562f11.json deleted file mode 100644 index b80b45290..000000000 --- a/data/hfopenllm_v2/Quazim0t0/NovaScotia-14b-stock/f4d03bff-3b34-497f-a17f-0379bc562f11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_NovaScotia-14b-stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NovaScotia-14b-stock", - "id": "Quazim0t0/NovaScotia-14b-stock", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4493 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5409 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/2ca21612-ea90-41f3-b618-3ea81c09c3ae.json b/data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/2ca21612-ea90-41f3-b618-3ea81c09c3ae.json deleted file mode 100644 index 78c79f5cf..000000000 --- a/data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/2ca21612-ea90-41f3-b618-3ea81c09c3ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_ODB-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ODB-14B-sce", - "id": "Quazim0t0/ODB-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Unknown", - "params_billions": 0.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2545 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3929 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/d4dc2088-9911-4966-afe9-022df89dd522.json b/data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/d4dc2088-9911-4966-afe9-022df89dd522.json deleted file mode 100644 index a373a9579..000000000 --- a/data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/d4dc2088-9911-4966-afe9-022df89dd522.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_ODB-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ODB-14b-sce", - "id": "Quazim0t0/ODB-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7016 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6942 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4116 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4571 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Oasis-14B-ties/ad03a075-8f24-46f6-ae04-5a04eb7061c1.json b/data/hfopenllm_v2/Quazim0t0/Oasis-14B-ties/ad03a075-8f24-46f6-ae04-5a04eb7061c1.json deleted file mode 100644 index 5c2da9f94..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Oasis-14B-ties/ad03a075-8f24-46f6-ae04-5a04eb7061c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Oasis-14B-ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Oasis-14B-ties", - "id": "Quazim0t0/Oasis-14B-ties", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6937 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6915 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4571 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5405 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Origami-14B-sce/2d1da226-e65c-48a0-aabb-46b1cf670a82.json b/data/hfopenllm_v2/Quazim0t0/Origami-14B-sce/2d1da226-e65c-48a0-aabb-46b1cf670a82.json deleted file mode 100644 index c9e2044e6..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Origami-14B-sce/2d1da226-e65c-48a0-aabb-46b1cf670a82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Origami-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Origami-14B-sce", - "id": "Quazim0t0/Origami-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3259 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2915 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4035 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5244 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill.16bit/7fb3a035-2b83-4a58-818f-16fe6d9a8ab3.json b/data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill.16bit/7fb3a035-2b83-4a58-818f-16fe6d9a8ab3.json deleted file mode 100644 index 75adf6c43..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill.16bit/7fb3a035-2b83-4a58-818f-16fe6d9a8ab3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Phi4.Turn.R1Distill.16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4.Turn.R1Distill.16bit", - "id": "Quazim0t0/Phi4.Turn.R1Distill.16bit", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6563 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2311 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3902 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5257 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill_v1.5.1-Tensors/87018726-9f81-47b1-883e-609afea7fb37.json b/data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill_v1.5.1-Tensors/87018726-9f81-47b1-883e-609afea7fb37.json deleted file mode 100644 index 5b87ab22e..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill_v1.5.1-Tensors/87018726-9f81-47b1-883e-609afea7fb37.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Phi4.Turn.R1Distill_v1.5.1-Tensors/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4.Turn.R1Distill_v1.5.1-Tensors", - "id": "Quazim0t0/Phi4.Turn.R1Distill_v1.5.1-Tensors", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6456 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3929 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5117 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Quazim0t0/Phi4Basis-14B-sce/292b9333-96c7-4fc7-bf35-78bbce9f10d3.json b/data/hfopenllm_v2/Quazim0t0/Phi4Basis-14B-sce/292b9333-96c7-4fc7-bf35-78bbce9f10d3.json deleted file mode 100644 index 0f7720935..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Phi4Basis-14B-sce/292b9333-96c7-4fc7-bf35-78bbce9f10d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Phi4Basis-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4Basis-14B-sce", - "id": "Quazim0t0/Phi4Basis-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6502 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6909 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4789 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Ponder-14B-linear/b44224c3-ed2c-4120-9e2a-e6286358a4da.json b/data/hfopenllm_v2/Quazim0t0/Ponder-14B-linear/b44224c3-ed2c-4120-9e2a-e6286358a4da.json deleted file mode 100644 index e66c3086b..000000000 --- 
a/data/hfopenllm_v2/Quazim0t0/Ponder-14B-linear/b44224c3-ed2c-4120-9e2a-e6286358a4da.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Ponder-14B-linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ponder-14B-linear", - "id": "Quazim0t0/Ponder-14B-linear", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6906 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6943 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4282 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4558 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/RZA-14B-sce/f7a2c9af-c55c-4307-bfef-1ca709525d82.json b/data/hfopenllm_v2/Quazim0t0/RZA-14B-sce/f7a2c9af-c55c-4307-bfef-1ca709525d82.json deleted file mode 100644 index eedf689a1..000000000 --- a/data/hfopenllm_v2/Quazim0t0/RZA-14B-sce/f7a2c9af-c55c-4307-bfef-1ca709525d82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_RZA-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RZA-14B-sce", - "id": "Quazim0t0/RZA-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4774 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6686 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4113 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Rosemary-14b/d9655f35-edfd-4c53-b359-559870e8019e.json b/data/hfopenllm_v2/Quazim0t0/Rosemary-14b/d9655f35-edfd-4c53-b359-559870e8019e.json deleted file mode 100644 index 38d6ae792..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Rosemary-14b/d9655f35-edfd-4c53-b359-559870e8019e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Rosemary-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rosemary-14b", - "id": "Quazim0t0/Rosemary-14b", - "developer": "Quazim0t0", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6915 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4492 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Rune-14b/afdd962d-652a-4395-92f7-c16dc874a779.json b/data/hfopenllm_v2/Quazim0t0/Rune-14b/afdd962d-652a-4395-92f7-c16dc874a779.json deleted file mode 100644 index b20f38d6e..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Rune-14b/afdd962d-652a-4395-92f7-c16dc874a779.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Rune-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rune-14b", - "id": "Quazim0t0/Rune-14b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7016 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4533 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/SZA-14B-sce/2594e917-3ebd-428b-8f36-cb0da668695d.json b/data/hfopenllm_v2/Quazim0t0/SZA-14B-sce/2594e917-3ebd-428b-8f36-cb0da668695d.json deleted file mode 100644 index b8cc374a1..000000000 --- a/data/hfopenllm_v2/Quazim0t0/SZA-14B-sce/2594e917-3ebd-428b-8f36-cb0da668695d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_SZA-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SZA-14B-sce", - "id": "Quazim0t0/SZA-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5659 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6889 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Sake-20b/91a86644-ad96-4c66-8691-1c0b531b572c.json b/data/hfopenllm_v2/Quazim0t0/Sake-20b/91a86644-ad96-4c66-8691-1c0b531b572c.json deleted file mode 100644 index a8bf797b3..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Sake-20b/91a86644-ad96-4c66-8691-1c0b531b572c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Sake-20b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sake-20b", - "id": "Quazim0t0/Sake-20b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.475 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6693 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4653 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4494 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Spok-14b-sce/331f56ce-5e45-46d8-9143-3f66be20b699.json b/data/hfopenllm_v2/Quazim0t0/Spok-14b-sce/331f56ce-5e45-46d8-9143-3f66be20b699.json deleted file mode 100644 index ee6d0d2f3..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Spok-14b-sce/331f56ce-5e45-46d8-9143-3f66be20b699.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Spok-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Spok-14b-sce", - "id": "Quazim0t0/Spok-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6682 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6899 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5298 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Sumatra-20b/6138ebe0-8483-4cfb-8d95-b334bb09e831.json b/data/hfopenllm_v2/Quazim0t0/Sumatra-20b/6138ebe0-8483-4cfb-8d95-b334bb09e831.json deleted file mode 100644 index e73a1374c..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Sumatra-20b/6138ebe0-8483-4cfb-8d95-b334bb09e831.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Sumatra-20b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sumatra-20b", - "id": "Quazim0t0/Sumatra-20b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.475 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6738 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6855 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/SuperNova14b/4d16dd47-42d1-4ea6-8f1b-dc50648bceab.json b/data/hfopenllm_v2/Quazim0t0/SuperNova14b/4d16dd47-42d1-4ea6-8f1b-dc50648bceab.json deleted file mode 100644 index ae0580d15..000000000 --- a/data/hfopenllm_v2/Quazim0t0/SuperNova14b/4d16dd47-42d1-4ea6-8f1b-dc50648bceab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_SuperNova14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SuperNova14b", - "id": "Quazim0t0/SuperNova14b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7076 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/TB0-8B-sce/a6b0f2bf-08da-472f-b858-8be967a44cdc.json b/data/hfopenllm_v2/Quazim0t0/TB0-8B-sce/a6b0f2bf-08da-472f-b858-8be967a44cdc.json deleted file mode 100644 index 3f9a2e43e..000000000 --- a/data/hfopenllm_v2/Quazim0t0/TB0-8B-sce/a6b0f2bf-08da-472f-b858-8be967a44cdc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_TB0-8B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TB0-8B-sce", - "id": "Quazim0t0/TB0-8B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1511 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4038 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3771 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/TBL-8B-sce/57c7553d-f3e5-4a31-8c16-66aae570d8ec.json b/data/hfopenllm_v2/Quazim0t0/TBL-8B-sce/57c7553d-f3e5-4a31-8c16-66aae570d8ec.json deleted file mode 100644 index dbe71df0d..000000000 --- a/data/hfopenllm_v2/Quazim0t0/TBL-8B-sce/57c7553d-f3e5-4a31-8c16-66aae570d8ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_TBL-8B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TBL-8B-sce", - "id": "Quazim0t0/TBL-8B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.3689 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/ThinkPhi1.1-Tensors/58c31bdd-f86f-4fbb-8549-191bb9f46f02.json b/data/hfopenllm_v2/Quazim0t0/ThinkPhi1.1-Tensors/58c31bdd-f86f-4fbb-8549-191bb9f46f02.json deleted file mode 100644 index 2cd01867f..000000000 --- a/data/hfopenllm_v2/Quazim0t0/ThinkPhi1.1-Tensors/58c31bdd-f86f-4fbb-8549-191bb9f46f02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_ThinkPhi1.1-Tensors/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ThinkPhi1.1-Tensors", - "id": "Quazim0t0/ThinkPhi1.1-Tensors", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3908 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4908 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Venti-20b/dd25c1dd-0edf-44ca-b18c-633dbd47368f.json 
b/data/hfopenllm_v2/Quazim0t0/Venti-20b/dd25c1dd-0edf-44ca-b18c-633dbd47368f.json deleted file mode 100644 index 34f0d31fe..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Venti-20b/dd25c1dd-0edf-44ca-b18c-633dbd47368f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Venti-20b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Venti-20b", - "id": "Quazim0t0/Venti-20b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.475 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6641 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6901 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3391 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.448 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Venti-Blend-sce/2a030613-b5f7-4393-ac39-d2d072c913dc.json b/data/hfopenllm_v2/Quazim0t0/Venti-Blend-sce/2a030613-b5f7-4393-ac39-d2d072c913dc.json deleted file mode 100644 index 2f424d5ab..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Venti-Blend-sce/2a030613-b5f7-4393-ac39-d2d072c913dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/Quazim0t0_Venti-Blend-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Venti-Blend-sce", - "id": "Quazim0t0/Venti-Blend-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.475 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6843 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4056 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4389 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5414 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Vine-14b-sce/f8c73290-c400-4f1f-a00a-516592497b0d.json b/data/hfopenllm_v2/Quazim0t0/Vine-14b-sce/f8c73290-c400-4f1f-a00a-516592497b0d.json deleted file mode 100644 index 947f1d53a..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Vine-14b-sce/f8c73290-c400-4f1f-a00a-516592497b0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Vine-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Vine-14b-sce", - "id": "Quazim0t0/Vine-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6733 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6891 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Wendy-14B/b31908fc-5e7e-45d6-835f-4e86a05b23fb.json b/data/hfopenllm_v2/Quazim0t0/Wendy-14B/b31908fc-5e7e-45d6-835f-4e86a05b23fb.json deleted file mode 100644 index 385c8931d..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Wendy-14B/b31908fc-5e7e-45d6-835f-4e86a05b23fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Wendy-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Wendy-14B", - "id": "Quazim0t0/Wendy-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } 
- }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6772 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6958 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Wu-14b-sce/4320cb98-7f9f-4510-bb88-448ce231bae8.json b/data/hfopenllm_v2/Quazim0t0/Wu-14b-sce/4320cb98-7f9f-4510-bb88-448ce231bae8.json deleted file mode 100644 index 8fbcfb8f2..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Wu-14b-sce/4320cb98-7f9f-4510-bb88-448ce231bae8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Wu-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Wu-14b-sce", - "id": "Quazim0t0/Wu-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6718 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2613 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5293 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/bloom-14b-stock/28b986d1-2e67-4462-9165-6cb8f260b6c6.json b/data/hfopenllm_v2/Quazim0t0/bloom-14b-stock/28b986d1-2e67-4462-9165-6cb8f260b6c6.json deleted file mode 100644 index f549634bc..000000000 --- a/data/hfopenllm_v2/Quazim0t0/bloom-14b-stock/28b986d1-2e67-4462-9165-6cb8f260b6c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_bloom-14b-stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bloom-14b-stock", - "id": "Quazim0t0/bloom-14b-stock", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6878 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4811 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/caramel-14B/fe1e21cb-7934-4022-a74a-777172310021.json b/data/hfopenllm_v2/Quazim0t0/caramel-14B/fe1e21cb-7934-4022-a74a-777172310021.json deleted file mode 100644 index fc8e481dc..000000000 --- a/data/hfopenllm_v2/Quazim0t0/caramel-14B/fe1e21cb-7934-4022-a74a-777172310021.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_caramel-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "caramel-14B", - "id": "Quazim0t0/caramel-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6745 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6919 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4713 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4454 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5436 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/graphite-14b-sce/90871638-b828-484d-8822-95ffceb20909.json b/data/hfopenllm_v2/Quazim0t0/graphite-14b-sce/90871638-b828-484d-8822-95ffceb20909.json deleted file mode 100644 index b45a1e117..000000000 --- a/data/hfopenllm_v2/Quazim0t0/graphite-14b-sce/90871638-b828-484d-8822-95ffceb20909.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_graphite-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "graphite-14b-sce", - "id": "Quazim0t0/graphite-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3217 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6631 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.3006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.528 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/mocha-14B/04a98dfb-8e96-444c-8df4-ed7cf72a26ea.json b/data/hfopenllm_v2/Quazim0t0/mocha-14B/04a98dfb-8e96-444c-8df4-ed7cf72a26ea.json deleted file mode 100644 index c48422312..000000000 --- a/data/hfopenllm_v2/Quazim0t0/mocha-14B/04a98dfb-8e96-444c-8df4-ed7cf72a26ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_mocha-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mocha-14B", - "id": "Quazim0t0/mocha-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6895 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4272 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/mosaic-14b-sce/8c5c22af-f230-4d34-b80d-f42ef27e1675.json b/data/hfopenllm_v2/Quazim0t0/mosaic-14b-sce/8c5c22af-f230-4d34-b80d-f42ef27e1675.json deleted file mode 100644 index e02cc70c3..000000000 --- a/data/hfopenllm_v2/Quazim0t0/mosaic-14b-sce/8c5c22af-f230-4d34-b80d-f42ef27e1675.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_mosaic-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mosaic-14b-sce", - "id": "Quazim0t0/mosaic-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6876 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6907 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4558 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/tesseract-14b-stock/f3466a90-541b-4a08-a9c6-d5a79b2299b0.json b/data/hfopenllm_v2/Quazim0t0/tesseract-14b-stock/f3466a90-541b-4a08-a9c6-d5a79b2299b0.json deleted file mode 100644 index f5ff78824..000000000 --- a/data/hfopenllm_v2/Quazim0t0/tesseract-14b-stock/f3466a90-541b-4a08-a9c6-d5a79b2299b0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_tesseract-14b-stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tesseract-14b-stock", - "id": "Quazim0t0/tesseract-14b-stock", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5848 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/time-14b-stock/ef9ee5ae-d92b-4143-af1b-d62a7c3c7fd4.json b/data/hfopenllm_v2/Quazim0t0/time-14b-stock/ef9ee5ae-d92b-4143-af1b-d62a7c3c7fd4.json deleted file mode 100644 index 763d06ba9..000000000 --- a/data/hfopenllm_v2/Quazim0t0/time-14b-stock/ef9ee5ae-d92b-4143-af1b-d62a7c3c7fd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_time-14b-stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "time-14b-stock", - "id": "Quazim0t0/time-14b-stock", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6699 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6897 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/QwQ-32B-Preview/859af708-ac37-4749-bc06-73d92338d1f5.json b/data/hfopenllm_v2/Qwen/QwQ-32B-Preview/859af708-ac37-4749-bc06-73d92338d1f5.json deleted file mode 100644 index 9072fe047..000000000 --- a/data/hfopenllm_v2/Qwen/QwQ-32B-Preview/859af708-ac37-4749-bc06-73d92338d1f5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_QwQ-32B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-32B-Preview", - "id": "Qwen/QwQ-32B-Preview", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4035 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6691 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4494 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5678 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/QwQ-32B/e274380d-e0f7-47c3-afc3-e603e6cecf9e.json b/data/hfopenllm_v2/Qwen/QwQ-32B/e274380d-e0f7-47c3-afc3-e603e6cecf9e.json deleted file mode 100644 index 
8a460b8de..000000000 --- a/data/hfopenllm_v2/Qwen/QwQ-32B/e274380d-e0f7-47c3-afc3-e603e6cecf9e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_QwQ-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-32B", - "id": "Qwen/QwQ-32B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4206 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1196 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-0.5B-Chat/19810be8-ea81-4db5-9854-1830b05a5732.json b/data/hfopenllm_v2/Qwen/Qwen1.5-0.5B-Chat/19810be8-ea81-4db5-9854-1830b05a5732.json deleted file mode 100644 index d12f434fb..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-0.5B-Chat/19810be8-ea81-4db5-9854-1830b05a5732.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-0.5B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-0.5B-Chat", - "id": "Qwen/Qwen1.5-0.5B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.62 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1807 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3837 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1213 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-0.5B/1258c282-3672-4b42-9d4d-117568e17bf5.json b/data/hfopenllm_v2/Qwen/Qwen1.5-0.5B/1258c282-3672-4b42-9d4d-117568e17bf5.json deleted file mode 100644 index b3a7d8e15..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-0.5B/1258c282-3672-4b42-9d4d-117568e17bf5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-0.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-0.5B", - "id": "Qwen/Qwen1.5-0.5B", - "developer": "Qwen", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.62 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1706 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-1.8B-Chat/9b9f6e01-238e-4893-b398-4e1c83c44dfa.json b/data/hfopenllm_v2/Qwen/Qwen1.5-1.8B-Chat/9b9f6e01-238e-4893-b398-4e1c83c44dfa.json deleted file mode 100644 index 5d7e32fc8..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-1.8B-Chat/9b9f6e01-238e-4893-b398-4e1c83c44dfa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-1.8B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-1.8B-Chat", - "id": "Qwen/Qwen1.5-1.8B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.837 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2019 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1804 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-1.8B/b267621b-dbba-4c4a-bb9f-fa85734d0f59.json b/data/hfopenllm_v2/Qwen/Qwen1.5-1.8B/b267621b-dbba-4c4a-bb9f-fa85734d0f59.json deleted file mode 100644 index c4b4b7c05..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-1.8B/b267621b-dbba-4c4a-bb9f-fa85734d0f59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-1.8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-1.8B", - "id": "Qwen/Qwen1.5-1.8B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.837 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2154 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3476 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3605 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1882 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-110B-Chat/a7e4e787-8e95-48a0-9d50-53ba9f05cd1c.json b/data/hfopenllm_v2/Qwen/Qwen1.5-110B-Chat/a7e4e787-8e95-48a0-9d50-53ba9f05cd1c.json deleted file mode 100644 index 1539af866..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-110B-Chat/a7e4e787-8e95-48a0-9d50-53ba9f05cd1c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-110B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-110B-Chat", - "id": "Qwen/Qwen1.5-110B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 111.21 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5939 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6184 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2341 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4825 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-110B/3d39dcab-55df-4ad3-bdc8-03ae684e4390.json b/data/hfopenllm_v2/Qwen/Qwen1.5-110B/3d39dcab-55df-4ad3-bdc8-03ae684e4390.json deleted file mode 100644 index 3d0d4bcdf..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-110B/3d39dcab-55df-4ad3-bdc8-03ae684e4390.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-110B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-110B", - "id": "Qwen/Qwen1.5-110B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 111.21 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.247 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5361 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-14B-Chat/1b499881-9edb-4626-a919-977393d6bef1.json b/data/hfopenllm_v2/Qwen/Qwen1.5-14B-Chat/1b499881-9edb-4626-a919-977393d6bef1.json deleted file mode 100644 index c47867900..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-14B-Chat/1b499881-9edb-4626-a919-977393d6bef1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-14B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-14B-Chat", - "id": "Qwen/Qwen1.5-14B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.167 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5229 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1526 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-14B/84b8970c-6c29-4ee1-93b8-c97e4a7c4950.json b/data/hfopenllm_v2/Qwen/Qwen1.5-14B/84b8970c-6c29-4ee1-93b8-c97e4a7c4950.json deleted file mode 100644 index d12634623..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-14B/84b8970c-6c29-4ee1-93b8-c97e4a7c4950.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-14B", - "id": "Qwen/Qwen1.5-14B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.167 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-32B-Chat/2e070663-2622-4a8e-bd39-7f0ef9df399e.json b/data/hfopenllm_v2/Qwen/Qwen1.5-32B-Chat/2e070663-2622-4a8e-bd39-7f0ef9df399e.json deleted file mode 100644 index 695a36dcd..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-32B-Chat/2e070663-2622-4a8e-bd39-7f0ef9df399e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-32B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-32B-Chat", - "id": "Qwen/Qwen1.5-32B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.512 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6067 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1956 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4457 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-32B/047fa91e-2dc7-4881-8254-3dfbd4a2ff1b.json b/data/hfopenllm_v2/Qwen/Qwen1.5-32B/047fa91e-2dc7-4881-8254-3dfbd4a2ff1b.json deleted file mode 100644 index ff8b2ab29..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-32B/047fa91e-2dc7-4881-8254-3dfbd4a2ff1b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-32B", - "id": "Qwen/Qwen1.5-32B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.512 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5715 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4278 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45 - } - } - ] -} \ No newline at end of file 
diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-4B-Chat/6d73016e-078e-4ffe-b2ae-5b829d1456df.json b/data/hfopenllm_v2/Qwen/Qwen1.5-4B-Chat/6d73016e-078e-4ffe-b2ae-5b829d1456df.json deleted file mode 100644 index 3d23aaf52..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-4B-Chat/6d73016e-078e-4ffe-b2ae-5b829d1456df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-4B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-4B-Chat", - "id": "Qwen/Qwen1.5-4B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.95 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4006 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-4B/0b68b5bd-d22c-4194-9ddf-f22e9181f84d.json b/data/hfopenllm_v2/Qwen/Qwen1.5-4B/0b68b5bd-d22c-4194-9ddf-f22e9181f84d.json deleted file mode 100644 index b317e8f1c..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-4B/0b68b5bd-d22c-4194-9ddf-f22e9181f84d.json 
+++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-4B", - "id": "Qwen/Qwen1.5-4B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.95 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.246 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-7B-Chat/03d51d90-fd15-42b7-ad5f-c7326cc642a7.json b/data/hfopenllm_v2/Qwen/Qwen1.5-7B-Chat/03d51d90-fd15-42b7-ad5f-c7326cc642a7.json deleted file mode 100644 index 7865c5891..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-7B-Chat/03d51d90-fd15-42b7-ad5f-c7326cc642a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-7B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-7B-Chat", - "id": "Qwen/Qwen1.5-7B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.721 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2951 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-7B/d3e5c939-c53a-49d6-80cd-34420dbb176a.json b/data/hfopenllm_v2/Qwen/Qwen1.5-7B/d3e5c939-c53a-49d6-80cd-34420dbb176a.json deleted file mode 100644 index 29ccd9046..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-7B/d3e5c939-c53a-49d6-80cd-34420dbb176a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-7B", - "id": "Qwen/Qwen1.5-7B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.721 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2684 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2916 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B-Chat/ab321358-26f9-4577-a5fb-1f5d4b8784b4.json b/data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B-Chat/ab321358-26f9-4577-a5fb-1f5d4b8784b4.json deleted file mode 100644 index c414d02d4..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B-Chat/ab321358-26f9-4577-a5fb-1f5d4b8784b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-MoE-A2.7B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-MoE-A2.7B-Chat", - "id": "Qwen/Qwen1.5-MoE-A2.7B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 14.316 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4272 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3899 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B/a43aae68-f12c-4a6d-b846-c498cf35f6cd.json b/data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B/a43aae68-f12c-4a6d-b846-c498cf35f6cd.json deleted file mode 100644 index 07658bbfa..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B/a43aae68-f12c-4a6d-b846-c498cf35f6cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-MoE-A2.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-MoE-A2.7B", - "id": "Qwen/Qwen1.5-MoE-A2.7B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 14.316 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.266 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4013 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2778 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-0.5B-Instruct/b84615c0-43c4-49ec-83fe-5d3f8e6026af.json b/data/hfopenllm_v2/Qwen/Qwen2-0.5B-Instruct/b84615c0-43c4-49ec-83fe-5d3f8e6026af.json deleted file mode 100644 index d6e75e508..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-0.5B-Instruct/b84615c0-43c4-49ec-83fe-5d3f8e6026af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-0.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-0.5B-Instruct", - "id": "Qwen/Qwen2-0.5B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2247 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3173 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1531 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-0.5B/7e687d24-9e12-4ecf-b283-e222efb9473a.json b/data/hfopenllm_v2/Qwen/Qwen2-0.5B/7e687d24-9e12-4ecf-b283-e222efb9473a.json deleted file mode 100644 index 164002250..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-0.5B/7e687d24-9e12-4ecf-b283-e222efb9473a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-0.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-0.5B", - "id": "Qwen/Qwen2-0.5B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3239 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-1.5B-Instruct/4aea143c-28fd-48bb-b911-37ac3fe58220.json b/data/hfopenllm_v2/Qwen/Qwen2-1.5B-Instruct/4aea143c-28fd-48bb-b911-37ac3fe58220.json deleted file mode 100644 index 29c2d61f9..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-1.5B-Instruct/4aea143c-28fd-48bb-b911-37ac3fe58220.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-1.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-1.5B-Instruct", - "id": "Qwen/Qwen2-1.5B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2501 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-1.5B/34a8daec-bfff-4cf4-9011-0542b30c1d10.json b/data/hfopenllm_v2/Qwen/Qwen2-1.5B/34a8daec-bfff-4cf4-9011-0542b30c1d10.json deleted file mode 100644 index 290263529..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-1.5B/34a8daec-bfff-4cf4-9011-0542b30c1d10.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-1.5B", - "id": "Qwen/Qwen2-1.5B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2552 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-57B-A14B-Instruct/3e919d7b-53db-41fb-ac93-224e2768b9c6.json b/data/hfopenllm_v2/Qwen/Qwen2-57B-A14B-Instruct/3e919d7b-53db-41fb-ac93-224e2768b9c6.json deleted file mode 100644 index c5c5b3781..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-57B-A14B-Instruct/3e919d7b-53db-41fb-ac93-224e2768b9c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-57B-A14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-57B-A14B-Instruct", - "id": "Qwen/Qwen2-57B-A14B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 57.409 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6338 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5888 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": 
{ - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4575 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-57B-A14B/66becca1-d92b-409f-ab56-44d05cac66fd.json b/data/hfopenllm_v2/Qwen/Qwen2-57B-A14B/66becca1-d92b-409f-ab56-44d05cac66fd.json deleted file mode 100644 index b8c0fe259..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-57B-A14B/66becca1-d92b-409f-ab56-44d05cac66fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-57B-A14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-57B-A14B", - "id": "Qwen/Qwen2-57B-A14B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 57.409 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5618 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1866 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4916 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-72B-Instruct/6293b269-7c4c-44da-bd85-e51954c173a1.json b/data/hfopenllm_v2/Qwen/Qwen2-72B-Instruct/6293b269-7c4c-44da-bd85-e51954c173a1.json deleted file mode 100644 index f6edd8027..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-72B-Instruct/6293b269-7c4c-44da-bd85-e51954c173a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-72B-Instruct", - "id": "Qwen/Qwen2-72B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7989 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6977 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4177 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5403 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-72B/add3b058-e7bc-4b7b-bb98-0d7039979072.json b/data/hfopenllm_v2/Qwen/Qwen2-72B/add3b058-e7bc-4b7b-bb98-0d7039979072.json deleted file mode 100644 index fbfd29d40..000000000 --- 
a/data/hfopenllm_v2/Qwen/Qwen2-72B/add3b058-e7bc-4b7b-bb98-0d7039979072.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-72B", - "id": "Qwen/Qwen2-72B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3824 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6617 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4704 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5731 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-7B-Instruct/db0b6b3f-e5a9-4367-ab87-e58d5c6ccd81.json b/data/hfopenllm_v2/Qwen/Qwen2-7B-Instruct/db0b6b3f-e5a9-4367-ab87-e58d5c6ccd81.json deleted file mode 100644 index 14d4c60a1..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-7B-Instruct/db0b6b3f-e5a9-4367-ab87-e58d5c6ccd81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open 
LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-7B-Instruct", - "id": "Qwen/Qwen2-7B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5545 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3847 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-7B/54b055d0-80ae-4bba-b729-bd77b3ec7502.json b/data/hfopenllm_v2/Qwen/Qwen2-7B/54b055d0-80ae-4bba-b729-bd77b3ec7502.json deleted file mode 100644 index e3631e1e9..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-7B/54b055d0-80ae-4bba-b729-bd77b3ec7502.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-7B", - "id": "Qwen/Qwen2-7B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3149 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2039 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4439 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4183 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-Math-72B-Instruct/5c22d0b3-5082-4c6e-865c-71da03cf9378.json b/data/hfopenllm_v2/Qwen/Qwen2-Math-72B-Instruct/5c22d0b3-5082-4c6e-865c-71da03cf9378.json deleted file mode 100644 index a24afa6b4..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-Math-72B-Instruct/5c22d0b3-5082-4c6e-865c-71da03cf9378.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-Math-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-Math-72B-Instruct", - "id": "Qwen/Qwen2-Math-72B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-Math-7B/f8e5ee9f-519d-4ed8-bd2a-88897075f401.json b/data/hfopenllm_v2/Qwen/Qwen2-Math-7B/f8e5ee9f-519d-4ed8-bd2a-88897075f401.json deleted file mode 100644 index 93ad6f156..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-Math-7B/f8e5ee9f-519d-4ed8-bd2a-88897075f401.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-Math-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-Math-7B", - "id": "Qwen/Qwen2-Math-7B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2687 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-VL-72B-Instruct/b74c3215-7bd5-42d1-9193-f4c9c6a8bec2.json b/data/hfopenllm_v2/Qwen/Qwen2-VL-72B-Instruct/b74c3215-7bd5-42d1-9193-f4c9c6a8bec2.json deleted file mode 100644 index 30ca6ebbd..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-VL-72B-Instruct/b74c3215-7bd5-42d1-9193-f4c9c6a8bec2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-VL-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-VL-72B-Instruct", - "id": "Qwen/Qwen2-VL-72B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2VLForConditionalGeneration", - "params_billions": 73.406 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5982 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.6946 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4492 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5717 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-VL-7B-Instruct/27df1e06-463b-4519-87eb-a1666ad3f98c.json b/data/hfopenllm_v2/Qwen/Qwen2-VL-7B-Instruct/27df1e06-463b-4519-87eb-a1666ad3f98c.json deleted file mode 100644 index 276403439..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-VL-7B-Instruct/27df1e06-463b-4519-87eb-a1666ad3f98c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-VL-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-VL-7B-Instruct", - "id": "Qwen/Qwen2-VL-7B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2VLForConditionalGeneration", - "params_billions": 8.291 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1986 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9d975b05-7bee-462d-a33a-afa0d5af94d4.json b/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9d975b05-7bee-462d-a33a-afa0d5af94d4.json deleted file mode 100644 index d78bde574..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9d975b05-7bee-462d-a33a-afa0d5af94d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-0.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-Instruct", - "id": "Qwen/Qwen2.5-0.5B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3153 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9ef9135a-473e-43a5-a460-fd3ec50226f9.json b/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9ef9135a-473e-43a5-a460-fd3ec50226f9.json deleted file mode 100644 index 38a9f628f..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9ef9135a-473e-43a5-a460-fd3ec50226f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-0.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-Instruct", - "id": "Qwen/Qwen2.5-0.5B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3071 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3329 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B/c57cae01-328e-447b-8945-e3cd2c4b8a7b.json b/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B/c57cae01-328e-447b-8945-e3cd2c4b8a7b.json deleted file mode 100644 index 7aa39af3d..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B/c57cae01-328e-447b-8945-e3cd2c4b8a7b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-0.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B", - "id": "Qwen/Qwen2.5-0.5B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1627 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.3433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1906 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-1.5B-Instruct/494c86cf-7f37-49d8-8160-b81859552c87.json b/data/hfopenllm_v2/Qwen/Qwen2.5-1.5B-Instruct/494c86cf-7f37-49d8-8160-b81859552c87.json deleted file mode 100644 index 6d0e76598..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-1.5B-Instruct/494c86cf-7f37-49d8-8160-b81859552c87.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-1.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1.5B-Instruct", - "id": "Qwen/Qwen2.5-1.5B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2799 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-1.5B/6de5e76e-4297-4bcd-b06e-f63fa28da0e0.json b/data/hfopenllm_v2/Qwen/Qwen2.5-1.5B/6de5e76e-4297-4bcd-b06e-f63fa28da0e0.json deleted file mode 100644 index 86834fae0..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-1.5B/6de5e76e-4297-4bcd-b06e-f63fa28da0e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1.5B", - "id": "Qwen/Qwen2.5-1.5B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2674 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4078 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3576 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2855 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct-1M/9b10cd14-82f3-4b36-a4be-5092127d68c3.json 
b/data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct-1M/9b10cd14-82f3-4b36-a4be-5092127d68c3.json deleted file mode 100644 index 7ccfcf4c1..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct-1M/9b10cd14-82f3-4b36-a4be-5092127d68c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-14B-Instruct-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Instruct-1M", - "id": "Qwen/Qwen2.5-14B-Instruct-1M", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8414 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.485 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct/bbd94181-0523-4543-80a7-056b041e03b7.json b/data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct/bbd94181-0523-4543-80a7-056b041e03b7.json deleted file mode 100644 index ffc120545..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct/bbd94181-0523-4543-80a7-056b041e03b7.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Instruct", - "id": "Qwen/Qwen2.5-14B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8158 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.639 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4904 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-14B/e10d8573-e201-460e-a931-49a1b13ceeea.json b/data/hfopenllm_v2/Qwen/Qwen2.5-14B/e10d8573-e201-460e-a931-49a1b13ceeea.json deleted file mode 100644 index 20c1af18c..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-14B/e10d8573-e201-460e-a931-49a1b13ceeea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B", - "id": "Qwen/Qwen2.5-14B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6161 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5249 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-32B-Instruct/e2ca9477-2414-4b8a-8d22-68f9ced54ae5.json b/data/hfopenllm_v2/Qwen/Qwen2.5-32B-Instruct/e2ca9477-2414-4b8a-8d22-68f9ced54ae5.json deleted file mode 100644 index d6d218cf5..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-32B-Instruct/e2ca9477-2414-4b8a-8d22-68f9ced54ae5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-32B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-32B-Instruct", - "id": "Qwen/Qwen2.5-32B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - 
"params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8346 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6913 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6254 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4261 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5667 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-32B/831246b8-5433-48e6-ba11-8a4239373106.json b/data/hfopenllm_v2/Qwen/Qwen2.5-32B/831246b8-5433-48e6-ba11-8a4239373106.json deleted file mode 100644 index ba1f6cfae..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-32B/831246b8-5433-48e6-ba11-8a4239373106.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-32B", - "id": "Qwen/Qwen2.5-32B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6771 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5805 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-3B-Instruct/8277994c-8bf5-4ece-9f34-4fe9a4310bbf.json b/data/hfopenllm_v2/Qwen/Qwen2.5-3B-Instruct/8277994c-8bf5-4ece-9f34-4fe9a4310bbf.json deleted file mode 100644 index a44b6ae5b..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-3B-Instruct/8277994c-8bf5-4ece-9f34-4fe9a4310bbf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-Instruct", - "id": "Qwen/Qwen2.5-3B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4693 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3678 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-3B/5aabc7c5-eb3a-42e0-8b40-0a08004f6e1a.json b/data/hfopenllm_v2/Qwen/Qwen2.5-3B/5aabc7c5-eb3a-42e0-8b40-0a08004f6e1a.json deleted file mode 100644 index 229075fd2..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-3B/5aabc7c5-eb3a-42e0-8b40-0a08004f6e1a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B", - "id": "Qwen/Qwen2.5-3B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.269 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4612 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3203 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-72B-Instruct/cbb73c83-ad94-4973-9bf5-a5e7ca4d1653.json b/data/hfopenllm_v2/Qwen/Qwen2.5-72B-Instruct/cbb73c83-ad94-4973-9bf5-a5e7ca4d1653.json deleted file mode 100644 index 9fcc6ecb3..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-72B-Instruct/cbb73c83-ad94-4973-9bf5-a5e7ca4d1653.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-72B-Instruct", - "id": "Qwen/Qwen2.5-72B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8638 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7273 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5982 - } - }, 
- { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4206 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5626 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-72B/3ed06a16-d5fe-43d3-a369-f4ed29fb3a5d.json b/data/hfopenllm_v2/Qwen/Qwen2.5-72B/3ed06a16-d5fe-43d3-a369-f4ed29fb3a5d.json deleted file mode 100644 index af273092b..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-72B/3ed06a16-d5fe-43d3-a369-f4ed29fb3a5d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-72B", - "id": "Qwen/Qwen2.5-72B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4137 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6797 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3912 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4771 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5968 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct-1M/fc817789-2f44-4d2b-b40e-2422fe33d104.json b/data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct-1M/fc817789-2f44-4d2b-b40e-2422fe33d104.json deleted file mode 100644 index 776013deb..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct-1M/fc817789-2f44-4d2b-b40e-2422fe33d104.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-7B-Instruct-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-1M", - "id": "Qwen/Qwen2.5-7B-Instruct-1M", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7448 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5404 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4335 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct/5e1c8723-7c43-4d8f-8c7c-386c2eb6b9cf.json b/data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct/5e1c8723-7c43-4d8f-8c7c-386c2eb6b9cf.json deleted file mode 100644 index 66802b86b..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct/5e1c8723-7c43-4d8f-8c7c-386c2eb6b9cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct", - "id": "Qwen/Qwen2.5-7B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5394 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-7B/b6740747-19ac-4a9c-892f-6556013ddc8b.json b/data/hfopenllm_v2/Qwen/Qwen2.5-7B/b6740747-19ac-4a9c-892f-6556013ddc8b.json deleted file mode 100644 index 744db06b8..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-7B/b6740747-19ac-4a9c-892f-6556013ddc8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B", - "id": "Qwen/Qwen2.5-7B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3374 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4365 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B-Instruct/3263ab46-09ae-4c24-9332-b6874d0d0330.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B-Instruct/3263ab46-09ae-4c24-9332-b6874d0d0330.json deleted file mode 100644 index efdce1948..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B-Instruct/3263ab46-09ae-4c24-9332-b6874d0d0330.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Coder-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-14B-Instruct", - "id": "Qwen/Qwen2.5-Coder-14B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6908 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3915 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B/a8706a7e-5693-4768-a955-a448549d2e77.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B/a8706a7e-5693-4768-a955-a448549d2e77.json deleted file mode 100644 index b59e9c9ed..000000000 --- 
a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B/a8706a7e-5693-4768-a955-a448549d2e77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Coder-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-14B", - "id": "Qwen/Qwen2.5-Coder-14B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2251 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4521 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B-Instruct/3c932329-0440-4799-886f-10bc4a5aeb09.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B-Instruct/3c932329-0440-4799-886f-10bc4a5aeb09.json deleted file mode 100644 index 61b3a04a0..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B-Instruct/3c932329-0440-4799-886f-10bc4a5aeb09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Coder-32B-Instruct/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-32B-Instruct", - "id": "Qwen/Qwen2.5-Coder-32B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7265 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6625 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B/b1e42d9d-827d-4109-8d1b-182694033b21.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B/b1e42d9d-827d-4109-8d1b-182694033b21.json deleted file mode 100644 index 636fd2224..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B/b1e42d9d-827d-4109-8d1b-182694033b21.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Coder-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-32B", - "id": 
"Qwen/Qwen2.5-Coder-32B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6404 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5303 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/0c6f0d92-3ee0-48d7-b3fc-70149911a51d.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/0c6f0d92-3ee0-48d7-b3fc-70149911a51d.json deleted file mode 100644 index 00f370a8a..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/0c6f0d92-3ee0-48d7-b3fc-70149911a51d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Coder-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-7B-Instruct", - "id": "Qwen/Qwen2.5-Coder-7B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6147 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4999 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4099 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/73b07681-8e10-414e-8922-650908f9cf6a.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/73b07681-8e10-414e-8922-650908f9cf6a.json deleted file mode 100644 index f33929085..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/73b07681-8e10-414e-8922-650908f9cf6a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Coder-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-7B-Instruct", - "id": "Qwen/Qwen2.5-Coder-7B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B/8b1549f8-0602-4538-842c-abe9dca7baff.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B/8b1549f8-0602-4538-842c-abe9dca7baff.json deleted file mode 100644 index db8e13ac7..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B/8b1549f8-0602-4538-842c-abe9dca7baff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Coder-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-7B", - "id": "Qwen/Qwen2.5-Coder-7B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3446 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4856 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-1.5B-Instruct/ad395ad4-0f9f-4b49-83c9-b89fa6b6dd89.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Math-1.5B-Instruct/ad395ad4-0f9f-4b49-83c9-b89fa6b6dd89.json deleted file mode 100644 index 376ff5457..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-1.5B-Instruct/ad395ad4-0f9f-4b49-83c9-b89fa6b6dd89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Math-1.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-1.5B-Instruct", - "id": "Qwen/Qwen2.5-Math-1.5B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "MATH Level 5", 
- "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2628 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3685 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1801 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-72B-Instruct/14c01681-fbef-49c4-b737-a7baaa02d393.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Math-72B-Instruct/14c01681-fbef-49c4-b737-a7baaa02d393.json deleted file mode 100644 index 0366bc367..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-72B-Instruct/14c01681-fbef-49c4-b737-a7baaa02d393.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Math-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-72B-Instruct", - "id": "Qwen/Qwen2.5-Math-72B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B-Instruct/3ad495c0-da8e-4776-8d05-bc7dce1fe120.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B-Instruct/3ad495c0-da8e-4776-8d05-bc7dce1fe120.json deleted file mode 100644 index c41423d24..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B-Instruct/3ad495c0-da8e-4776-8d05-bc7dce1fe120.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Math-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-7B-Instruct", - "id": "Qwen/Qwen2.5-Math-7B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.282 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B/0762ca9e-f0d4-408e-9992-e91a10e0e65f.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B/0762ca9e-f0d4-408e-9992-e91a10e0e65f.json deleted file mode 100644 index e97af5279..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B/0762ca9e-f0d4-408e-9992-e91a10e0e65f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Math-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-7B", - "id": "Qwen/Qwen2.5-Math-7B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.246 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4455 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": 
{ - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/RDson/WomboCombo-R1-Coder-14B-Preview/ec6c1d05-cea7-445c-bed3-9eee1e1ff03d.json b/data/hfopenllm_v2/RDson/WomboCombo-R1-Coder-14B-Preview/ec6c1d05-cea7-445c-bed3-9eee1e1ff03d.json deleted file mode 100644 index 8ed8e2c46..000000000 --- a/data/hfopenllm_v2/RDson/WomboCombo-R1-Coder-14B-Preview/ec6c1d05-cea7-445c-bed3-9eee1e1ff03d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RDson_WomboCombo-R1-Coder-14B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WomboCombo-R1-Coder-14B-Preview", - "id": "RDson/WomboCombo-R1-Coder-14B-Preview", - "developer": "RDson", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6286 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4844 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/RESMPDEV/EVA-Qwen2.5-1.5B-FRFR/1fc39812-77fb-4d0c-b9fb-706e94c40afe.json b/data/hfopenllm_v2/RESMPDEV/EVA-Qwen2.5-1.5B-FRFR/1fc39812-77fb-4d0c-b9fb-706e94c40afe.json deleted file mode 100644 index 44c5b7786..000000000 --- a/data/hfopenllm_v2/RESMPDEV/EVA-Qwen2.5-1.5B-FRFR/1fc39812-77fb-4d0c-b9fb-706e94c40afe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RESMPDEV_EVA-Qwen2.5-1.5B-FRFR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EVA-Qwen2.5-1.5B-FRFR", - "id": "RESMPDEV/EVA-Qwen2.5-1.5B-FRFR", - "developer": "RESMPDEV", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3932 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3539 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.277 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/RESMPDEV/Qwen2-Wukong-0.5B/fdc3c502-53ad-4bf7-85ce-51eaed72754b.json b/data/hfopenllm_v2/RESMPDEV/Qwen2-Wukong-0.5B/fdc3c502-53ad-4bf7-85ce-51eaed72754b.json deleted file mode 100644 index 7c90e36d8..000000000 --- a/data/hfopenllm_v2/RESMPDEV/Qwen2-Wukong-0.5B/fdc3c502-53ad-4bf7-85ce-51eaed72754b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RESMPDEV_Qwen2-Wukong-0.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-Wukong-0.5B", - "id": "RESMPDEV/Qwen2-Wukong-0.5B", - "developer": "RESMPDEV", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1854 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3085 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3525 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1327 - } - } - ] -} \ 
No newline at end of file diff --git a/data/hfopenllm_v2/RLHFlow/ArmoRM-Llama3-8B-v0.1/3f74c1c7-f349-4193-95cf-b0033112fea0.json b/data/hfopenllm_v2/RLHFlow/ArmoRM-Llama3-8B-v0.1/3f74c1c7-f349-4193-95cf-b0033112fea0.json deleted file mode 100644 index 0705593b8..000000000 --- a/data/hfopenllm_v2/RLHFlow/ArmoRM-Llama3-8B-v0.1/3f74c1c7-f349-4193-95cf-b0033112fea0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RLHFlow_ArmoRM-Llama3-8B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ArmoRM-Llama3-8B-v0.1", - "id": "RLHFlow/ArmoRM-Llama3-8B-v0.1", - "developer": "RLHFlow", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForRewardModelWithGating", - "params_billions": 7.511 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1897 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2876 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3948 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1078 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/RLHFlow/LLaMA3-iterative-DPO-final/36a803da-83ab-4c49-8855-9344aaa7a68b.json b/data/hfopenllm_v2/RLHFlow/LLaMA3-iterative-DPO-final/36a803da-83ab-4c49-8855-9344aaa7a68b.json 
deleted file mode 100644 index 49dd63ad0..000000000 --- a/data/hfopenllm_v2/RLHFlow/LLaMA3-iterative-DPO-final/36a803da-83ab-4c49-8855-9344aaa7a68b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RLHFlow_LLaMA3-iterative-DPO-final/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA3-iterative-DPO-final", - "id": "RLHFlow/LLaMA3-iterative-DPO-final", - "developer": "RLHFlow", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5058 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0884 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3257 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/RWKV/rwkv-raven-14b/df986996-249e-49f9-b074-91e8dcdf62e2.json b/data/hfopenllm_v2/RWKV/rwkv-raven-14b/df986996-249e-49f9-b074-91e8dcdf62e2.json deleted file mode 100644 index 608ee4b4e..000000000 --- a/data/hfopenllm_v2/RWKV/rwkv-raven-14b/df986996-249e-49f9-b074-91e8dcdf62e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/RWKV_rwkv-raven-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "rwkv-raven-14b", - "id": "RWKV/rwkv-raven-14b", - "developer": "RWKV", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "RwkvForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3307 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Rakuten/RakutenAI-2.0-mini-instruct/90f007e9-e323-4a82-b276-ac1b928030ca.json b/data/hfopenllm_v2/Rakuten/RakutenAI-2.0-mini-instruct/90f007e9-e323-4a82-b276-ac1b928030ca.json deleted file mode 100644 index 559b245ad..000000000 --- a/data/hfopenllm_v2/Rakuten/RakutenAI-2.0-mini-instruct/90f007e9-e323-4a82-b276-ac1b928030ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Rakuten_RakutenAI-2.0-mini-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RakutenAI-2.0-mini-instruct", - "id": "Rakuten/RakutenAI-2.0-mini-instruct", - "developer": "Rakuten", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 1.535 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6794 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2867 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Rakuten/RakutenAI-7B-chat/2b627f93-5cc7-4a5e-b682-d129396362e5.json b/data/hfopenllm_v2/Rakuten/RakutenAI-7B-chat/2b627f93-5cc7-4a5e-b682-d129396362e5.json deleted file mode 100644 index 29a9ba71c..000000000 --- a/data/hfopenllm_v2/Rakuten/RakutenAI-7B-chat/2b627f93-5cc7-4a5e-b682-d129396362e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Rakuten_RakutenAI-7B-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RakutenAI-7B-chat", - "id": "Rakuten/RakutenAI-7B-chat", - "developer": "Rakuten", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - 
"architecture": "MistralForCausalLM", - "params_billions": 7.373 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2686 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Rakuten/RakutenAI-7B/2fde07ac-d218-4cc6-947e-8ceb87eedbee.json b/data/hfopenllm_v2/Rakuten/RakutenAI-7B/2fde07ac-d218-4cc6-947e-8ceb87eedbee.json deleted file mode 100644 index 7c3ef32cc..000000000 --- a/data/hfopenllm_v2/Rakuten/RakutenAI-7B/2fde07ac-d218-4cc6-947e-8ceb87eedbee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Rakuten_RakutenAI-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RakutenAI-7B", - "id": "Rakuten/RakutenAI-7B", - "developer": "Rakuten", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.373 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2877 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/L3-Pneuma-8B/2a141bfe-4632-4058-a232-1f2c5540c41f.json b/data/hfopenllm_v2/Replete-AI/L3-Pneuma-8B/2a141bfe-4632-4058-a232-1f2c5540c41f.json deleted file mode 100644 index 4c2435507..000000000 --- a/data/hfopenllm_v2/Replete-AI/L3-Pneuma-8B/2a141bfe-4632-4058-a232-1f2c5540c41f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_L3-Pneuma-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Pneuma-8B", - "id": "Replete-AI/L3-Pneuma-8B", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2413 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4909 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3176 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/L3.1-Pneuma-8B/fa2d74a5-e8f6-4a1c-9310-a9b16c2e59d1.json b/data/hfopenllm_v2/Replete-AI/L3.1-Pneuma-8B/fa2d74a5-e8f6-4a1c-9310-a9b16c2e59d1.json deleted file mode 100644 index dc4c32906..000000000 --- a/data/hfopenllm_v2/Replete-AI/L3.1-Pneuma-8B/fa2d74a5-e8f6-4a1c-9310-a9b16c2e59d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_L3.1-Pneuma-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Pneuma-8B", - "id": "Replete-AI/L3.1-Pneuma-8B", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7076 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.505 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3691 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Llama3-8B-Instruct-Replete-Adapted/c7c0ceff-9273-4cc3-8f8e-bd93181590ba.json b/data/hfopenllm_v2/Replete-AI/Llama3-8B-Instruct-Replete-Adapted/c7c0ceff-9273-4cc3-8f8e-bd93181590ba.json deleted file mode 100644 index 1449f9a9c..000000000 --- a/data/hfopenllm_v2/Replete-AI/Llama3-8B-Instruct-Replete-Adapted/c7c0ceff-9273-4cc3-8f8e-bd93181590ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Llama3-8B-Instruct-Replete-Adapted/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-8B-Instruct-Replete-Adapted", - "id": "Replete-AI/Llama3-8B-Instruct-Replete-Adapted", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6915 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.487 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3634 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Replete-Coder-Instruct-8b-Merged/c439478a-1734-4038-aa8b-bb2d12ec022d.json b/data/hfopenllm_v2/Replete-AI/Replete-Coder-Instruct-8b-Merged/c439478a-1734-4038-aa8b-bb2d12ec022d.json deleted file mode 100644 index f5afd447c..000000000 --- a/data/hfopenllm_v2/Replete-AI/Replete-Coder-Instruct-8b-Merged/c439478a-1734-4038-aa8b-bb2d12ec022d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Replete-Coder-Instruct-8b-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-Coder-Instruct-8b-Merged", - "id": "Replete-AI/Replete-Coder-Instruct-8b-Merged", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Replete-Coder-Llama3-8B/4a36f73a-9495-4ea2-863c-220b8ca6bf99.json b/data/hfopenllm_v2/Replete-AI/Replete-Coder-Llama3-8B/4a36f73a-9495-4ea2-863c-220b8ca6bf99.json deleted file mode 100644 index c28e41a33..000000000 --- a/data/hfopenllm_v2/Replete-AI/Replete-Coder-Llama3-8B/4a36f73a-9495-4ea2-863c-220b8ca6bf99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Replete-Coder-Llama3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-Coder-Llama3-8B", - "id": "Replete-AI/Replete-Coder-Llama3-8B", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3953 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Replete-Coder-Qwen2-1.5b/faa9d3b9-343a-4a9e-82c5-6bc81bc87b9c.json b/data/hfopenllm_v2/Replete-AI/Replete-Coder-Qwen2-1.5b/faa9d3b9-343a-4a9e-82c5-6bc81bc87b9c.json deleted file mode 100644 index 5845dc173..000000000 --- a/data/hfopenllm_v2/Replete-AI/Replete-Coder-Qwen2-1.5b/faa9d3b9-343a-4a9e-82c5-6bc81bc87b9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Replete-Coder-Qwen2-1.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-Coder-Qwen2-1.5b", - "id": "Replete-AI/Replete-Coder-Qwen2-1.5b", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/a55bf380-d567-4228-b30c-57e9df31e844.json b/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/a55bf380-d567-4228-b30c-57e9df31e844.json deleted file mode 100644 index fcb0416cd..000000000 --- a/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/a55bf380-d567-4228-b30c-57e9df31e844.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Replete-LLM-Qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-LLM-Qwen2-7b", - "id": "Replete-AI/Replete-LLM-Qwen2-7b", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0932 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2977 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3941 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/dfd92311-4f3d-4355-8ccf-a59f29914b8f.json b/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/dfd92311-4f3d-4355-8ccf-a59f29914b8f.json deleted file mode 100644 index 2013766f3..000000000 --- a/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/dfd92311-4f3d-4355-8ccf-a59f29914b8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Replete-LLM-Qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-LLM-Qwen2-7b", - "id": "Replete-AI/Replete-LLM-Qwen2-7b", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2985 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3848 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b_Beta-Preview/d98e190e-5b5f-46eb-b701-e32d2dbef3a0.json b/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b_Beta-Preview/d98e190e-5b5f-46eb-b701-e32d2dbef3a0.json deleted file mode 100644 index 43cd838cf..000000000 --- a/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b_Beta-Preview/d98e190e-5b5f-46eb-b701-e32d2dbef3a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Replete-LLM-Qwen2-7b_Beta-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-LLM-Qwen2-7b_Beta-Preview", - "id": "Replete-AI/Replete-LLM-Qwen2-7b_Beta-Preview", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0858 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2929 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1285 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Replete-LLM-V2-Llama-3.1-8b/32edb764-2a42-4efe-ac86-9eda81942b84.json b/data/hfopenllm_v2/Replete-AI/Replete-LLM-V2-Llama-3.1-8b/32edb764-2a42-4efe-ac86-9eda81942b84.json deleted file mode 100644 index f78d9e7b1..000000000 --- a/data/hfopenllm_v2/Replete-AI/Replete-LLM-V2-Llama-3.1-8b/32edb764-2a42-4efe-ac86-9eda81942b84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Replete-LLM-V2-Llama-3.1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-LLM-V2-Llama-3.1-8b", - "id": "Replete-AI/Replete-LLM-V2-Llama-3.1-8b", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1405 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4001 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - } - ] -} \ No newline at end of file 
diff --git a/data/hfopenllm_v2/RezVortex/JAJUKA-WEWILLNEVERFORGETYOU-3B/36855ebd-2030-4d5d-9c42-ca049244e694.json b/data/hfopenllm_v2/RezVortex/JAJUKA-WEWILLNEVERFORGETYOU-3B/36855ebd-2030-4d5d-9c42-ca049244e694.json deleted file mode 100644 index bf5d8960d..000000000 --- a/data/hfopenllm_v2/RezVortex/JAJUKA-WEWILLNEVERFORGETYOU-3B/36855ebd-2030-4d5d-9c42-ca049244e694.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RezVortex_JAJUKA-WEWILLNEVERFORGETYOU-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "JAJUKA-WEWILLNEVERFORGETYOU-3B", - "id": "RezVortex/JAJUKA-WEWILLNEVERFORGETYOU-3B", - "developer": "RezVortex", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6858 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4619 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/RezVortex/Jajuka-3b/9651a0a1-4004-42f3-ad8f-2aebb38ec967.json b/data/hfopenllm_v2/RezVortex/Jajuka-3b/9651a0a1-4004-42f3-ad8f-2aebb38ec967.json deleted 
file mode 100644 index 797c96c6d..000000000 --- a/data/hfopenllm_v2/RezVortex/Jajuka-3b/9651a0a1-4004-42f3-ad8f-2aebb38ec967.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RezVortex_Jajuka-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jajuka-3b", - "id": "RezVortex/Jajuka-3b", - "developer": "RezVortex", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4594 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ro-xe/FMixIA-7B-DARE-0/a59e55dc-e2b5-43be-8469-49eee0e98d55.json b/data/hfopenllm_v2/Ro-xe/FMixIA-7B-DARE-0/a59e55dc-e2b5-43be-8469-49eee0e98d55.json deleted file mode 100644 index b2648eb78..000000000 --- a/data/hfopenllm_v2/Ro-xe/FMixIA-7B-DARE-0/a59e55dc-e2b5-43be-8469-49eee0e98d55.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ro-xe_FMixIA-7B-DARE-0/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FMixIA-7B-DARE-0", - "id": "Ro-xe/FMixIA-7B-DARE-0", - "developer": "Ro-xe", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5035 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3016 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ro-xe/FMixIA-7B-SLERP-27/a956e306-f184-4dbc-ac7a-3793ae735801.json b/data/hfopenllm_v2/Ro-xe/FMixIA-7B-SLERP-27/a956e306-f184-4dbc-ac7a-3793ae735801.json deleted file mode 100644 index f8798e128..000000000 --- a/data/hfopenllm_v2/Ro-xe/FMixIA-7B-SLERP-27/a956e306-f184-4dbc-ac7a-3793ae735801.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ro-xe_FMixIA-7B-SLERP-27/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FMixIA-7B-SLERP-27", - "id": 
"Ro-xe/FMixIA-7B-SLERP-27", - "developer": "Ro-xe", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3765 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5151 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3008 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ro-xe/FMixIA-7B-TIES-1/c05cc6ce-12fd-491d-b41b-57cc14b6d34a.json b/data/hfopenllm_v2/Ro-xe/FMixIA-7B-TIES-1/c05cc6ce-12fd-491d-b41b-57cc14b6d34a.json deleted file mode 100644 index 1a47f9d40..000000000 --- a/data/hfopenllm_v2/Ro-xe/FMixIA-7B-TIES-1/c05cc6ce-12fd-491d-b41b-57cc14b6d34a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ro-xe_FMixIA-7B-TIES-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FMixIA-7B-TIES-1", - "id": "Ro-xe/FMixIA-7B-TIES-1", - "developer": "Ro-xe", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3453 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4689 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2992 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ro-xe/FMixIA-FrankenMerge-9.5B-PT-9/415875b7-fe10-47e7-aca0-029c2f51c067.json b/data/hfopenllm_v2/Ro-xe/FMixIA-FrankenMerge-9.5B-PT-9/415875b7-fe10-47e7-aca0-029c2f51c067.json deleted file mode 100644 index 2f038af4f..000000000 --- a/data/hfopenllm_v2/Ro-xe/FMixIA-FrankenMerge-9.5B-PT-9/415875b7-fe10-47e7-aca0-029c2f51c067.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ro-xe_FMixIA-FrankenMerge-9.5B-PT-9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FMixIA-FrankenMerge-9.5B-PT-9", - "id": "Ro-xe/FMixIA-FrankenMerge-9.5B-PT-9", - "developer": "Ro-xe", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.141 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.194 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5088 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3657 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Rombo-Org/Rombo-LLM-V2.5-Qwen-7b/c505ee64-3d3b-48e2-9c8a-f59609a758e9.json b/data/hfopenllm_v2/Rombo-Org/Rombo-LLM-V2.5-Qwen-7b/c505ee64-3d3b-48e2-9c8a-f59609a758e9.json deleted file mode 100644 index 92335ffa4..000000000 --- a/data/hfopenllm_v2/Rombo-Org/Rombo-LLM-V2.5-Qwen-7b/c505ee64-3d3b-48e2-9c8a-f59609a758e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Rombo-Org_Rombo-LLM-V2.5-Qwen-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombo-LLM-V2.5-Qwen-7b", - "id": "Rombo-Org/Rombo-LLM-V2.5-Qwen-7b", - "developer": "Rombo-Org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4283 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/RubielLabarta/LogoS-7Bx2-MoE-13B-v0.2/00003185-c291-40c5-bba1-f87eae0afc08.json b/data/hfopenllm_v2/RubielLabarta/LogoS-7Bx2-MoE-13B-v0.2/00003185-c291-40c5-bba1-f87eae0afc08.json deleted file mode 100644 index 14bcc9da2..000000000 --- a/data/hfopenllm_v2/RubielLabarta/LogoS-7Bx2-MoE-13B-v0.2/00003185-c291-40c5-bba1-f87eae0afc08.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RubielLabarta_LogoS-7Bx2-MoE-13B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LogoS-7Bx2-MoE-13B-v0.2", - "id": "RubielLabarta/LogoS-7Bx2-MoE-13B-v0.2", - "developer": "RubielLabarta", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3088 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SaisExperiments/Evil-Alpaca-3B-L3.2/328f61d7-677b-4a06-b464-0da42153f9ae.json b/data/hfopenllm_v2/SaisExperiments/Evil-Alpaca-3B-L3.2/328f61d7-677b-4a06-b464-0da42153f9ae.json deleted file mode 100644 index 2ee369a00..000000000 --- a/data/hfopenllm_v2/SaisExperiments/Evil-Alpaca-3B-L3.2/328f61d7-677b-4a06-b464-0da42153f9ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SaisExperiments_Evil-Alpaca-3B-L3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Evil-Alpaca-3B-L3.2", - "id": "SaisExperiments/Evil-Alpaca-3B-L3.2", - "developer": "SaisExperiments", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3251 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4341 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2621 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Opus-Instruct/9cb5b8fd-062c-4161-9301-640980d21b9f.json b/data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Opus-Instruct/9cb5b8fd-062c-4161-9301-640980d21b9f.json deleted file mode 100644 index b9f60ffdb..000000000 --- a/data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Opus-Instruct/9cb5b8fd-062c-4161-9301-640980d21b9f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SaisExperiments_Gemma-2-2B-Opus-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-2B-Opus-Instruct", - "id": "SaisExperiments/Gemma-2-2B-Opus-Instruct", - "developer": "SaisExperiments", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4057 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.265 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Stheno-Filtered/09284b75-a2f9-40ea-8135-7aa61c626fa2.json b/data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Stheno-Filtered/09284b75-a2f9-40ea-8135-7aa61c626fa2.json deleted file mode 100644 index cef011e36..000000000 --- a/data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Stheno-Filtered/09284b75-a2f9-40ea-8135-7aa61c626fa2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SaisExperiments_Gemma-2-2B-Stheno-Filtered/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-2B-Stheno-Filtered", - "id": "SaisExperiments/Gemma-2-2B-Stheno-Filtered", - "developer": "SaisExperiments", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4149 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.263 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SaisExperiments/Not-So-Small-Alpaca-24B/e2502331-6ac3-43bc-8218-259b44333283.json b/data/hfopenllm_v2/SaisExperiments/Not-So-Small-Alpaca-24B/e2502331-6ac3-43bc-8218-259b44333283.json deleted file mode 100644 index f23479676..000000000 --- a/data/hfopenllm_v2/SaisExperiments/Not-So-Small-Alpaca-24B/e2502331-6ac3-43bc-8218-259b44333283.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SaisExperiments_Not-So-Small-Alpaca-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Not-So-Small-Alpaca-24B", - "id": "SaisExperiments/Not-So-Small-Alpaca-24B", - "developer": "SaisExperiments", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6244 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4282 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3694 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SaisExperiments/QwOwO-7B-V1/8dde454d-aa48-4ee1-b5c6-f3353087d492.json b/data/hfopenllm_v2/SaisExperiments/QwOwO-7B-V1/8dde454d-aa48-4ee1-b5c6-f3353087d492.json deleted file mode 100644 index 858e45e66..000000000 --- a/data/hfopenllm_v2/SaisExperiments/QwOwO-7B-V1/8dde454d-aa48-4ee1-b5c6-f3353087d492.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SaisExperiments_QwOwO-7B-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwOwO-7B-V1", - "id": "SaisExperiments/QwOwO-7B-V1", - "developer": "SaisExperiments", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4556 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3835 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SaisExperiments/RightSheep-Llama3.2-3B/662c8ed2-2407-4606-ac1e-ec7ade185d2d.json b/data/hfopenllm_v2/SaisExperiments/RightSheep-Llama3.2-3B/662c8ed2-2407-4606-ac1e-ec7ade185d2d.json deleted file mode 100644 index 8ded05543..000000000 --- a/data/hfopenllm_v2/SaisExperiments/RightSheep-Llama3.2-3B/662c8ed2-2407-4606-ac1e-ec7ade185d2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SaisExperiments_RightSheep-Llama3.2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RightSheep-Llama3.2-3B", - "id": "SaisExperiments/RightSheep-Llama3.2-3B", - "developer": "SaisExperiments", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4156 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.254 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Anemoi-3B/332aef8c-7c62-463e-ba3c-07ae0205d457.json b/data/hfopenllm_v2/Sakalti/Anemoi-3B/332aef8c-7c62-463e-ba3c-07ae0205d457.json deleted file mode 100644 index 311f81095..000000000 --- a/data/hfopenllm_v2/Sakalti/Anemoi-3B/332aef8c-7c62-463e-ba3c-07ae0205d457.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Anemoi-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anemoi-3B", - "id": "Sakalti/Anemoi-3B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4922 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3766 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Euphrates-14B/cfdfcf21-e445-430e-a295-946cb8c3fce9.json b/data/hfopenllm_v2/Sakalti/Euphrates-14B/cfdfcf21-e445-430e-a295-946cb8c3fce9.json deleted file mode 100644 index 228187fd8..000000000 --- a/data/hfopenllm_v2/Sakalti/Euphrates-14B/cfdfcf21-e445-430e-a295-946cb8c3fce9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Euphrates-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Euphrates-14B", - "id": "Sakalti/Euphrates-14B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2647 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6138 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5255 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Llama3.2-3B-Uranus-1/a5606b92-aa2d-44e3-a92c-47d0b38fef9c.json 
b/data/hfopenllm_v2/Sakalti/Llama3.2-3B-Uranus-1/a5606b92-aa2d-44e3-a92c-47d0b38fef9c.json deleted file mode 100644 index 077b4b960..000000000 --- a/data/hfopenllm_v2/Sakalti/Llama3.2-3B-Uranus-1/a5606b92-aa2d-44e3-a92c-47d0b38fef9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Llama3.2-3B-Uranus-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2-3B-Uranus-1", - "id": "Sakalti/Llama3.2-3B-Uranus-1", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5335 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1495 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Magro-7B-v1.1/465d473c-ef28-4725-8cac-02f2a031b22c.json b/data/hfopenllm_v2/Sakalti/Magro-7B-v1.1/465d473c-ef28-4725-8cac-02f2a031b22c.json deleted file mode 100644 index 748ddb98f..000000000 --- a/data/hfopenllm_v2/Sakalti/Magro-7B-v1.1/465d473c-ef28-4725-8cac-02f2a031b22c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Magro-7B-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magro-7B-v1.1", - "id": "Sakalti/Magro-7B-v1.1", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1204 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Neptuno-3B/2c636544-8676-4eee-8bcd-d623be0275be.json b/data/hfopenllm_v2/Sakalti/Neptuno-3B/2c636544-8676-4eee-8bcd-d623be0275be.json deleted file mode 100644 index 6e3a0d4d0..000000000 --- a/data/hfopenllm_v2/Sakalti/Neptuno-3B/2c636544-8676-4eee-8bcd-d623be0275be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Neptuno-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neptuno-3B", - "id": "Sakalti/Neptuno-3B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4296 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2553 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4002 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3773 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Neptuno-Alpha/8b332fac-1cfa-498b-853a-52ec5492ddc7.json b/data/hfopenllm_v2/Sakalti/Neptuno-Alpha/8b332fac-1cfa-498b-853a-52ec5492ddc7.json deleted file mode 100644 index c98619fdf..000000000 --- a/data/hfopenllm_v2/Sakalti/Neptuno-Alpha/8b332fac-1cfa-498b-853a-52ec5492ddc7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Neptuno-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neptuno-Alpha", - "id": "Sakalti/Neptuno-Alpha", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - 
} - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4925 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1835 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Oxyge1-33B/2bf1b38b-e90b-4fa8-b19e-47d93ff9ab4e.json b/data/hfopenllm_v2/Sakalti/Oxyge1-33B/2bf1b38b-e90b-4fa8-b19e-47d93ff9ab4e.json deleted file mode 100644 index eb5bc1d97..000000000 --- a/data/hfopenllm_v2/Sakalti/Oxyge1-33B/2bf1b38b-e90b-4fa8-b19e-47d93ff9ab4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Oxyge1-33B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Oxyge1-33B", - "id": "Sakalti/Oxyge1-33B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4548 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7033 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5909 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Phi3.5-Comets-3.8B/69bb0243-75b2-4858-ba6b-5e70cfb516a7.json b/data/hfopenllm_v2/Sakalti/Phi3.5-Comets-3.8B/69bb0243-75b2-4858-ba6b-5e70cfb516a7.json deleted file mode 100644 index 0dfa27332..000000000 --- a/data/hfopenllm_v2/Sakalti/Phi3.5-Comets-3.8B/69bb0243-75b2-4858-ba6b-5e70cfb516a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Phi3.5-Comets-3.8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi3.5-Comets-3.8B", - "id": "Sakalti/Phi3.5-Comets-3.8B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2094 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3335 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3764 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1153 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Qwen2.5-1B-Instruct/4bb7e325-8741-4c09-81f6-9efdb30ef5a5.json b/data/hfopenllm_v2/Sakalti/Qwen2.5-1B-Instruct/4bb7e325-8741-4c09-81f6-9efdb30ef5a5.json deleted file mode 100644 index 6c0cb747d..000000000 --- a/data/hfopenllm_v2/Sakalti/Qwen2.5-1B-Instruct/4bb7e325-8741-4c09-81f6-9efdb30ef5a5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Qwen2.5-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1B-Instruct", - "id": "Sakalti/Qwen2.5-1B-Instruct", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.988 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1751 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3027 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1213 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/QwenTest-7/87878b74-22ce-4554-914c-03e486d13de3.json b/data/hfopenllm_v2/Sakalti/QwenTest-7/87878b74-22ce-4554-914c-03e486d13de3.json deleted file mode 100644 index 5e4bcbff5..000000000 --- a/data/hfopenllm_v2/Sakalti/QwenTest-7/87878b74-22ce-4554-914c-03e486d13de3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_QwenTest-7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenTest-7", - "id": "Sakalti/QwenTest-7", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.988 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1672 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1212 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-0.5B/5030f8d4-f216-4f78-84f1-dd03b0324bb0.json b/data/hfopenllm_v2/Sakalti/SJT-0.5B/5030f8d4-f216-4f78-84f1-dd03b0324bb0.json deleted file mode 100644 index 8cfc9ba99..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-0.5B/5030f8d4-f216-4f78-84f1-dd03b0324bb0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-0.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-0.5B", - "id": "Sakalti/SJT-0.5B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2425 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1891 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha-1.1/c5e244fd-e85e-4fbb-9703-b8e733fb91bf.json b/data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha-1.1/c5e244fd-e85e-4fbb-9703-b8e733fb91bf.json deleted file mode 100644 index b6b2e229c..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha-1.1/c5e244fd-e85e-4fbb-9703-b8e733fb91bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-1.5B-Alpha-1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-1.5B-Alpha-1.1", - "id": "Sakalti/SJT-1.5B-Alpha-1.1", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3439 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2966 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha/38261a01-62df-42b2-9b1d-f924598e70ef.json b/data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha/38261a01-62df-42b2-9b1d-f924598e70ef.json deleted file mode 100644 index b438fe370..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha/38261a01-62df-42b2-9b1d-f924598e70ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-1.5B-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-1.5B-Alpha", - "id": "Sakalti/SJT-1.5B-Alpha", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3449 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-1.7B/5736f0b5-3903-4774-a84a-c3db260d36e4.json b/data/hfopenllm_v2/Sakalti/SJT-1.7B/5736f0b5-3903-4774-a84a-c3db260d36e4.json deleted file mode 100644 index b6d4f8db5..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-1.7B/5736f0b5-3903-4774-a84a-c3db260d36e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-1.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-1.7B", - "id": "Sakalti/SJT-1.7B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.684 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1776 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2934 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - } - ] -} \ No newline at end of file 
diff --git a/data/hfopenllm_v2/Sakalti/SJT-14B/70134d58-972e-49c9-8cde-4ba2691d3dc3.json b/data/hfopenllm_v2/Sakalti/SJT-14B/70134d58-972e-49c9-8cde-4ba2691d3dc3.json deleted file mode 100644 index ff7d304d7..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-14B/70134d58-972e-49c9-8cde-4ba2691d3dc3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-14B", - "id": "Sakalti/SJT-14B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5494 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-2.4B/d4bb1440-2064-4752-bcb3-c9cec234fd1b.json b/data/hfopenllm_v2/Sakalti/SJT-2.4B/d4bb1440-2064-4752-bcb3-c9cec234fd1b.json deleted file mode 100644 index b6553ae79..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-2.4B/d4bb1440-2064-4752-bcb3-c9cec234fd1b.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-2.4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-2.4B", - "id": "Sakalti/SJT-2.4B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.432 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2804 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-24B-Alpha/d9e6059e-d20b-4465-b7ba-2ee3a72562b6.json b/data/hfopenllm_v2/Sakalti/SJT-24B-Alpha/d9e6059e-d20b-4465-b7ba-2ee3a72562b6.json deleted file mode 100644 index d24b4279e..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-24B-Alpha/d9e6059e-d20b-4465-b7ba-2ee3a72562b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-24B-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-24B-Alpha", - "id": "Sakalti/SJT-24B-Alpha", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 24.125 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3206 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6081 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4857 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-2B-V1.1/f8b02d65-c8a0-43eb-b48e-d1e1f7f363d6.json b/data/hfopenllm_v2/Sakalti/SJT-2B-V1.1/f8b02d65-c8a0-43eb-b48e-d1e1f7f363d6.json deleted file mode 100644 index 1042b7d30..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-2B-V1.1/f8b02d65-c8a0-43eb-b48e-d1e1f7f363d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-2B-V1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-2B-V1.1", - "id": "Sakalti/SJT-2B-V1.1", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - 
}, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4299 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-2B/7bf23db0-877c-4700-95c8-e35dee5e57b4.json b/data/hfopenllm_v2/Sakalti/SJT-2B/7bf23db0-877c-4700-95c8-e35dee5e57b4.json deleted file mode 100644 index 344d5d801..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-2B/7bf23db0-877c-4700-95c8-e35dee5e57b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-2B", - "id": "Sakalti/SJT-2B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2151 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3564 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1187 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-3.7B/07f8351e-c7c6-463f-9e91-ee1d3bb2b35c.json b/data/hfopenllm_v2/Sakalti/SJT-3.7B/07f8351e-c7c6-463f-9e91-ee1d3bb2b35c.json deleted file mode 100644 index c851ca6fd..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-3.7B/07f8351e-c7c6-463f-9e91-ee1d3bb2b35c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-3.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-3.7B", - "id": "Sakalti/SJT-3.7B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.783 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1078 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3617 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-4B/8535ffae-f39d-46ed-89bb-a1656885db91.json b/data/hfopenllm_v2/Sakalti/SJT-4B/8535ffae-f39d-46ed-89bb-a1656885db91.json deleted file mode 100644 index 841001a9a..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-4B/8535ffae-f39d-46ed-89bb-a1656885db91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-4B", - "id": "Sakalti/SJT-4B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-7.5B/5e832121-9a67-44d9-973d-fffdb1b37975.json b/data/hfopenllm_v2/Sakalti/SJT-7.5B/5e832121-9a67-44d9-973d-fffdb1b37975.json deleted file mode 100644 index 157236e81..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-7.5B/5e832121-9a67-44d9-973d-fffdb1b37975.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-7.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-7.5B", - "id": "Sakalti/SJT-7.5B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4223 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2168 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4399 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-7B-V1.1-Multilingal/92d3f67d-a026-49e3-a440-68c10fb358ae.json b/data/hfopenllm_v2/Sakalti/SJT-7B-V1.1-Multilingal/92d3f67d-a026-49e3-a440-68c10fb358ae.json deleted file mode 100644 index 8dfb066a7..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-7B-V1.1-Multilingal/92d3f67d-a026-49e3-a440-68c10fb358ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-7B-V1.1-Multilingal/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-7B-V1.1-Multilingal", - "id": "Sakalti/SJT-7B-V1.1-Multilingal", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-7B-V1.1/9d0baaef-bd31-4a96-bb2a-e92b62b748d2.json b/data/hfopenllm_v2/Sakalti/SJT-7B-V1.1/9d0baaef-bd31-4a96-bb2a-e92b62b748d2.json deleted file mode 100644 index 7f654b278..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-7B-V1.1/9d0baaef-bd31-4a96-bb2a-e92b62b748d2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-7B-V1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-7B-V1.1", - "id": "Sakalti/SJT-7B-V1.1", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4703 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2432 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.4411 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-8B-V1.1/489e8e84-5e30-46fa-a421-f52308f051e7.json b/data/hfopenllm_v2/Sakalti/SJT-8B-V1.1/489e8e84-5e30-46fa-a421-f52308f051e7.json deleted file mode 100644 index 011bbc7a2..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-8B-V1.1/489e8e84-5e30-46fa-a421-f52308f051e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-8B-V1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-8B-V1.1", - "id": "Sakalti/SJT-8B-V1.1", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 8.545 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5121 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2069 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-8B/a208f807-c930-4e81-8ebd-dcbb4db76442.json b/data/hfopenllm_v2/Sakalti/SJT-8B/a208f807-c930-4e81-8ebd-dcbb4db76442.json deleted file mode 100644 index effa93eb9..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-8B/a208f807-c930-4e81-8ebd-dcbb4db76442.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-8B", - "id": "Sakalti/SJT-8B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 8.548 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6535 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2538 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-900M/4956539d-a255-4c56-877f-257e463fa3e4.json b/data/hfopenllm_v2/Sakalti/SJT-900M/4956539d-a255-4c56-877f-257e463fa3e4.json deleted file 
mode 100644 index 73e5c563b..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-900M/4956539d-a255-4c56-877f-257e463fa3e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-900M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-900M", - "id": "Sakalti/SJT-900M", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.899 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.241 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3169 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-Moe2x7.5B/3451eb65-020c-4e34-9128-7410e6b293cd.json b/data/hfopenllm_v2/Sakalti/SJT-Moe2x7.5B/3451eb65-020c-4e34-9128-7410e6b293cd.json deleted file mode 100644 index af8b9e5e0..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-Moe2x7.5B/3451eb65-020c-4e34-9128-7410e6b293cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-Moe2x7.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-Moe2x7.5B", - "id": "Sakalti/SJT-Moe2x7.5B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 13.401 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4117 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4399 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3954 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJTPass-2/b5cd0061-e4dd-4049-a51e-b16490e69120.json b/data/hfopenllm_v2/Sakalti/SJTPass-2/b5cd0061-e4dd-4049-a51e-b16490e69120.json deleted file mode 100644 index b2c0d1e00..000000000 --- a/data/hfopenllm_v2/Sakalti/SJTPass-2/b5cd0061-e4dd-4049-a51e-b16490e69120.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJTPass-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJTPass-2", - "id": "Sakalti/SJTPass-2", - "developer": "Sakalti", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.24 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1902 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJTPass-4/c4686af6-0b7b-4df3-9152-14a3ef087b7f.json b/data/hfopenllm_v2/Sakalti/SJTPass-4/c4686af6-0b7b-4df3-9152-14a3ef087b7f.json deleted file mode 100644 index fbfa47dbe..000000000 --- a/data/hfopenllm_v2/Sakalti/SJTPass-4/c4686af6-0b7b-4df3-9152-14a3ef087b7f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJTPass-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJTPass-4", - "id": "Sakalti/SJTPass-4", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.167 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1913 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2964 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3898 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1083 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJTPass-5/155885ca-11e7-4cd2-b26c-53e001e2a6f9.json b/data/hfopenllm_v2/Sakalti/SJTPass-5/155885ca-11e7-4cd2-b26c-53e001e2a6f9.json deleted file mode 100644 index 8136444ac..000000000 --- a/data/hfopenllm_v2/Sakalti/SJTPass-5/155885ca-11e7-4cd2-b26c-53e001e2a6f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJTPass-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJTPass-5", - "id": "Sakalti/SJTPass-5", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.809 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2425 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3103 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1327 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saba-Passthrough-2/d9ca5411-def6-43b3-a522-595131d8e5e6.json b/data/hfopenllm_v2/Sakalti/Saba-Passthrough-2/d9ca5411-def6-43b3-a522-595131d8e5e6.json deleted file mode 100644 index 4e51d28f5..000000000 --- a/data/hfopenllm_v2/Sakalti/Saba-Passthrough-2/d9ca5411-def6-43b3-a522-595131d8e5e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saba-Passthrough-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saba-Passthrough-2", - "id": "Sakalti/Saba-Passthrough-2", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.087 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1691 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - 
}, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2077 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saba1-1.8B/e54553ab-0897-4cb5-9213-5bb72758d2b5.json b/data/hfopenllm_v2/Sakalti/Saba1-1.8B/e54553ab-0897-4cb5-9213-5bb72758d2b5.json deleted file mode 100644 index db3cc96a9..000000000 --- a/data/hfopenllm_v2/Sakalti/Saba1-1.8B/e54553ab-0897-4cb5-9213-5bb72758d2b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saba1-1.8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saba1-1.8B", - "id": "Sakalti/Saba1-1.8B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3333 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4147 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2926 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saba1-7B/eed48cdc-18db-4c03-84bf-d2d50e3328b0.json b/data/hfopenllm_v2/Sakalti/Saba1-7B/eed48cdc-18db-4c03-84bf-d2d50e3328b0.json deleted file mode 100644 index 9e78fd60a..000000000 --- a/data/hfopenllm_v2/Sakalti/Saba1-7B/eed48cdc-18db-4c03-84bf-d2d50e3328b0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saba1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saba1-7B", - "id": "Sakalti/Saba1-7B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saba1.5-1.5B/d7952aef-37e2-4c15-a1a4-598690773bbb.json b/data/hfopenllm_v2/Sakalti/Saba1.5-1.5B/d7952aef-37e2-4c15-a1a4-598690773bbb.json deleted file mode 100644 index d446d42e8..000000000 --- a/data/hfopenllm_v2/Sakalti/Saba1.5-1.5B/d7952aef-37e2-4c15-a1a4-598690773bbb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saba1.5-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saba1.5-1.5B", - "id": "Sakalti/Saba1.5-1.5B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3333 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4147 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2926 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saba1.5-Pro-3B/5e1e1376-bb22-4fc9-a1d6-3f2fe7d302b9.json b/data/hfopenllm_v2/Sakalti/Saba1.5-Pro-3B/5e1e1376-bb22-4fc9-a1d6-3f2fe7d302b9.json deleted file mode 100644 index a268ff87b..000000000 --- a/data/hfopenllm_v2/Sakalti/Saba1.5-Pro-3B/5e1e1376-bb22-4fc9-a1d6-3f2fe7d302b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saba1.5-Pro-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saba1.5-Pro-3B", - "id": "Sakalti/Saba1.5-Pro-3B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.9 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2386 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3623 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4405 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1958 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saba2-14B-Preview/cfdae559-f3f1-4a78-b4cc-fbfb8bb37b16.json b/data/hfopenllm_v2/Sakalti/Saba2-14B-Preview/cfdae559-f3f1-4a78-b4cc-fbfb8bb37b16.json deleted file mode 100644 index 2d5a6e567..000000000 --- a/data/hfopenllm_v2/Sakalti/Saba2-14B-Preview/cfdae559-f3f1-4a78-b4cc-fbfb8bb37b16.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saba2-14B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saba2-14B-Preview", - "id": "Sakalti/Saba2-14B-Preview", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4722 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6496 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saba2-3B/a12208ce-e9e1-4476-8054-0d565efad92c.json b/data/hfopenllm_v2/Sakalti/Saba2-3B/a12208ce-e9e1-4476-8054-0d565efad92c.json deleted file mode 100644 index 1efd679c7..000000000 --- a/data/hfopenllm_v2/Sakalti/Saba2-3B/a12208ce-e9e1-4476-8054-0d565efad92c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saba2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saba2-3B", - "id": "Sakalti/Saba2-3B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2865 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2801 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Sailor-japanese/f46e1eeb-8b8b-4d47-9510-445109b5518b.json b/data/hfopenllm_v2/Sakalti/Sailor-japanese/f46e1eeb-8b8b-4d47-9510-445109b5518b.json deleted file mode 100644 index 353e25d9c..000000000 --- 
a/data/hfopenllm_v2/Sakalti/Sailor-japanese/f46e1eeb-8b8b-4d47-9510-445109b5518b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Sailor-japanese/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sailor-japanese", - "id": "Sakalti/Sailor-japanese", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1605 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2913 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3912 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saka-1.5B/7dc4970f-ce35-4ffa-9052-2ab40abb1e55.json b/data/hfopenllm_v2/Sakalti/Saka-1.5B/7dc4970f-ce35-4ffa-9052-2ab40abb1e55.json deleted file mode 100644 index bae9cee17..000000000 --- a/data/hfopenllm_v2/Sakalti/Saka-1.5B/7dc4970f-ce35-4ffa-9052-2ab40abb1e55.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saka-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saka-1.5B", - "id": "Sakalti/Saka-1.5B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2726 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saka-14B/823e886a-1431-4078-81a3-4b941983461d.json b/data/hfopenllm_v2/Sakalti/Saka-14B/823e886a-1431-4078-81a3-4b941983461d.json deleted file mode 100644 index a2883bea2..000000000 --- a/data/hfopenllm_v2/Sakalti/Saka-14B/823e886a-1431-4078-81a3-4b941983461d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saka-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saka-14B", - "id": "Sakalti/Saka-14B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6497 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4094 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saka-24B/583609f0-de5b-43cd-a667-bb2c36679fd2.json b/data/hfopenllm_v2/Sakalti/Saka-24B/583609f0-de5b-43cd-a667-bb2c36679fd2.json deleted file mode 100644 index 873a3e63b..000000000 --- a/data/hfopenllm_v2/Sakalti/Saka-24B/583609f0-de5b-43cd-a667-bb2c36679fd2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saka-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saka-24B", - "id": "Sakalti/Saka-24B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6072 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saka-7.2B/2d2cea8b-167e-4d63-b01c-537f372672f9.json b/data/hfopenllm_v2/Sakalti/Saka-7.2B/2d2cea8b-167e-4d63-b01c-537f372672f9.json deleted file mode 100644 index fc53daeea..000000000 --- a/data/hfopenllm_v2/Sakalti/Saka-7.2B/2d2cea8b-167e-4d63-b01c-537f372672f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saka-7.2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saka-7.2B", - "id": "Sakalti/Saka-7.2B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.292 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1545 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saka-7.6B/f584f596-3a17-404a-81a2-3033ad38cad6.json b/data/hfopenllm_v2/Sakalti/Saka-7.6B/f584f596-3a17-404a-81a2-3033ad38cad6.json deleted file mode 100644 index 045f0e9ed..000000000 --- a/data/hfopenllm_v2/Sakalti/Saka-7.6B/f584f596-3a17-404a-81a2-3033ad38cad6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saka-7.6B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saka-7.6B", - "id": "Sakalti/Saka-7.6B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.454 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SakaMoe-3x1.6B-Instruct/ebb0930f-92be-4e1b-a2a6-779f69d2151c.json b/data/hfopenllm_v2/Sakalti/SakaMoe-3x1.6B-Instruct/ebb0930f-92be-4e1b-a2a6-779f69d2151c.json deleted file mode 100644 index e8b9b5045..000000000 --- a/data/hfopenllm_v2/Sakalti/SakaMoe-3x1.6B-Instruct/ebb0930f-92be-4e1b-a2a6-779f69d2151c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SakaMoe-3x1.6B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SakaMoe-3x1.6B-Instruct", - "id": "Sakalti/SakaMoe-3x1.6B-Instruct", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 1.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1882 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SakalFusion-7B-Alpha/b8926567-e208-442e-8ba8-c6dd4ecc5c4a.json b/data/hfopenllm_v2/Sakalti/SakalFusion-7B-Alpha/b8926567-e208-442e-8ba8-c6dd4ecc5c4a.json deleted file mode 100644 index d9f49d39c..000000000 --- a/data/hfopenllm_v2/Sakalti/SakalFusion-7B-Alpha/b8926567-e208-442e-8ba8-c6dd4ecc5c4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SakalFusion-7B-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SakalFusion-7B-Alpha", - "id": "Sakalti/SakalFusion-7B-Alpha", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5591 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4474 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SakalFusion-7B-Beta/4bf6efe1-81fc-48f6-96ba-8df9ffbef2f2.json b/data/hfopenllm_v2/Sakalti/SakalFusion-7B-Beta/4bf6efe1-81fc-48f6-96ba-8df9ffbef2f2.json deleted file mode 100644 index 55e8681c6..000000000 --- a/data/hfopenllm_v2/Sakalti/SakalFusion-7B-Beta/4bf6efe1-81fc-48f6-96ba-8df9ffbef2f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SakalFusion-7B-Beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SakalFusion-7B-Beta", - "id": "Sakalti/SakalFusion-7B-Beta", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1809 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2881 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Tara-3.8B-v1.1/05ffcb7a-2694-4276-bf45-73e1110bc494.json b/data/hfopenllm_v2/Sakalti/Tara-3.8B-v1.1/05ffcb7a-2694-4276-bf45-73e1110bc494.json deleted file mode 100644 index 1aeae7ac8..000000000 --- a/data/hfopenllm_v2/Sakalti/Tara-3.8B-v1.1/05ffcb7a-2694-4276-bf45-73e1110bc494.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Tara-3.8B-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tara-3.8B-v1.1", - "id": "Sakalti/Tara-3.8B-v1.1", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.478 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/light-1.1-3B/dc3b944b-a57a-44ab-87ac-8e1882b7bcce.json b/data/hfopenllm_v2/Sakalti/light-1.1-3B/dc3b944b-a57a-44ab-87ac-8e1882b7bcce.json deleted file mode 100644 index 731f06557..000000000 --- a/data/hfopenllm_v2/Sakalti/light-1.1-3B/dc3b944b-a57a-44ab-87ac-8e1882b7bcce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_light-1.1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "light-1.1-3B", - "id": "Sakalti/light-1.1-3B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2803 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1209 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/light-3B/154f70b4-d77c-4d1b-b85c-bc81fe8162bd.json b/data/hfopenllm_v2/Sakalti/light-3B/154f70b4-d77c-4d1b-b85c-bc81fe8162bd.json deleted file mode 100644 index 08a907de4..000000000 --- a/data/hfopenllm_v2/Sakalti/light-3B/154f70b4-d77c-4d1b-b85c-bc81fe8162bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_light-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "light-3B", - "id": "Sakalti/light-3B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5337 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4831 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2591 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/light-3b-beta/998316d2-389a-4ce0-b0b0-0430c1361de7.json b/data/hfopenllm_v2/Sakalti/light-3b-beta/998316d2-389a-4ce0-b0b0-0430c1361de7.json deleted file mode 
100644 index e8992c2b0..000000000 --- a/data/hfopenllm_v2/Sakalti/light-3b-beta/998316d2-389a-4ce0-b0b0-0430c1361de7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_light-3b-beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "light-3b-beta", - "id": "Sakalti/light-3b-beta", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5485 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4815 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2772 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/light-7b-beta/ce803cde-6e23-433c-a4d2-38c5cb5ba14b.json b/data/hfopenllm_v2/Sakalti/light-7b-beta/ce803cde-6e23-433c-a4d2-38c5cb5ba14b.json deleted file mode 100644 index 0e0fb6d0e..000000000 --- a/data/hfopenllm_v2/Sakalti/light-7b-beta/ce803cde-6e23-433c-a4d2-38c5cb5ba14b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_light-7b-beta/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "light-7b-beta", - "id": "Sakalti/light-7b-beta", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6234 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4291 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/llama-3-yanyuedao-8b-instruct/2519485b-47cd-497c-a349-9e69db0266f3.json b/data/hfopenllm_v2/Sakalti/llama-3-yanyuedao-8b-instruct/2519485b-47cd-497c-a349-9e69db0266f3.json deleted file mode 100644 index 6cdfb156a..000000000 --- a/data/hfopenllm_v2/Sakalti/llama-3-yanyuedao-8b-instruct/2519485b-47cd-497c-a349-9e69db0266f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_llama-3-yanyuedao-8b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "llama-3-yanyuedao-8b-instruct", - "id": "Sakalti/llama-3-yanyuedao-8b-instruct", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2186 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/magro-7B/56d86e26-4ee6-4652-9b7b-a538238a24d4.json b/data/hfopenllm_v2/Sakalti/magro-7B/56d86e26-4ee6-4652-9b7b-a538238a24d4.json deleted file mode 100644 index 412d9d54e..000000000 --- a/data/hfopenllm_v2/Sakalti/magro-7B/56d86e26-4ee6-4652-9b7b-a538238a24d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_magro-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magro-7B", - "id": "Sakalti/magro-7B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2765 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/mergekit-01/416b89e4-5e8a-4131-9403-e8967a4127b8.json b/data/hfopenllm_v2/Sakalti/mergekit-01/416b89e4-5e8a-4131-9403-e8967a4127b8.json deleted file mode 100644 index 6209d9fb6..000000000 --- a/data/hfopenllm_v2/Sakalti/mergekit-01/416b89e4-5e8a-4131-9403-e8967a4127b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_mergekit-01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-01", - "id": "Sakalti/mergekit-01", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.6234 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4291 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/mergekit-della_linear-vmeykci/347a90e8-d8b7-4266-8242-ceac865796a0.json b/data/hfopenllm_v2/Sakalti/mergekit-della_linear-vmeykci/347a90e8-d8b7-4266-8242-ceac865796a0.json deleted file mode 100644 index 17a180e43..000000000 --- a/data/hfopenllm_v2/Sakalti/mergekit-della_linear-vmeykci/347a90e8-d8b7-4266-8242-ceac865796a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_mergekit-della_linear-vmeykci/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-della_linear-vmeykci", - "id": "Sakalti/mergekit-della_linear-vmeykci", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2816 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3897 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1089 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/model-3/389f7ab8-b30e-4d0c-b9a4-625e74a1f73f.json b/data/hfopenllm_v2/Sakalti/model-3/389f7ab8-b30e-4d0c-b9a4-625e74a1f73f.json deleted file mode 100644 index 768fa5f46..000000000 --- a/data/hfopenllm_v2/Sakalti/model-3/389f7ab8-b30e-4d0c-b9a4-625e74a1f73f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_model-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "model-3", - "id": "Sakalti/model-3", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6264 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5542 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4455 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/qwen2.5-2.3B/6ae33b7f-53a1-45c5-8b0b-d462188c3f9d.json b/data/hfopenllm_v2/Sakalti/qwen2.5-2.3B/6ae33b7f-53a1-45c5-8b0b-d462188c3f9d.json deleted file mode 100644 index a8265114f..000000000 --- a/data/hfopenllm_v2/Sakalti/qwen2.5-2.3B/6ae33b7f-53a1-45c5-8b0b-d462188c3f9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_qwen2.5-2.3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-2.3B", - "id": "Sakalti/qwen2.5-2.3B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2Model", - "params_billions": 2.339 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1288 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2849 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/tara-3.8B/d96fb0b2-7cba-4cc4-a5f4-b8a451754857.json b/data/hfopenllm_v2/Sakalti/tara-3.8B/d96fb0b2-7cba-4cc4-a5f4-b8a451754857.json deleted file mode 100644 index 87315ad45..000000000 --- a/data/hfopenllm_v2/Sakalti/tara-3.8B/d96fb0b2-7cba-4cc4-a5f4-b8a451754857.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_tara-3.8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tara-3.8B", - "id": "Sakalti/tara-3.8B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.2/f8d362f6-eafc-4d11-bc40-d169d69d3a95.json b/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.2/f8d362f6-eafc-4d11-bc40-d169d69d3a95.json deleted file mode 100644 index 8ee1c7c15..000000000 --- a/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.2/f8d362f6-eafc-4d11-bc40-d169d69d3a95.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_ultiima-14B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ultiima-14B-v0.2", - "id": "Sakalti/ultiima-14B-v0.2", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.707 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6472 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.3/4bacd3dd-44c2-42d8-98c0-3eeb920dc0f0.json b/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.3/4bacd3dd-44c2-42d8-98c0-3eeb920dc0f0.json deleted file mode 100644 index 86e48a32c..000000000 --- a/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.3/4bacd3dd-44c2-42d8-98c0-3eeb920dc0f0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_ultiima-14B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ultiima-14B-v0.3", - "id": "Sakalti/ultiima-14B-v0.3", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5337 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.4/de073f45-0d14-4f8a-9d3b-d4fd961186b8.json b/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.4/de073f45-0d14-4f8a-9d3b-d4fd961186b8.json deleted file mode 100644 index 172ff4245..000000000 --- a/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.4/de073f45-0d14-4f8a-9d3b-d4fd961186b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_ultiima-14B-v0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ultiima-14B-v0.4", - "id": "Sakalti/ultiima-14B-v0.4", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3008 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - } - ] -} \ No newline at end of 
file diff --git a/data/hfopenllm_v2/Sakalti/ultiima-14B/fd88d234-b3f9-4f48-896c-af58f1a69880.json b/data/hfopenllm_v2/Sakalti/ultiima-14B/fd88d234-b3f9-4f48-896c-af58f1a69880.json deleted file mode 100644 index 640b74564..000000000 --- a/data/hfopenllm_v2/Sakalti/ultiima-14B/fd88d234-b3f9-4f48-896c-af58f1a69880.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_ultiima-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ultiima-14B", - "id": "Sakalti/ultiima-14B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6491 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4698 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4718 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/ultiima-32B/273745b1-3761-463e-b9ab-7860968064eb.json b/data/hfopenllm_v2/Sakalti/ultiima-32B/273745b1-3761-463e-b9ab-7860968064eb.json deleted file mode 100644 index d7a87927c..000000000 --- 
a/data/hfopenllm_v2/Sakalti/ultiima-32B/273745b1-3761-463e-b9ab-7860968064eb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_ultiima-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ultiima-32B", - "id": "Sakalti/ultiima-32B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6854 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7037 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4995 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/ultiima-72B-v1.5/101d84d3-e741-4eb2-bd8a-db6c12022fe2.json b/data/hfopenllm_v2/Sakalti/ultiima-72B-v1.5/101d84d3-e741-4eb2-bd8a-db6c12022fe2.json deleted file mode 100644 index aec4261bf..000000000 --- a/data/hfopenllm_v2/Sakalti/ultiima-72B-v1.5/101d84d3-e741-4eb2-bd8a-db6c12022fe2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_ultiima-72B-v1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { 
- "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ultiima-72B-v1.5", - "id": "Sakalti/ultiima-72B-v1.5", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4691 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6054 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/ultiima-72B/9c82deca-1998-4506-b038-c5dd592324d8.json b/data/hfopenllm_v2/Sakalti/ultiima-72B/9c82deca-1998-4506-b038-c5dd592324d8.json deleted file mode 100644 index 8be07e143..000000000 --- a/data/hfopenllm_v2/Sakalti/ultiima-72B/9c82deca-1998-4506-b038-c5dd592324d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_ultiima-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ultiima-72B", - "id": "Sakalti/ultiima-72B", - "developer": "Sakalti", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7218 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4144 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4652 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5906 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Salesforce/LLaMA-3-8B-SFR-Iterative-DPO-R/da620a94-4c0d-4c50-9619-10e12001fb5d.json b/data/hfopenllm_v2/Salesforce/LLaMA-3-8B-SFR-Iterative-DPO-R/da620a94-4c0d-4c50-9619-10e12001fb5d.json deleted file mode 100644 index d2ff7565c..000000000 --- a/data/hfopenllm_v2/Salesforce/LLaMA-3-8B-SFR-Iterative-DPO-R/da620a94-4c0d-4c50-9619-10e12001fb5d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Salesforce_LLaMA-3-8B-SFR-Iterative-DPO-R/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA-3-8B-SFR-Iterative-DPO-R", - "id": "Salesforce/LLaMA-3-8B-SFR-Iterative-DPO-R", - "developer": "Salesforce", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5012 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SanjiWatsuki/Kunoichi-DPO-v2-7B/51dade8f-34e7-4237-8691-22655249bf76.json b/data/hfopenllm_v2/SanjiWatsuki/Kunoichi-DPO-v2-7B/51dade8f-34e7-4237-8691-22655249bf76.json deleted file mode 100644 index ea148741d..000000000 --- a/data/hfopenllm_v2/SanjiWatsuki/Kunoichi-DPO-v2-7B/51dade8f-34e7-4237-8691-22655249bf76.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SanjiWatsuki_Kunoichi-DPO-v2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kunoichi-DPO-v2-7B", - "id": "SanjiWatsuki/Kunoichi-DPO-v2-7B", - "developer": "SanjiWatsuki", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3107 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SanjiWatsuki/Silicon-Maid-7B/cdd59385-0a54-4ca1-b24d-9316a70f2875.json b/data/hfopenllm_v2/SanjiWatsuki/Silicon-Maid-7B/cdd59385-0a54-4ca1-b24d-9316a70f2875.json deleted file mode 100644 index f3201e23e..000000000 --- a/data/hfopenllm_v2/SanjiWatsuki/Silicon-Maid-7B/cdd59385-0a54-4ca1-b24d-9316a70f2875.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SanjiWatsuki_Silicon-Maid-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Silicon-Maid-7B", - "id": "SanjiWatsuki/Silicon-Maid-7B", - "developer": "SanjiWatsuki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5368 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4128 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3083 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/70B-L3.3-Cirrus-x1/514a3103-e8a1-49e8-b9da-a85963f5b3dd.json b/data/hfopenllm_v2/Sao10K/70B-L3.3-Cirrus-x1/514a3103-e8a1-49e8-b9da-a85963f5b3dd.json deleted file mode 100644 index c45307b07..000000000 --- a/data/hfopenllm_v2/Sao10K/70B-L3.3-Cirrus-x1/514a3103-e8a1-49e8-b9da-a85963f5b3dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_70B-L3.3-Cirrus-x1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "70B-L3.3-Cirrus-x1", - "id": "Sao10K/70B-L3.3-Cirrus-x1", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6681 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7029 - } - }, - { - "evaluation_name": "MATH 
Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4497 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4842 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/Fimbulvetr-11B-v2/daafaafa-1e00-4433-95f3-91c169598ebd.json b/data/hfopenllm_v2/Sao10K/Fimbulvetr-11B-v2/daafaafa-1e00-4433-95f3-91c169598ebd.json deleted file mode 100644 index 3f9b9daef..000000000 --- a/data/hfopenllm_v2/Sao10K/Fimbulvetr-11B-v2/daafaafa-1e00-4433-95f3-91c169598ebd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_Fimbulvetr-11B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fimbulvetr-11B-v2", - "id": "Sao10K/Fimbulvetr-11B-v2", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/50e53ad5-8693-44c1-b5c7-45b91d7e0ae4.json b/data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/50e53ad5-8693-44c1-b5c7-45b91d7e0ae4.json deleted file mode 100644 index 6cc9b0ce2..000000000 --- a/data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/50e53ad5-8693-44c1-b5c7-45b91d7e0ae4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_L3-70B-Euryale-v2.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-70B-Euryale-v2.1", - "id": "Sao10K/L3-70B-Euryale-v2.1", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7384 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6471 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2137 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5104 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/bda5d02f-7973-41a3-8f8e-4e33a12b74e0.json b/data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/bda5d02f-7973-41a3-8f8e-4e33a12b74e0.json deleted file mode 100644 index f445ab801..000000000 --- a/data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/bda5d02f-7973-41a3-8f8e-4e33a12b74e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_L3-70B-Euryale-v2.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-70B-Euryale-v2.1", - "id": "Sao10K/L3-70B-Euryale-v2.1", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7281 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6503 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5096 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/L3-8B-Lunaris-v1/99ff5ca5-4409-4d9c-9ec0-4cf392afeff2.json b/data/hfopenllm_v2/Sao10K/L3-8B-Lunaris-v1/99ff5ca5-4409-4d9c-9ec0-4cf392afeff2.json deleted file mode 100644 index a7a1e4096..000000000 --- a/data/hfopenllm_v2/Sao10K/L3-8B-Lunaris-v1/99ff5ca5-4409-4d9c-9ec0-4cf392afeff2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_L3-8B-Lunaris-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-Lunaris-v1", - "id": "Sao10K/L3-8B-Lunaris-v1", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6895 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/L3-8B-Niitama-v1/362f5875-4dbc-4e68-90ce-789f692bb533.json b/data/hfopenllm_v2/Sao10K/L3-8B-Niitama-v1/362f5875-4dbc-4e68-90ce-789f692bb533.json deleted file mode 100644 index 70117da5c..000000000 --- a/data/hfopenllm_v2/Sao10K/L3-8B-Niitama-v1/362f5875-4dbc-4e68-90ce-789f692bb533.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_L3-8B-Niitama-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-Niitama-v1", - "id": "Sao10K/L3-8B-Niitama-v1", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5303 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.2/fdb5faf6-2cdd-42bb-b154-d6e93b2348bf.json b/data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.2/fdb5faf6-2cdd-42bb-b154-d6e93b2348bf.json deleted file mode 100644 index cb2eabee5..000000000 --- a/data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.2/fdb5faf6-2cdd-42bb-b154-d6e93b2348bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_L3-8B-Stheno-v3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-Stheno-v3.2", - "id": "Sao10K/L3-8B-Stheno-v3.2", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6873 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5228 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3768 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.3-32K/93f829b8-b8d9-4389-a210-2a38c3a30edb.json b/data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.3-32K/93f829b8-b8d9-4389-a210-2a38c3a30edb.json deleted file mode 100644 index 317e3d07a..000000000 --- a/data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.3-32K/93f829b8-b8d9-4389-a210-2a38c3a30edb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_L3-8B-Stheno-v3.3-32K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-Stheno-v3.3-32K", - "id": "Sao10K/L3-8B-Stheno-v3.3-32K", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1896 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/MN-12B-Lyra-v3/6ec3554d-377b-4bf6-88ef-8a4c9e70f485.json b/data/hfopenllm_v2/Sao10K/MN-12B-Lyra-v3/6ec3554d-377b-4bf6-88ef-8a4c9e70f485.json deleted file mode 100644 index 6ca309c31..000000000 --- 
a/data/hfopenllm_v2/Sao10K/MN-12B-Lyra-v3/6ec3554d-377b-4bf6-88ef-8a4c9e70f485.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_MN-12B-Lyra-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Lyra-v3", - "id": "Sao10K/MN-12B-Lyra-v3", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4804 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V1-32B/70d749cf-2e92-4847-86de-7964fc8eb990.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V1-32B/70d749cf-2e92-4847-86de-7964fc8eb990.json deleted file mode 100644 index 991d8a526..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V1-32B/70d749cf-2e92-4847-86de-7964fc8eb990.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Avengers-V1-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Avengers-V1-32B", - "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V1-32B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.76 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7972 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7001 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5793 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V2-32B/623f2b04-6cd7-4ea0-8844-badb0ff6c9c6.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V2-32B/623f2b04-6cd7-4ea0-8844-badb0ff6c9c6.json deleted file mode 100644 index f34b82b65..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V2-32B/623f2b04-6cd7-4ea0-8844-badb0ff6c9c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Avengers-V2-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF 
Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Avengers-V2-32B", - "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V2-32B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.76 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7956 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7023 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V3-32B/e1aca741-2765-4e47-b6a1-49f3d9532432.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V3-32B/e1aca741-2765-4e47-b6a1-49f3d9532432.json deleted file mode 100644 index 32459a45a..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V3-32B/e1aca741-2765-4e47-b6a1-49f3d9532432.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Avengers-V3-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Linkbricks-Horizon-AI-Avengers-V3-32B", - "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V3-32B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6913 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V4-32B/4f42366e-e6aa-4974-9a40-5781e350616d.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V4-32B/4f42366e-e6aa-4974-9a40-5781e350616d.json deleted file mode 100644 index cba352636..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V4-32B/4f42366e-e6aa-4974-9a40-5781e350616d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Avengers-V4-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Avengers-V4-32B", - "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V4-32B", - "developer": "Saxo", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7631 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.692 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4643 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V5-32B/4ec2231d-c012-4ad3-830c-8ff86c977202.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V5-32B/4ec2231d-c012-4ad3-830c-8ff86c977202.json deleted file mode 100644 index 9e429543e..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V5-32B/4ec2231d-c012-4ad3-830c-8ff86c977202.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Avengers-V5-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Avengers-V5-32B", - "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V5-32B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7516 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6929 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4709 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5762 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V6-32B/1d2e5513-bd0c-4795-8487-f5266c6e368f.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V6-32B/1d2e5513-bd0c-4795-8487-f5266c6e368f.json deleted file mode 100644 index 1faf938ef..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V6-32B/1d2e5513-bd0c-4795-8487-f5266c6e368f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Avengers-V6-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Avengers-V6-32B", - "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V6-32B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.76 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8209 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4274 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V2-27B/104172b7-86f5-410a-a454-63e1cfbeb87f.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V2-27B/104172b7-86f5-410a-a454-63e1cfbeb87f.json deleted file mode 100644 index f8ce92b90..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V2-27B/104172b7-86f5-410a-a454-63e1cfbeb87f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Korean-Avengers-V2-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Korean-Avengers-V2-27B", - "id": "Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V2-27B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.8146 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6463 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V3-27B/d28e04ac-7d18-43fb-80b8-82c0662fec79.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V3-27B/d28e04ac-7d18-43fb-80b8-82c0662fec79.json deleted file mode 100644 index 2df6b6d76..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V3-27B/d28e04ac-7d18-43fb-80b8-82c0662fec79.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Korean-Avengers-V3-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Korean-Avengers-V3-27B", - "id": "Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V3-27B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8142 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6404 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4467 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-22B/20bb3819-9d85-4d84-99ba-65e33965f0c5.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-22B/20bb3819-9d85-4d84-99ba-65e33965f0c5.json deleted file mode 100644 index f703921c4..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-22B/20bb3819-9d85-4d84-99ba-65e33965f0c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Korean-Superb-22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Korean-Superb-22B", - "id": "Saxo/Linkbricks-Horizon-AI-Korean-Superb-22B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6767 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2372 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3908 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-27B/3a4bdf58-0137-4d85-b567-59b3fed3dad5.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-27B/3a4bdf58-0137-4d85-b567-59b3fed3dad5.json deleted file mode 100644 index d43e428ac..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-27B/3a4bdf58-0137-4d85-b567-59b3fed3dad5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Korean-Superb-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Korean-Superb-27B", - "id": "Saxo/Linkbricks-Horizon-AI-Korean-Superb-27B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6518 - } - }, 
- { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4791 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Superb-27B/04f843ba-947c-4732-979c-2aeae7d34e5a.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Superb-27B/04f843ba-947c-4732-979c-2aeae7d34e5a.json deleted file mode 100644 index f7e4a247b..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Superb-27B/04f843ba-947c-4732-979c-2aeae7d34e5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Superb-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Superb-27B", - "id": "Saxo/Linkbricks-Horizon-AI-Superb-27B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6186 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2221 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.465 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2.5/173a31d3-7d12-4ab1-a963-005a81aee767.json b/data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2.5/173a31d3-7d12-4ab1-a963-005a81aee767.json deleted file mode 100644 index 03259aebd..000000000 --- a/data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2.5/173a31d3-7d12-4ab1-a963-005a81aee767.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SeaLLMs_SeaLLM-7B-v2.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeaLLM-7B-v2.5", - "id": "SeaLLMs/SeaLLM-7B-v2.5", - "developer": "SeaLLMs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4522 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3203 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2/d0555736-b614-43ca-91d7-8264e3566872.json b/data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2/d0555736-b614-43ca-91d7-8264e3566872.json deleted file mode 100644 index 98630be5e..000000000 --- a/data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2/d0555736-b614-43ca-91d7-8264e3566872.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SeaLLMs_SeaLLM-7B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeaLLM-7B-v2", - "id": "SeaLLMs/SeaLLM-7B-v2", - "developer": "SeaLLMs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.376 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 
- } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3083 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SeaLLMs/SeaLLMs-v3-7B-Chat/4b7b13b7-4aee-4462-87e6-aa6c15068236.json b/data/hfopenllm_v2/SeaLLMs/SeaLLMs-v3-7B-Chat/4b7b13b7-4aee-4462-87e6-aa6c15068236.json deleted file mode 100644 index 05981a658..000000000 --- a/data/hfopenllm_v2/SeaLLMs/SeaLLMs-v3-7B-Chat/4b7b13b7-4aee-4462-87e6-aa6c15068236.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SeaLLMs_SeaLLMs-v3-7B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeaLLMs-v3-7B-Chat", - "id": "SeaLLMs/SeaLLMs-v3-7B-Chat", - "developer": "SeaLLMs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4377 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3895 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SenseLLM/ReflectionCoder-CL-34B/4b1f9ce5-bb12-42e3-b0e0-afaa784b0c4c.json b/data/hfopenllm_v2/SenseLLM/ReflectionCoder-CL-34B/4b1f9ce5-bb12-42e3-b0e0-afaa784b0c4c.json deleted file mode 100644 index e86dab4c2..000000000 --- a/data/hfopenllm_v2/SenseLLM/ReflectionCoder-CL-34B/4b1f9ce5-bb12-42e3-b0e0-afaa784b0c4c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SenseLLM_ReflectionCoder-CL-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReflectionCoder-CL-34B", - "id": "SenseLLM/ReflectionCoder-CL-34B", - "developer": "SenseLLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 33.744 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4008 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3953 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SenseLLM/ReflectionCoder-DS-33B/acbcd5a5-bcd8-4209-b35f-425feada7e8b.json b/data/hfopenllm_v2/SenseLLM/ReflectionCoder-DS-33B/acbcd5a5-bcd8-4209-b35f-425feada7e8b.json deleted file mode 100644 index 20a36d5af..000000000 --- a/data/hfopenllm_v2/SenseLLM/ReflectionCoder-DS-33B/acbcd5a5-bcd8-4209-b35f-425feada7e8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SenseLLM_ReflectionCoder-DS-33B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReflectionCoder-DS-33B", - "id": "SenseLLM/ReflectionCoder-DS-33B", - "developer": "SenseLLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 33.34 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1202 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SentientAGI/Dobby-Mini-Leashed-Llama-3.1-8B/cb9a415f-1a02-46ad-a731-bf825ddd78ae.json b/data/hfopenllm_v2/SentientAGI/Dobby-Mini-Leashed-Llama-3.1-8B/cb9a415f-1a02-46ad-a731-bf825ddd78ae.json deleted file mode 100644 index df38f37c7..000000000 --- a/data/hfopenllm_v2/SentientAGI/Dobby-Mini-Leashed-Llama-3.1-8B/cb9a415f-1a02-46ad-a731-bf825ddd78ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SentientAGI_Dobby-Mini-Leashed-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dobby-Mini-Leashed-Llama-3.1-8B", - "id": "SentientAGI/Dobby-Mini-Leashed-Llama-3.1-8B", - "developer": "SentientAGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7847 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5138 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3694 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/SentientAGI/Dobby-Mini-Unhinged-Llama-3.1-8B/92cde6db-47f4-43c6-9ad5-643c35faa226.json b/data/hfopenllm_v2/SentientAGI/Dobby-Mini-Unhinged-Llama-3.1-8B/92cde6db-47f4-43c6-9ad5-643c35faa226.json deleted file mode 100644 index 771f354ce..000000000 --- a/data/hfopenllm_v2/SentientAGI/Dobby-Mini-Unhinged-Llama-3.1-8B/92cde6db-47f4-43c6-9ad5-643c35faa226.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SentientAGI_Dobby-Mini-Unhinged-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dobby-Mini-Unhinged-Llama-3.1-8B", - "id": "SentientAGI/Dobby-Mini-Unhinged-Llama-3.1-8B", - "developer": "SentientAGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4013 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3585 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SeppeV/SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo/5e88a037-f9bd-4b39-944f-f0781bb7884f.json 
b/data/hfopenllm_v2/SeppeV/SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo/5e88a037-f9bd-4b39-944f-f0781bb7884f.json deleted file mode 100644 index d4f48f999..000000000 --- a/data/hfopenllm_v2/SeppeV/SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo/5e88a037-f9bd-4b39-944f-f0781bb7884f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SeppeV_SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo", - "id": "SeppeV/SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo", - "developer": "SeppeV", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0955 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1161 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sharathhebbar24/SSH_355M/d4b08f5d-5add-49f4-b8db-c1a12e0a5313.json 
b/data/hfopenllm_v2/Sharathhebbar24/SSH_355M/d4b08f5d-5add-49f4-b8db-c1a12e0a5313.json deleted file mode 100644 index ab1e96d43..000000000 --- a/data/hfopenllm_v2/Sharathhebbar24/SSH_355M/d4b08f5d-5add-49f4-b8db-c1a12e0a5313.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sharathhebbar24_SSH_355M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SSH_355M", - "id": "Sharathhebbar24/SSH_355M", - "developer": "Sharathhebbar24", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.355 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3099 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1176 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sharathhebbar24/chat_gpt2_dpo/ac5adf39-f0a4-439b-9873-9141e0a554b1.json b/data/hfopenllm_v2/Sharathhebbar24/chat_gpt2_dpo/ac5adf39-f0a4-439b-9873-9141e0a554b1.json deleted file mode 100644 index 3c6c53f61..000000000 --- a/data/hfopenllm_v2/Sharathhebbar24/chat_gpt2_dpo/ac5adf39-f0a4-439b-9873-9141e0a554b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ 
- "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sharathhebbar24_chat_gpt2_dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "chat_gpt2_dpo", - "id": "Sharathhebbar24/chat_gpt2_dpo", - "developer": "Sharathhebbar24", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2902 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Shreyash2010/Uma-4x4B-Instruct-v0.1/62965c92-cdf4-4a3b-b035-990abaab615c.json b/data/hfopenllm_v2/Shreyash2010/Uma-4x4B-Instruct-v0.1/62965c92-cdf4-4a3b-b035-990abaab615c.json deleted file mode 100644 index 5d50114d7..000000000 --- a/data/hfopenllm_v2/Shreyash2010/Uma-4x4B-Instruct-v0.1/62965c92-cdf4-4a3b-b035-990abaab615c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Shreyash2010_Uma-4x4B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Uma-4x4B-Instruct-v0.1", - "id": "Shreyash2010/Uma-4x4B-Instruct-v0.1", - "developer": "Shreyash2010", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5517 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sicarius-Prototyping/Brainy_LLAMA/3866ece8-d70a-4061-9e86-0798ecd98bd6.json b/data/hfopenllm_v2/Sicarius-Prototyping/Brainy_LLAMA/3866ece8-d70a-4061-9e86-0798ecd98bd6.json deleted file mode 100644 index 26ba36c75..000000000 --- a/data/hfopenllm_v2/Sicarius-Prototyping/Brainy_LLAMA/3866ece8-d70a-4061-9e86-0798ecd98bd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sicarius-Prototyping_Brainy_LLAMA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Brainy_LLAMA", - "id": "Sicarius-Prototyping/Brainy_LLAMA", - "developer": 
"Sicarius-Prototyping", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5117 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3849 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sicarius-Prototyping/Micropenis_1B/ff484d0e-bb14-4a80-ae29-2351b03cf278.json b/data/hfopenllm_v2/Sicarius-Prototyping/Micropenis_1B/ff484d0e-bb14-4a80-ae29-2351b03cf278.json deleted file mode 100644 index 61ea14597..000000000 --- a/data/hfopenllm_v2/Sicarius-Prototyping/Micropenis_1B/ff484d0e-bb14-4a80-ae29-2351b03cf278.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sicarius-Prototyping_Micropenis_1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Micropenis_1B", - "id": "Sicarius-Prototyping/Micropenis_1B", - "developer": "Sicarius-Prototyping", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.618 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3325 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.186 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sicarius-Prototyping/bacon_and_food/06ac1718-fe71-4e05-a47f-1200e067336c.json b/data/hfopenllm_v2/Sicarius-Prototyping/bacon_and_food/06ac1718-fe71-4e05-a47f-1200e067336c.json deleted file mode 100644 index a582b1eff..000000000 --- a/data/hfopenllm_v2/Sicarius-Prototyping/bacon_and_food/06ac1718-fe71-4e05-a47f-1200e067336c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sicarius-Prototyping_bacon_and_food/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bacon_and_food", - "id": "Sicarius-Prototyping/bacon_and_food", - "developer": "Sicarius-Prototyping", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/2B-ad/4ddb1616-7889-45ef-96de-823fee338e1d.json b/data/hfopenllm_v2/SicariusSicariiStuff/2B-ad/4ddb1616-7889-45ef-96de-823fee338e1d.json deleted file mode 100644 index 01e28f2ac..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/2B-ad/4ddb1616-7889-45ef-96de-823fee338e1d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_2B-ad/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2B-ad", - "id": "SicariusSicariiStuff/2B-ad", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 3.204 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2662 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/2B_or_not_2B/487dd91b-5bc4-4355-90d3-c82ecc789ab3.json b/data/hfopenllm_v2/SicariusSicariiStuff/2B_or_not_2B/487dd91b-5bc4-4355-90d3-c82ecc789ab3.json deleted file mode 100644 index 55d96ac62..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/2B_or_not_2B/487dd91b-5bc4-4355-90d3-c82ecc789ab3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_2B_or_not_2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2B_or_not_2B", - "id": "SicariusSicariiStuff/2B_or_not_2B", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.3416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Dusk_Rainbow/a74e86d9-8b94-4f60-8f0c-73cc4b04d905.json b/data/hfopenllm_v2/SicariusSicariiStuff/Dusk_Rainbow/a74e86d9-8b94-4f60-8f0c-73cc4b04d905.json deleted file mode 100644 index 0d3c0a4eb..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Dusk_Rainbow/a74e86d9-8b94-4f60-8f0c-73cc4b04d905.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Dusk_Rainbow/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dusk_Rainbow", - "id": "SicariusSicariiStuff/Dusk_Rainbow", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3588 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4772 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4025 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3443 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Eximius_Persona_5B/9a9239ab-9e0e-449b-bd1b-6ec280fad505.json b/data/hfopenllm_v2/SicariusSicariiStuff/Eximius_Persona_5B/9a9239ab-9e0e-449b-bd1b-6ec280fad505.json deleted file mode 100644 index 0bf2ac15d..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Eximius_Persona_5B/9a9239ab-9e0e-449b-bd1b-6ec280fad505.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Eximius_Persona_5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Eximius_Persona_5B", - "id": "SicariusSicariiStuff/Eximius_Persona_5B", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 5.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_LLAMA_3B/2c710cd5-75a6-46b7-8356-212da7bf864d.json b/data/hfopenllm_v2/SicariusSicariiStuff/Impish_LLAMA_3B/2c710cd5-75a6-46b7-8356-212da7bf864d.json deleted file mode 100644 index a3193370f..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_LLAMA_3B/2c710cd5-75a6-46b7-8356-212da7bf864d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Impish_LLAMA_3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Impish_LLAMA_3B", - "id": "SicariusSicariiStuff/Impish_LLAMA_3B", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2941 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_Mind_8B/377d5240-73b5-48d0-bbdc-0960ad1d9069.json b/data/hfopenllm_v2/SicariusSicariiStuff/Impish_Mind_8B/377d5240-73b5-48d0-bbdc-0960ad1d9069.json deleted file mode 100644 index 4fcce76c3..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_Mind_8B/377d5240-73b5-48d0-bbdc-0960ad1d9069.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Impish_Mind_8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Impish_Mind_8B", - "id": "SicariusSicariiStuff/Impish_Mind_8B", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3179 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4674 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - 
}, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3309 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_14B-1M/9f31a6da-c5bd-4143-b2f9-715c0e9f7b74.json b/data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_14B-1M/9f31a6da-c5bd-4143-b2f9-715c0e9f7b74.json deleted file mode 100644 index 644abf1f0..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_14B-1M/9f31a6da-c5bd-4143-b2f9-715c0e9f7b74.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Impish_QWEN_14B-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Impish_QWEN_14B-1M", - "id": "SicariusSicariiStuff/Impish_QWEN_14B-1M", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7868 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6283 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4615 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_7B-1M/104a0157-c614-44cf-b6cc-9f15dab4b187.json b/data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_7B-1M/104a0157-c614-44cf-b6cc-9f15dab4b187.json deleted file mode 100644 index 30b1d2ab9..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_7B-1M/104a0157-c614-44cf-b6cc-9f15dab4b187.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Impish_QWEN_7B-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Impish_QWEN_7B-1M", - "id": "SicariusSicariiStuff/Impish_QWEN_7B-1M", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5372 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4074 - } - 
}, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4265 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/LLAMA-3_8B_Unaligned_BETA/bb379093-c169-44bd-ac86-edb8ab8fc225.json b/data/hfopenllm_v2/SicariusSicariiStuff/LLAMA-3_8B_Unaligned_BETA/bb379093-c169-44bd-ac86-edb8ab8fc225.json deleted file mode 100644 index be20ff82c..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/LLAMA-3_8B_Unaligned_BETA/bb379093-c169-44bd-ac86-edb8ab8fc225.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_LLAMA-3_8B_Unaligned_BETA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLAMA-3_8B_Unaligned_BETA", - "id": "SicariusSicariiStuff/LLAMA-3_8B_Unaligned_BETA", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4717 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Phi-Line_14B/e29001c0-17c0-4deb-8ca2-ce9ad06d8cb3.json b/data/hfopenllm_v2/SicariusSicariiStuff/Phi-Line_14B/e29001c0-17c0-4deb-8ca2-ce9ad06d8cb3.json deleted file mode 100644 index d361cd930..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Phi-Line_14B/e29001c0-17c0-4deb-8ca2-ce9ad06d8cb3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Phi-Line_14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-Line_14B", - "id": "SicariusSicariiStuff/Phi-Line_14B", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.5454 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Phi-lthy4/43d87bf5-2620-4f8e-a8b6-f86fc157d987.json b/data/hfopenllm_v2/SicariusSicariiStuff/Phi-lthy4/43d87bf5-2620-4f8e-a8b6-f86fc157d987.json deleted file mode 100644 index 9e1a945d7..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Phi-lthy4/43d87bf5-2620-4f8e-a8b6-f86fc157d987.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Phi-lthy4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-lthy4", - "id": "SicariusSicariiStuff/Phi-lthy4", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 11.933 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5879 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4083 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4333 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncencored/735d9d75-d9d1-4553-b7cf-f8e7c2e65218.json 
b/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncencored/735d9d75-d9d1-4553-b7cf-f8e7c2e65218.json deleted file mode 100644 index becab3e7a..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncencored/735d9d75-d9d1-4553-b7cf-f8e7c2e65218.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Qwen2.5-14B_Uncencored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B_Uncencored", - "id": "SicariusSicariiStuff/Qwen2.5-14B_Uncencored", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3158 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6309 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored/0c6dcc87-343c-4973-a589-3e3393829184.json b/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored/0c6dcc87-343c-4973-a589-3e3393829184.json deleted file mode 100644 index ffe8535f1..000000000 --- 
a/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored/0c6dcc87-343c-4973-a589-3e3393829184.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Qwen2.5-14B_Uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B_Uncensored", - "id": "SicariusSicariiStuff/Qwen2.5-14B_Uncensored", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3173 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6309 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored_Instruct/7c1d1657-e9ae-433f-be9d-523431bfc7ae.json b/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored_Instruct/7c1d1657-e9ae-433f-be9d-523431bfc7ae.json deleted file mode 100644 index 79cd730d2..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored_Instruct/7c1d1657-e9ae-433f-be9d-523431bfc7ae.json +++ /dev/null @@ -1,132 +0,0 @@ 
-{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Qwen2.5-14B_Uncensored_Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B_Uncensored_Instruct", - "id": "SicariusSicariiStuff/Qwen2.5-14B_Uncensored_Instruct", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Redemption_Wind_24B/0b2d9a65-c028-4f4b-a280-dc0c35ac9516.json b/data/hfopenllm_v2/SicariusSicariiStuff/Redemption_Wind_24B/0b2d9a65-c028-4f4b-a280-dc0c35ac9516.json deleted file mode 100644 index 9778a9581..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Redemption_Wind_24B/0b2d9a65-c028-4f4b-a280-dc0c35ac9516.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Redemption_Wind_24B/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Redemption_Wind_24B", - "id": "SicariusSicariiStuff/Redemption_Wind_24B", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2501 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6428 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5432 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Winged_Imp_8B/e87e1d3f-1476-499d-a9f3-b6463b429262.json b/data/hfopenllm_v2/SicariusSicariiStuff/Winged_Imp_8B/e87e1d3f-1476-499d-a9f3-b6463b429262.json deleted file mode 100644 index d8868496d..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Winged_Imp_8B/e87e1d3f-1476-499d-a9f3-b6463b429262.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Winged_Imp_8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Winged_Imp_8B", - "id": "SicariusSicariiStuff/Winged_Imp_8B", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3639 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Wingless_Imp_8B/246e8450-3c53-4bde-99bb-5663f751e88e.json b/data/hfopenllm_v2/SicariusSicariiStuff/Wingless_Imp_8B/246e8450-3c53-4bde-99bb-5663f751e88e.json deleted file mode 100644 index 822e85e02..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Wingless_Imp_8B/246e8450-3c53-4bde-99bb-5663f751e88e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Wingless_Imp_8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Wingless_Imp_8B", - "id": "SicariusSicariiStuff/Wingless_Imp_8B", - "developer": "SicariusSicariiStuff", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3639 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Zion_Alpha/496b9e45-2f64-456e-b35e-12a94c5643b1.json b/data/hfopenllm_v2/SicariusSicariiStuff/Zion_Alpha/496b9e45-2f64-456e-b35e-12a94c5643b1.json deleted file mode 100644 index 87bd4943e..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Zion_Alpha/496b9e45-2f64-456e-b35e-12a94c5643b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Zion_Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zion_Alpha", - "id": "SicariusSicariiStuff/Zion_Alpha", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3324 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4932 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/dn_ep02/05890047-a95a-433e-b6b6-fb037592cdd1.json b/data/hfopenllm_v2/SicariusSicariiStuff/dn_ep02/05890047-a95a-433e-b6b6-fb037592cdd1.json deleted file mode 100644 index 1653d6cec..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/dn_ep02/05890047-a95a-433e-b6b6-fb037592cdd1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_dn_ep02/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dn_ep02", - "id": "SicariusSicariiStuff/dn_ep02", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.142 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora-epoch1/4a30580c-1d25-49d4-984d-2d28ef3a5656.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora-epoch1/4a30580c-1d25-49d4-984d-2d28ef3a5656.json deleted file mode 100644 index 79a1347e0..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora-epoch1/4a30580c-1d25-49d4-984d-2d28ef3a5656.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.1-8B-lora-epoch1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.1-8B-lora-epoch1", - "id": "SkyOrbis/SKY-Ko-Llama3.1-8B-lora-epoch1", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5058 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5088 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora/696d7966-d140-4f43-91df-54f02247b34f.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora/696d7966-d140-4f43-91df-54f02247b34f.json deleted file mode 100644 index 3e699beeb..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora/696d7966-d140-4f43-91df-54f02247b34f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.1-8B-lora/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.1-8B-lora", - "id": "SkyOrbis/SKY-Ko-Llama3.1-8B-lora", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5058 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5088 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch3/fdf10ab8-e3f9-49e6-8fd0-ed116868c217.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch3/fdf10ab8-e3f9-49e6-8fd0-ed116868c217.json deleted file mode 100644 index cc2e232f1..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch3/fdf10ab8-e3f9-49e6-8fd0-ed116868c217.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-epoch3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.2-1B-lora-epoch3", - "id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch3", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1279 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch5/9ac16d1f-d894-414d-8a14-110e971d0ba6.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch5/9ac16d1f-d894-414d-8a14-110e971d0ba6.json deleted file mode 100644 index c6e0028d4..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch5/9ac16d1f-d894-414d-8a14-110e971d0ba6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-epoch5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.2-1B-lora-epoch5", - "id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch5", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3471 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch3/2eb01e0e-8f7b-4956-9a2d-b32ecaa936f6.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch3/2eb01e0e-8f7b-4956-9a2d-b32ecaa936f6.json deleted file mode 100644 index 13bfe0eca..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch3/2eb01e0e-8f7b-4956-9a2d-b32ecaa936f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-v2-epoch3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.2-1B-lora-v2-epoch3", - "id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch3", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3471 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch5/3b221b0e-6158-471f-bcd2-b09514f28bd7.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch5/3b221b0e-6158-471f-bcd2-b09514f28bd7.json deleted file mode 100644 index e22db53cd..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch5/3b221b0e-6158-471f-bcd2-b09514f28bd7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-v2-epoch5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.2-1B-lora-v2-epoch5", - "id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch5", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4247 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3458 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch1/c8af8428-aab6-4d19-b185-2b437c0334fa.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch1/c8af8428-aab6-4d19-b185-2b437c0334fa.json deleted file mode 100644 index e29268df8..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch1/c8af8428-aab6-4d19-b185-2b437c0334fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.2-3B-lora-epoch1", - "id": "SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch1", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch2/c617d12b-c37f-47ef-9704-e19774c67aeb.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch2/c617d12b-c37f-47ef-9704-e19774c67aeb.json deleted file mode 100644 index 69fae55da..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch2/c617d12b-c37f-47ef-9704-e19774c67aeb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.2-3B-lora-epoch2", - "id": "SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch2", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch3/577f31e2-1808-45e2-a528-5933019cfa85.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch3/577f31e2-1808-45e2-a528-5933019cfa85.json deleted file mode 100644 index 44054c6cc..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch3/577f31e2-1808-45e2-a528-5933019cfa85.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.2-3B-lora-epoch3", - "id": "SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch3", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-3B-Instruct/7bd7f5c8-be9e-473e-be18-03ad22a195ee.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-3B-Instruct/7bd7f5c8-be9e-473e-be18-03ad22a195ee.json deleted file mode 100644 index db8427937..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-3B-Instruct/7bd7f5c8-be9e-473e-be18-03ad22a195ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Qwen2.5-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Qwen2.5-3B-Instruct", - "id": "SkyOrbis/SKY-Ko-Qwen2.5-3B-Instruct", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3534 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4024 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.2812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000/5036a549-5583-4775-935a-1a12b6de3e7d.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000/5036a549-5583-4775-935a-1a12b6de3e7d.json deleted file mode 100644 index ce701d5f4..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000/5036a549-5583-4775-935a-1a12b6de3e7d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000", - "id": "SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5078 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1866 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000/5c0ffff9-542c-424e-88e9-89584e686e12.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000/5c0ffff9-542c-424e-88e9-89584e686e12.json deleted file mode 100644 index 08f3d6990..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000/5c0ffff9-542c-424e-88e9-89584e686e12.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000", - "id": "SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4238 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4238 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/5c6a045d-2c90-4938-9185-9c1a0f82903a.json 
b/data/hfopenllm_v2/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/5c6a045d-2c90-4938-9185-9c1a0f82903a.json deleted file mode 100644 index ef679628f..000000000 --- a/data/hfopenllm_v2/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/5c6a045d-2c90-4938-9185-9c1a0f82903a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork-Reward-Gemma-2-27B-v0.2", - "id": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2", - "developer": "Skywork", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForSequenceClassification", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7807 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.636 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Skywork/Skywork-o1-Open-Llama-3.1-8B/02480176-2058-4e71-a970-9698be8d235e.json b/data/hfopenllm_v2/Skywork/Skywork-o1-Open-Llama-3.1-8B/02480176-2058-4e71-a970-9698be8d235e.json deleted file mode 100644 index 92fd0cf95..000000000 --- 
a/data/hfopenllm_v2/Skywork/Skywork-o1-Open-Llama-3.1-8B/02480176-2058-4e71-a970-9698be8d235e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Skywork_Skywork-o1-Open-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork-o1-Open-Llama-3.1-8B", - "id": "Skywork/Skywork-o1-Open-Llama-3.1-8B", - "developer": "Skywork", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.203 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Solshine/Brimful-merged-replete/4be1e5b4-254c-4287-907d-cc845042de37.json b/data/hfopenllm_v2/Solshine/Brimful-merged-replete/4be1e5b4-254c-4287-907d-cc845042de37.json deleted file mode 100644 index 8374a1a8d..000000000 --- a/data/hfopenllm_v2/Solshine/Brimful-merged-replete/4be1e5b4-254c-4287-907d-cc845042de37.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Solshine_Brimful-merged-replete/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Brimful-merged-replete", - "id": "Solshine/Brimful-merged-replete", - "developer": "Solshine", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 12.277 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1761 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2883 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1085 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Solshine/Llama-3-1-big-thoughtful-passthrough-merge-2/21b51852-5cad-414e-92d5-31878f025d67.json b/data/hfopenllm_v2/Solshine/Llama-3-1-big-thoughtful-passthrough-merge-2/21b51852-5cad-414e-92d5-31878f025d67.json deleted file mode 100644 index 92595ba08..000000000 --- a/data/hfopenllm_v2/Solshine/Llama-3-1-big-thoughtful-passthrough-merge-2/21b51852-5cad-414e-92d5-31878f025d67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Solshine_Llama-3-1-big-thoughtful-passthrough-merge-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-1-big-thoughtful-passthrough-merge-2", - "id": "Solshine/Llama-3-1-big-thoughtful-passthrough-merge-2", - "developer": "Solshine", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 18.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2547 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3889 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1185 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sorawiz/Gemma-9B-Base/9eb07d4a-1f01-4696-9137-d477ffca43be.json b/data/hfopenllm_v2/Sorawiz/Gemma-9B-Base/9eb07d4a-1f01-4696-9137-d477ffca43be.json deleted file mode 100644 index 57b9d2ddf..000000000 --- a/data/hfopenllm_v2/Sorawiz/Gemma-9B-Base/9eb07d4a-1f01-4696-9137-d477ffca43be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sorawiz_Gemma-9B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-9B-Base", - "id": 
"Sorawiz/Gemma-9B-Base", - "developer": "Sorawiz", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sorawiz/Gemma-Creative-9B-Base/4236485b-aa92-4bc4-a652-17ed3231ecf4.json b/data/hfopenllm_v2/Sorawiz/Gemma-Creative-9B-Base/4236485b-aa92-4bc4-a652-17ed3231ecf4.json deleted file mode 100644 index 5a311b551..000000000 --- a/data/hfopenllm_v2/Sorawiz/Gemma-Creative-9B-Base/4236485b-aa92-4bc4-a652-17ed3231ecf4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sorawiz_Gemma-Creative-9B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-Creative-9B-Base", - "id": "Sorawiz/Gemma-Creative-9B-Base", - "developer": "Sorawiz", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ 
- { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5459 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4008 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sourjayon/DeepSeek-R1-8b-Sify/9c0d6b71-8c6a-4294-961c-972a002b847f.json b/data/hfopenllm_v2/Sourjayon/DeepSeek-R1-8b-Sify/9c0d6b71-8c6a-4294-961c-972a002b847f.json deleted file mode 100644 index 509be1166..000000000 --- a/data/hfopenllm_v2/Sourjayon/DeepSeek-R1-8b-Sify/9c0d6b71-8c6a-4294-961c-972a002b847f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sourjayon_DeepSeek-R1-8b-Sify/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-8b-Sify", - "id": "Sourjayon/DeepSeek-R1-8b-Sify", - "developer": "Sourjayon", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3379 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1981 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sourjayon/DeepSeek-R1-ForumNXT/d1e906d5-8f0d-49c2-88c3-cf71774de600.json b/data/hfopenllm_v2/Sourjayon/DeepSeek-R1-ForumNXT/d1e906d5-8f0d-49c2-88c3-cf71774de600.json deleted file mode 100644 index e4a1e60b8..000000000 --- a/data/hfopenllm_v2/Sourjayon/DeepSeek-R1-ForumNXT/d1e906d5-8f0d-49c2-88c3-cf71774de600.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sourjayon_DeepSeek-R1-ForumNXT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-ForumNXT", - "id": "Sourjayon/DeepSeek-R1-ForumNXT", - "developer": "Sourjayon", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2603 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3392 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1648 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SpaceYL/ECE_Poirot/798e4f83-6262-4d5b-a854-6ff114167209.json b/data/hfopenllm_v2/SpaceYL/ECE_Poirot/798e4f83-6262-4d5b-a854-6ff114167209.json deleted file mode 100644 index fa2400717..000000000 --- a/data/hfopenllm_v2/SpaceYL/ECE_Poirot/798e4f83-6262-4d5b-a854-6ff114167209.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SpaceYL_ECE_Poirot/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE_Poirot", - "id": "SpaceYL/ECE_Poirot", - "developer": "SpaceYL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2883 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Spestly/Athena-1-3B/dd2603d5-e99e-4778-95d0-159c788626cf.json b/data/hfopenllm_v2/Spestly/Athena-1-3B/dd2603d5-e99e-4778-95d0-159c788626cf.json deleted file mode 100644 index da9add401..000000000 --- a/data/hfopenllm_v2/Spestly/Athena-1-3B/dd2603d5-e99e-4778-95d0-159c788626cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Spestly_Athena-1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Athena-1-3B", - "id": "Spestly/Athena-1-3B", - "developer": "Spestly", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5569 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4702 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2379 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4362 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3519 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Spestly/Atlas-Pro-1.5B-Preview/41c71990-e79d-447f-b082-63c96fd67a1f.json b/data/hfopenllm_v2/Spestly/Atlas-Pro-1.5B-Preview/41c71990-e79d-447f-b082-63c96fd67a1f.json deleted file mode 100644 index 81aa9d6c5..000000000 --- a/data/hfopenllm_v2/Spestly/Atlas-Pro-1.5B-Preview/41c71990-e79d-447f-b082-63c96fd67a1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Spestly_Atlas-Pro-1.5B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Atlas-Pro-1.5B-Preview", - "id": "Spestly/Atlas-Pro-1.5B-Preview", - "developer": "Spestly", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.243 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3499 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1925 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Spestly/Atlas-Pro-7B-Preview/b9e25948-2871-4b6c-933b-8a731e48e81b.json b/data/hfopenllm_v2/Spestly/Atlas-Pro-7B-Preview/b9e25948-2871-4b6c-933b-8a731e48e81b.json deleted file mode 100644 index 5b58d6fbc..000000000 --- a/data/hfopenllm_v2/Spestly/Atlas-Pro-7B-Preview/b9e25948-2871-4b6c-933b-8a731e48e81b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Spestly_Atlas-Pro-7B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Atlas-Pro-7B-Preview", - "id": "Spestly/Atlas-Pro-7B-Preview", - "developer": "Spestly", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4668 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Stark2008/GutenLaserPi/7c70df74-2bc2-40e0-b0f4-77be1a7e044c.json b/data/hfopenllm_v2/Stark2008/GutenLaserPi/7c70df74-2bc2-40e0-b0f4-77be1a7e044c.json deleted file mode 100644 index 1770ffa03..000000000 --- a/data/hfopenllm_v2/Stark2008/GutenLaserPi/7c70df74-2bc2-40e0-b0f4-77be1a7e044c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Stark2008_GutenLaserPi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GutenLaserPi", - "id": "Stark2008/GutenLaserPi", - "developer": "Stark2008", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4227 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5212 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3106 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Stark2008/LayleleFlamPi/ea71bdd5-3aa1-4d26-9256-5aeb2f79fa8c.json b/data/hfopenllm_v2/Stark2008/LayleleFlamPi/ea71bdd5-3aa1-4d26-9256-5aeb2f79fa8c.json deleted file mode 100644 index 9aef1f6e6..000000000 --- a/data/hfopenllm_v2/Stark2008/LayleleFlamPi/ea71bdd5-3aa1-4d26-9256-5aeb2f79fa8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Stark2008_LayleleFlamPi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LayleleFlamPi", - "id": "Stark2008/LayleleFlamPi", - "developer": "Stark2008", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5116 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4608 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Stark2008/VisFlamCat/b0e9c0ca-cd56-42c8-96ed-477884bfd9f9.json b/data/hfopenllm_v2/Stark2008/VisFlamCat/b0e9c0ca-cd56-42c8-96ed-477884bfd9f9.json deleted file mode 100644 index afdd9f905..000000000 --- a/data/hfopenllm_v2/Stark2008/VisFlamCat/b0e9c0ca-cd56-42c8-96ed-477884bfd9f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Stark2008_VisFlamCat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VisFlamCat", - "id": "Stark2008/VisFlamCat", - "developer": "Stark2008", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5217 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4463 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Steelskull/L3.3-MS-Nevoria-70b/7395fcde-49dd-47f4-a8ea-463eda40f5e3.json 
b/data/hfopenllm_v2/Steelskull/L3.3-MS-Nevoria-70b/7395fcde-49dd-47f4-a8ea-463eda40f5e3.json deleted file mode 100644 index 26de03a08..000000000 --- a/data/hfopenllm_v2/Steelskull/L3.3-MS-Nevoria-70b/7395fcde-49dd-47f4-a8ea-463eda40f5e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Steelskull_L3.3-MS-Nevoria-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.3-MS-Nevoria-70b", - "id": "Steelskull/L3.3-MS-Nevoria-70b", - "developer": "Steelskull", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6963 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6998 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3958 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4706 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5535 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Steelskull/L3.3-Nevoria-R1-70b/a130087f-566f-4405-b662-1102f1664c49.json b/data/hfopenllm_v2/Steelskull/L3.3-Nevoria-R1-70b/a130087f-566f-4405-b662-1102f1664c49.json deleted file mode 100644 index ff75741dd..000000000 --- 
a/data/hfopenllm_v2/Steelskull/L3.3-Nevoria-R1-70b/a130087f-566f-4405-b662-1102f1664c49.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Steelskull_L3.3-Nevoria-R1-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.3-Nevoria-R1-70b", - "id": "Steelskull/L3.3-Nevoria-R1-70b", - "developer": "Steelskull", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6024 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6972 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.469 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4775 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5463 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/StelleX/Qwen2.5_Math_7B_Cot/3be58cf3-4761-4459-9f3c-eabf812a3c19.json b/data/hfopenllm_v2/StelleX/Qwen2.5_Math_7B_Cot/3be58cf3-4761-4459-9f3c-eabf812a3c19.json deleted file mode 100644 index e315d4ecc..000000000 --- a/data/hfopenllm_v2/StelleX/Qwen2.5_Math_7B_Cot/3be58cf3-4761-4459-9f3c-eabf812a3c19.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/StelleX_Qwen2.5_Math_7B_Cot/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5_Math_7B_Cot", - "id": "StelleX/Qwen2.5_Math_7B_Cot", - "developer": "StelleX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2143 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/StelleX/Vorisatex-7B-preview/dbdd71ad-db5b-4b4b-8856-68b55adbe127.json b/data/hfopenllm_v2/StelleX/Vorisatex-7B-preview/dbdd71ad-db5b-4b4b-8856-68b55adbe127.json deleted file mode 100644 index 7a0e33163..000000000 --- a/data/hfopenllm_v2/StelleX/Vorisatex-7B-preview/dbdd71ad-db5b-4b4b-8856-68b55adbe127.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/StelleX_Vorisatex-7B-preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Vorisatex-7B-preview", - "id": "StelleX/Vorisatex-7B-preview", - "developer": "StelleX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4192 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Instruct/da159a16-48a0-45e3-ad4d-bdc9e8b5288c.json b/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Instruct/da159a16-48a0-45e3-ad4d-bdc9e8b5288c.json deleted file mode 100644 index 0304b53c6..000000000 --- a/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Instruct/da159a16-48a0-45e3-ad4d-bdc9e8b5288c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SultanR_SmolTulu-1.7b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolTulu-1.7b-Instruct", - "id": "SultanR/SmolTulu-1.7b-Instruct", - "developer": "SultanR", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 
1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6541 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.171 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Reinforced/77d5f51e-5ad2-42a6-a32c-060cd844b949.json b/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Reinforced/77d5f51e-5ad2-42a6-a32c-060cd844b949.json deleted file mode 100644 index 37b00c3a4..000000000 --- a/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Reinforced/77d5f51e-5ad2-42a6-a32c-060cd844b949.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SultanR_SmolTulu-1.7b-Reinforced/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolTulu-1.7b-Reinforced", - "id": "SultanR/SmolTulu-1.7b-Reinforced", - "developer": "SultanR", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3552 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1763 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-it-v0/724cc582-cc83-474b-9606-70dbc22f3581.json b/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-it-v0/724cc582-cc83-474b-9606-70dbc22f3581.json deleted file mode 100644 index 4d26688c4..000000000 --- a/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-it-v0/724cc582-cc83-474b-9606-70dbc22f3581.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SultanR_SmolTulu-1.7b-it-v0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolTulu-1.7b-it-v0", - "id": "SultanR/SmolTulu-1.7b-it-v0", - "developer": "SultanR", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6541 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.171 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBA-123/8a1b2aae-d717-4b49-8ed2-a7ee2cee1940.json b/data/hfopenllm_v2/Supichi/BBA-123/8a1b2aae-d717-4b49-8ed2-a7ee2cee1940.json deleted file mode 100644 index c72fad264..000000000 --- a/data/hfopenllm_v2/Supichi/BBA-123/8a1b2aae-d717-4b49-8ed2-a7ee2cee1940.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBA-123/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBA-123", - "id": "Supichi/BBA-123", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 17.161 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3499 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBA99/0dfb062d-a6ec-42a6-a9f9-6f6424bbdf0c.json b/data/hfopenllm_v2/Supichi/BBA99/0dfb062d-a6ec-42a6-a9f9-6f6424bbdf0c.json deleted file mode 100644 index 7dab217a1..000000000 --- a/data/hfopenllm_v2/Supichi/BBA99/0dfb062d-a6ec-42a6-a9f9-6f6424bbdf0c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBA99/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBA99", - "id": "Supichi/BBA99", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 17.161 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2769 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBAIK29/ab2512fa-2335-4817-9a76-3259690bbc67.json b/data/hfopenllm_v2/Supichi/BBAIK29/ab2512fa-2335-4817-9a76-3259690bbc67.json deleted file mode 100644 index 90e50a5c5..000000000 --- a/data/hfopenllm_v2/Supichi/BBAIK29/ab2512fa-2335-4817-9a76-3259690bbc67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBAIK29/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAIK29", - "id": "Supichi/BBAIK29", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4588 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3678 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4469 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBAI_135_Gemma/fe7f1442-b7db-42d5-bc83-b8afd1d0c802.json b/data/hfopenllm_v2/Supichi/BBAI_135_Gemma/fe7f1442-b7db-42d5-bc83-b8afd1d0c802.json deleted file mode 100644 index f1fa97e95..000000000 --- a/data/hfopenllm_v2/Supichi/BBAI_135_Gemma/fe7f1442-b7db-42d5-bc83-b8afd1d0c802.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBAI_135_Gemma/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_135_Gemma", - "id": "Supichi/BBAI_135_Gemma", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 19.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0656 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3568 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3805 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBAI_250_Xia0_gZ/0e14484a-69d7-423e-bf6c-33d0992f408c.json b/data/hfopenllm_v2/Supichi/BBAI_250_Xia0_gZ/0e14484a-69d7-423e-bf6c-33d0992f408c.json deleted file mode 100644 index dbf736418..000000000 --- a/data/hfopenllm_v2/Supichi/BBAI_250_Xia0_gZ/0e14484a-69d7-423e-bf6c-33d0992f408c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBAI_250_Xia0_gZ/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_250_Xia0_gZ", - "id": "Supichi/BBAI_250_Xia0_gZ", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5568 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4465 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBAI_275_Tsunami_gZ/881eaa2c-af5f-4e84-8807-d0835c10ebd2.json b/data/hfopenllm_v2/Supichi/BBAI_275_Tsunami_gZ/881eaa2c-af5f-4e84-8807-d0835c10ebd2.json deleted file mode 100644 index c2465ba88..000000000 --- a/data/hfopenllm_v2/Supichi/BBAI_275_Tsunami_gZ/881eaa2c-af5f-4e84-8807-d0835c10ebd2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBAI_275_Tsunami_gZ/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_275_Tsunami_gZ", - "id": "Supichi/BBAI_275_Tsunami_gZ", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4448 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4492 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBAI_525_Tsu_gZ_Xia0/ef8a7079-9d13-42b7-ab2d-b72df5ae5d95.json b/data/hfopenllm_v2/Supichi/BBAI_525_Tsu_gZ_Xia0/ef8a7079-9d13-42b7-ab2d-b72df5ae5d95.json deleted file mode 100644 index 8e1fdf952..000000000 --- a/data/hfopenllm_v2/Supichi/BBAI_525_Tsu_gZ_Xia0/ef8a7079-9d13-42b7-ab2d-b72df5ae5d95.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBAI_525_Tsu_gZ_Xia0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_525_Tsu_gZ_Xia0", - "id": "Supichi/BBAI_525_Tsu_gZ_Xia0", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5339 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5562 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3429 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4477 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBAI_78B_Calme_3_1_Ties/db8d3fc4-58f4-4f07-8c27-c73a4a4719fb.json b/data/hfopenllm_v2/Supichi/BBAI_78B_Calme_3_1_Ties/db8d3fc4-58f4-4f07-8c27-c73a4a4719fb.json deleted file mode 100644 index 
7bb33875a..000000000 --- a/data/hfopenllm_v2/Supichi/BBAI_78B_Calme_3_1_Ties/db8d3fc4-58f4-4f07-8c27-c73a4a4719fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBAI_78B_Calme_3_1_Ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_78B_Calme_3_1_Ties", - "id": "Supichi/BBAI_78B_Calme_3_1_Ties", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 27.06 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2828 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBAI_QWEEN_V000000_LUMEN_14B/0c44a429-e705-4794-b702-1a731e52df90.json b/data/hfopenllm_v2/Supichi/BBAI_QWEEN_V000000_LUMEN_14B/0c44a429-e705-4794-b702-1a731e52df90.json deleted file mode 100644 index 4b3c57482..000000000 --- a/data/hfopenllm_v2/Supichi/BBAI_QWEEN_V000000_LUMEN_14B/0c44a429-e705-4794-b702-1a731e52df90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Supichi_BBAI_QWEEN_V000000_LUMEN_14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_QWEEN_V000000_LUMEN_14B", - "id": "Supichi/BBAI_QWEEN_V000000_LUMEN_14B", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 10.366 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1815 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2297 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2315 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/HF_TOKEN/92b3d2c1-61f4-432a-82a7-43b4367f7ef0.json b/data/hfopenllm_v2/Supichi/HF_TOKEN/92b3d2c1-61f4-432a-82a7-43b4367f7ef0.json deleted file mode 100644 index 3690ec90c..000000000 --- a/data/hfopenllm_v2/Supichi/HF_TOKEN/92b3d2c1-61f4-432a-82a7-43b4367f7ef0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_HF_TOKEN/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "HF_TOKEN", - "id": "Supichi/HF_TOKEN", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 17.161 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.138 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/NJS26/5703e81d-055c-459b-8202-80ec382a8d5b.json b/data/hfopenllm_v2/Supichi/NJS26/5703e81d-055c-459b-8202-80ec382a8d5b.json deleted file mode 100644 index fb56689db..000000000 --- a/data/hfopenllm_v2/Supichi/NJS26/5703e81d-055c-459b-8202-80ec382a8d5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_NJS26/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NJS26", - "id": "Supichi/NJS26", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0448 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3854 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.0/f6260b6e-52a2-4142-93ba-5393807fa0d4.json b/data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.0/f6260b6e-52a2-4142-93ba-5393807fa0d4.json deleted file mode 100644 index cf81cb961..000000000 --- a/data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.0/f6260b6e-52a2-4142-93ba-5393807fa0d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Svak_MN-12B-Inferor-v0.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Inferor-v0.0", - "id": "Svak/MN-12B-Inferor-v0.0", - "developer": "Svak", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.5708 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5195 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4639 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3559 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.1/83b84506-4826-48de-a6fe-2af6ae5d425a.json b/data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.1/83b84506-4826-48de-a6fe-2af6ae5d425a.json deleted file mode 100644 index 30b42165b..000000000 --- a/data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.1/83b84506-4826-48de-a6fe-2af6ae5d425a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Svak_MN-12B-Inferor-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Inferor-v0.1", - "id": "Svak/MN-12B-Inferor-v0.1", - "developer": "Svak", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6347 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5147 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3662 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Syed-Hasan-8503/Phi-3-mini-4K-instruct-cpo-simpo/7483e260-9853-4d3f-aa10-187796d96de9.json b/data/hfopenllm_v2/Syed-Hasan-8503/Phi-3-mini-4K-instruct-cpo-simpo/7483e260-9853-4d3f-aa10-187796d96de9.json deleted file mode 100644 index e37c2fec5..000000000 --- a/data/hfopenllm_v2/Syed-Hasan-8503/Phi-3-mini-4K-instruct-cpo-simpo/7483e260-9853-4d3f-aa10-187796d96de9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Syed-Hasan-8503_Phi-3-mini-4K-instruct-cpo-simpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-mini-4K-instruct-cpo-simpo", - "id": "Syed-Hasan-8503/Phi-3-mini-4K-instruct-cpo-simpo", - "developer": "Syed-Hasan-8503", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5714 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.5682 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1571 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3861 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V1-P1/f9925806-4252-44e8-b67e-917737572bd4.json b/data/hfopenllm_v2/T145/KRONOS-8B-V1-P1/f9925806-4252-44e8-b67e-917737572bd4.json deleted file mode 100644 index a479386a3..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V1-P1/f9925806-4252-44e8-b67e-917737572bd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V1-P1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V1-P1", - "id": "T145/KRONOS-8B-V1-P1", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5085 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V1-P2/70470e6c-8d66-4249-b762-a5a2e3589a53.json b/data/hfopenllm_v2/T145/KRONOS-8B-V1-P2/70470e6c-8d66-4249-b762-a5a2e3589a53.json deleted file mode 100644 index 9dacbbf53..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V1-P2/70470e6c-8d66-4249-b762-a5a2e3589a53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V1-P2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V1-P2", - "id": "T145/KRONOS-8B-V1-P2", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6724 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4772 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3568 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3453 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V1-P3/d3abfe3c-ebfe-4dfd-b0db-93c14d32c585.json b/data/hfopenllm_v2/T145/KRONOS-8B-V1-P3/d3abfe3c-ebfe-4dfd-b0db-93c14d32c585.json deleted file mode 100644 index 4ae2c0b29..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V1-P3/d3abfe3c-ebfe-4dfd-b0db-93c14d32c585.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V1-P3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V1-P3", - "id": "T145/KRONOS-8B-V1-P3", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7137 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5128 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3405 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V2/a35b06bc-d759-421a-94cf-f408a98e9273.json b/data/hfopenllm_v2/T145/KRONOS-8B-V2/a35b06bc-d759-421a-94cf-f408a98e9273.json deleted file mode 100644 index 1869aec88..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V2/a35b06bc-d759-421a-94cf-f408a98e9273.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V2", - "id": "T145/KRONOS-8B-V2", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5133 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": 
{ - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V3/bbac659c-7cf8-41d4-98d4-ded4c471bd98.json b/data/hfopenllm_v2/T145/KRONOS-8B-V3/bbac659c-7cf8-41d4-98d4-ded4c471bd98.json deleted file mode 100644 index 86f598a6c..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V3/bbac659c-7cf8-41d4-98d4-ded4c471bd98.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V3", - "id": "T145/KRONOS-8B-V3", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5119 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2598 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } 
- } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V4/0c73f3a0-0a92-4b1c-abfa-6eb77138dacd.json b/data/hfopenllm_v2/T145/KRONOS-8B-V4/0c73f3a0-0a92-4b1c-abfa-6eb77138dacd.json deleted file mode 100644 index b0d2848fc..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V4/0c73f3a0-0a92-4b1c-abfa-6eb77138dacd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V4", - "id": "T145/KRONOS-8B-V4", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3786 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V5/a7ab6f16-717f-4567-8057-a4a18e1a1e77.json b/data/hfopenllm_v2/T145/KRONOS-8B-V5/a7ab6f16-717f-4567-8057-a4a18e1a1e77.json deleted file mode 100644 index fb35a1d47..000000000 --- 
a/data/hfopenllm_v2/T145/KRONOS-8B-V5/a7ab6f16-717f-4567-8057-a4a18e1a1e77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V5", - "id": "T145/KRONOS-8B-V5", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5405 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5089 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2689 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4055 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3759 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V6/2abe2c9d-032d-469e-852b-114eca5e84f8.json b/data/hfopenllm_v2/T145/KRONOS-8B-V6/2abe2c9d-032d-469e-852b-114eca5e84f8.json deleted file mode 100644 index 55fd3a74f..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V6/2abe2c9d-032d-469e-852b-114eca5e84f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V6", - "id": "T145/KRONOS-8B-V6", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2598 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4121 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V7/2e8a83dc-c760-4f42-a361-e02cf3a65427.json b/data/hfopenllm_v2/T145/KRONOS-8B-V7/2e8a83dc-c760-4f42-a361-e02cf3a65427.json deleted file mode 100644 index 5846cc83c..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V7/2e8a83dc-c760-4f42-a361-e02cf3a65427.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V7", - "id": "T145/KRONOS-8B-V7", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V8/743dfe64-e7cd-493e-817d-8d5fcdc2ea24.json b/data/hfopenllm_v2/T145/KRONOS-8B-V8/743dfe64-e7cd-493e-817d-8d5fcdc2ea24.json deleted file mode 100644 index d7f05654d..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V8/743dfe64-e7cd-493e-817d-8d5fcdc2ea24.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V8", - "id": "T145/KRONOS-8B-V8", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2047 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V9/4e37c90b-65a8-4b71-bfc2-d63541fb8962.json b/data/hfopenllm_v2/T145/KRONOS-8B-V9/4e37c90b-65a8-4b71-bfc2-d63541fb8962.json deleted file mode 100644 index 405395324..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V9/4e37c90b-65a8-4b71-bfc2-d63541fb8962.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V9", - "id": "T145/KRONOS-8B-V9", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5099 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1986 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3868 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/Llama-3.1-8B-Instruct-Zeus/2e34d74e-1b69-4daf-8bee-77e5357fd439.json b/data/hfopenllm_v2/T145/Llama-3.1-8B-Instruct-Zeus/2e34d74e-1b69-4daf-8bee-77e5357fd439.json deleted file mode 100644 index 41ff9323c..000000000 --- a/data/hfopenllm_v2/T145/Llama-3.1-8B-Instruct-Zeus/2e34d74e-1b69-4daf-8bee-77e5357fd439.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_Llama-3.1-8B-Instruct-Zeus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Instruct-Zeus", - "id": "T145/Llama-3.1-8B-Instruct-Zeus", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5174 - } - }, - { - "evaluation_name": "MATH Level 5", 
- "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1956 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3976 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/Llama-3.1-8B-Zeus/0646e2f7-d2e6-42d3-8f09-f8daee302709.json b/data/hfopenllm_v2/T145/Llama-3.1-8B-Zeus/0646e2f7-d2e6-42d3-8f09-f8daee302709.json deleted file mode 100644 index 0756465ea..000000000 --- a/data/hfopenllm_v2/T145/Llama-3.1-8B-Zeus/0646e2f7-d2e6-42d3-8f09-f8daee302709.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_Llama-3.1-8B-Zeus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Zeus", - "id": "T145/Llama-3.1-8B-Zeus", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1332 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/Meta-Llama-3.1-8B-Instruct-TIES/c66b1ff8-9c04-4f9c-b83e-088f31f79590.json b/data/hfopenllm_v2/T145/Meta-Llama-3.1-8B-Instruct-TIES/c66b1ff8-9c04-4f9c-b83e-088f31f79590.json deleted file mode 100644 index d2e2124a8..000000000 --- a/data/hfopenllm_v2/T145/Meta-Llama-3.1-8B-Instruct-TIES/c66b1ff8-9c04-4f9c-b83e-088f31f79590.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_Meta-Llama-3.1-8B-Instruct-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3.1-8B-Instruct-TIES", - "id": "T145/Meta-Llama-3.1-8B-Instruct-TIES", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V10/1bd2affc-9970-4149-b52b-51549b1f0029.json b/data/hfopenllm_v2/T145/ZEUS-8B-V10/1bd2affc-9970-4149-b52b-51549b1f0029.json deleted file mode 100644 index 0c3eeb4e0..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V10/1bd2affc-9970-4149-b52b-51549b1f0029.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V10", - "id": "T145/ZEUS-8B-V10", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7707 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.527 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3898 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3904 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V11/f0479d74-4684-4b41-a63b-16d7fe0e3290.json b/data/hfopenllm_v2/T145/ZEUS-8B-V11/f0479d74-4684-4b41-a63b-16d7fe0e3290.json deleted file mode 100644 index 65e7ba095..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V11/f0479d74-4684-4b41-a63b-16d7fe0e3290.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V11", - "id": "T145/ZEUS-8B-V11", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5162 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "MMLU-PRO", 
- "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V12/95deb890-a15d-4c71-8151-ed45c3dfb87f.json b/data/hfopenllm_v2/T145/ZEUS-8B-V12/95deb890-a15d-4c71-8151-ed45c3dfb87f.json deleted file mode 100644 index 2173c77bb..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V12/95deb890-a15d-4c71-8151-ed45c3dfb87f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V12/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V12", - "id": "T145/ZEUS-8B-V12", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7816 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3858 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3912 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V13-abliterated/1c07fc4c-a773-4e03-bb14-7144e7815c01.json b/data/hfopenllm_v2/T145/ZEUS-8B-V13-abliterated/1c07fc4c-a773-4e03-bb14-7144e7815c01.json deleted file mode 100644 index 4f8852135..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V13-abliterated/1c07fc4c-a773-4e03-bb14-7144e7815c01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V13-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V13-abliterated", - "id": "T145/ZEUS-8B-V13-abliterated", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7878 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.179 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V13/e7e8388e-db3c-4881-b67c-5177c60562b9.json b/data/hfopenllm_v2/T145/ZEUS-8B-V13/e7e8388e-db3c-4881-b67c-5177c60562b9.json deleted file mode 100644 index 
f1a974e92..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V13/e7e8388e-db3c-4881-b67c-5177c60562b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V13/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V13", - "id": "T145/ZEUS-8B-V13", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7904 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2137 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3845 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V14/c4923208-2a47-45f2-a74a-4483e4b99bee.json b/data/hfopenllm_v2/T145/ZEUS-8B-V14/c4923208-2a47-45f2-a74a-4483e4b99bee.json deleted file mode 100644 index 0b3a25678..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V14/c4923208-2a47-45f2-a74a-4483e4b99bee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V14/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": 
"HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V14", - "id": "T145/ZEUS-8B-V14", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7709 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V15/b5f06a78-5b57-45a5-93be-4f3c1b36f208.json b/data/hfopenllm_v2/T145/ZEUS-8B-V15/b5f06a78-5b57-45a5-93be-4f3c1b36f208.json deleted file mode 100644 index b2f90345b..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V15/b5f06a78-5b57-45a5-93be-4f3c1b36f208.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V15/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V15", - "id": "T145/ZEUS-8B-V15", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", 
- "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7013 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5538 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2304 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4059 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V16/835f19d3-515c-4bc4-ab96-5cb5bece45dc.json b/data/hfopenllm_v2/T145/ZEUS-8B-V16/835f19d3-515c-4bc4-ab96-5cb5bece45dc.json deleted file mode 100644 index 5181ab5f4..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V16/835f19d3-515c-4bc4-ab96-5cb5bece45dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V16", - "id": "T145/ZEUS-8B-V16", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V2/7dd96382-6fc1-4a39-924b-d9034b5b0839.json b/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V2/7dd96382-6fc1-4a39-924b-d9034b5b0839.json deleted file mode 100644 index 978451253..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V2/7dd96382-6fc1-4a39-924b-d9034b5b0839.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V17-abliterated-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V17-abliterated-V2", - "id": "T145/ZEUS-8B-V17-abliterated-V2", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4928 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V4/77a666a2-a9b2-43cc-8e64-67172f4ab6c8.json b/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V4/77a666a2-a9b2-43cc-8e64-67172f4ab6c8.json deleted file mode 100644 index cee81252f..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V4/77a666a2-a9b2-43cc-8e64-67172f4ab6c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V17-abliterated-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V17-abliterated-V4", - "id": "T145/ZEUS-8B-V17-abliterated-V4", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7228 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5169 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3774 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated/e3eae267-46ab-4433-a8f3-2a2f8448299b.json b/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated/e3eae267-46ab-4433-a8f3-2a2f8448299b.json deleted file mode 100644 index f3ac720d6..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated/e3eae267-46ab-4433-a8f3-2a2f8448299b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V17-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V17-abliterated", - "id": "T145/ZEUS-8B-V17-abliterated", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.594 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7576 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4269 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V17/e31308c4-8eb2-4a72-8127-18049d58b814.json b/data/hfopenllm_v2/T145/ZEUS-8B-V17/e31308c4-8eb2-4a72-8127-18049d58b814.json deleted file mode 100644 index 7b6a3680c..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V17/e31308c4-8eb2-4a72-8127-18049d58b814.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V17/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V17", - "id": "T145/ZEUS-8B-V17", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5251 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V18/c7098a7a-e865-4ecd-b511-abeb2c0872bd.json b/data/hfopenllm_v2/T145/ZEUS-8B-V18/c7098a7a-e865-4ecd-b511-abeb2c0872bd.json deleted file mode 100644 index 4f93f40a0..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V18/c7098a7a-e865-4ecd-b511-abeb2c0872bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V18/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V18", - "id": "T145/ZEUS-8B-V18", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7834 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.527 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3942 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V19/b3a8c734-e63a-47f7-af2c-a3b6518802fa.json b/data/hfopenllm_v2/T145/ZEUS-8B-V19/b3a8c734-e63a-47f7-af2c-a3b6518802fa.json deleted file mode 100644 index 893297bba..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V19/b3a8c734-e63a-47f7-af2c-a3b6518802fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V19/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V19", - "id": "T145/ZEUS-8B-V19", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3934 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V2-ORPO/35937965-2791-4f75-8954-5a2280381c91.json b/data/hfopenllm_v2/T145/ZEUS-8B-V2-ORPO/35937965-2791-4f75-8954-5a2280381c91.json deleted file mode 100644 index 970e9dcd6..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V2-ORPO/35937965-2791-4f75-8954-5a2280381c91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V2-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V2-ORPO", - "id": "T145/ZEUS-8B-V2-ORPO", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7187 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.3678 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V2-abliterated/4ab806fe-738d-4f5b-89e4-004134d2f7fe.json b/data/hfopenllm_v2/T145/ZEUS-8B-V2-abliterated/4ab806fe-738d-4f5b-89e4-004134d2f7fe.json deleted file mode 100644 index 1e938c379..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V2-abliterated/4ab806fe-738d-4f5b-89e4-004134d2f7fe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V2-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V2-abliterated", - "id": "T145/ZEUS-8B-V2-abliterated", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7895 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5129 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V2/a937e27e-b757-4de7-b679-01ac29d8bb22.json b/data/hfopenllm_v2/T145/ZEUS-8B-V2/a937e27e-b757-4de7-b679-01ac29d8bb22.json 
deleted file mode 100644 index d403098af..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V2/a937e27e-b757-4de7-b679-01ac29d8bb22.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V2", - "id": "T145/ZEUS-8B-V2", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8029 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V20/1d906aab-33a6-4ffe-8a63-694482d83d09.json b/data/hfopenllm_v2/T145/ZEUS-8B-V20/1d906aab-33a6-4ffe-8a63-694482d83d09.json deleted file mode 100644 index 269fa1ea5..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V20/1d906aab-33a6-4ffe-8a63-694482d83d09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V20/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V20", - "id": "T145/ZEUS-8B-V20", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7956 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5244 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V21/9e101298-6482-4ae8-83e4-b948ba8fa550.json b/data/hfopenllm_v2/T145/ZEUS-8B-V21/9e101298-6482-4ae8-83e4-b948ba8fa550.json deleted file mode 100644 index 55ec61818..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V21/9e101298-6482-4ae8-83e4-b948ba8fa550.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V21/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V21", - "id": "T145/ZEUS-8B-V21", - "developer": "T145", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3785 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1714 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V22/3818710d-80a9-4e7d-90e3-f06afffb71ac.json b/data/hfopenllm_v2/T145/ZEUS-8B-V22/3818710d-80a9-4e7d-90e3-f06afffb71ac.json deleted file mode 100644 index 35e2f1e24..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V22/3818710d-80a9-4e7d-90e3-f06afffb71ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V22/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V22", - "id": "T145/ZEUS-8B-V22", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7995 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5245 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.399 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V23/a18ec0c4-6f3f-4904-b69c-e40770df169e.json b/data/hfopenllm_v2/T145/ZEUS-8B-V23/a18ec0c4-6f3f-4904-b69c-e40770df169e.json deleted file mode 100644 index 9423e9265..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V23/a18ec0c4-6f3f-4904-b69c-e40770df169e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V23/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V23", - "id": "T145/ZEUS-8B-V23", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5195 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V24/529c2bd4-6b8e-4e3c-8737-c0b794444d13.json b/data/hfopenllm_v2/T145/ZEUS-8B-V24/529c2bd4-6b8e-4e3c-8737-c0b794444d13.json deleted file mode 100644 index cebba5e48..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V24/529c2bd4-6b8e-4e3c-8737-c0b794444d13.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V24", - "id": "T145/ZEUS-8B-V24", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4778 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 
5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3729 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V25/9e994362-a1d1-48f7-9db1-dd9d532b9f35.json b/data/hfopenllm_v2/T145/ZEUS-8B-V25/9e994362-a1d1-48f7-9db1-dd9d532b9f35.json deleted file mode 100644 index 7c03079a9..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V25/9e994362-a1d1-48f7-9db1-dd9d532b9f35.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V25/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V25", - "id": "T145/ZEUS-8B-V25", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.332 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4547 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2039 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2885 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V26/cf35b7db-f675-4362-8916-36b0582b64f4.json b/data/hfopenllm_v2/T145/ZEUS-8B-V26/cf35b7db-f675-4362-8916-36b0582b64f4.json deleted file mode 100644 index 1c35fccff..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V26/cf35b7db-f675-4362-8916-36b0582b64f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V26/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V26", - "id": "T145/ZEUS-8B-V26", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6708 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5232 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3907 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V27/79ee7e34-36cd-4024-8978-86c1b059ae5f.json b/data/hfopenllm_v2/T145/ZEUS-8B-V27/79ee7e34-36cd-4024-8978-86c1b059ae5f.json deleted file mode 100644 index 6b17c7f6a..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V27/79ee7e34-36cd-4024-8978-86c1b059ae5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V27/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V27", - "id": "T145/ZEUS-8B-V27", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6544 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3902 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V28/9ec4fb99-ed4d-416e-9342-0c036aadd35d.json b/data/hfopenllm_v2/T145/ZEUS-8B-V28/9ec4fb99-ed4d-416e-9342-0c036aadd35d.json deleted file mode 100644 index 1ebb0754d..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V28/9ec4fb99-ed4d-416e-9342-0c036aadd35d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V28/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V28", - "id": "T145/ZEUS-8B-V28", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6353 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3902 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V29/8788e4fa-04c5-4f7c-bb4e-523287901f71.json b/data/hfopenllm_v2/T145/ZEUS-8B-V29/8788e4fa-04c5-4f7c-bb4e-523287901f71.json deleted file mode 100644 index a0483b34c..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V29/8788e4fa-04c5-4f7c-bb4e-523287901f71.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V29/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V29", - "id": "T145/ZEUS-8B-V29", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7418 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5253 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V2L1/18097bf4-5149-40e9-9850-558c3f143ed8.json 
b/data/hfopenllm_v2/T145/ZEUS-8B-V2L1/18097bf4-5149-40e9-9850-558c3f143ed8.json deleted file mode 100644 index e6c750d51..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V2L1/18097bf4-5149-40e9-9850-558c3f143ed8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V2L1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V2L1", - "id": "T145/ZEUS-8B-V2L1", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3192 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5013 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3638 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V2L2/b5942721-5c30-4c49-a6e1-fb5419539652.json b/data/hfopenllm_v2/T145/ZEUS-8B-V2L2/b5942721-5c30-4c49-a6e1-fb5419539652.json deleted file mode 100644 index d9184ab9f..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V2L2/b5942721-5c30-4c49-a6e1-fb5419539652.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/T145_ZEUS-8B-V2L2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V2L2", - "id": "T145/ZEUS-8B-V2L2", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V3/76d27de3-0309-4e4b-8d0d-0e402bde0a31.json b/data/hfopenllm_v2/T145/ZEUS-8B-V3/76d27de3-0309-4e4b-8d0d-0e402bde0a31.json deleted file mode 100644 index 5781ec510..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V3/76d27de3-0309-4e4b-8d0d-0e402bde0a31.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V3", - 
"id": "T145/ZEUS-8B-V3", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7887 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1677 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V30/5c0553ff-4910-45a9-aa8d-3a76af098403.json b/data/hfopenllm_v2/T145/ZEUS-8B-V30/5c0553ff-4910-45a9-aa8d-3a76af098403.json deleted file mode 100644 index cf4cd7678..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V30/5c0553ff-4910-45a9-aa8d-3a76af098403.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V30/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V30", - "id": "T145/ZEUS-8B-V30", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3944 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V4/fd97d1d9-a1b5-429d-b73d-1ea92ae1d61c.json b/data/hfopenllm_v2/T145/ZEUS-8B-V4/fd97d1d9-a1b5-429d-b73d-1ea92ae1d61c.json deleted file mode 100644 index b4583aa87..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V4/fd97d1d9-a1b5-429d-b73d-1ea92ae1d61c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V4", - "id": "T145/ZEUS-8B-V4", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7807 - } - }, - { - "evaluation_name": "BBH", - "source_data": 
{ - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5246 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3788 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V6/f77aa103-5a09-409c-ad72-7992b6049f94.json b/data/hfopenllm_v2/T145/ZEUS-8B-V6/f77aa103-5a09-409c-ad72-7992b6049f94.json deleted file mode 100644 index 0d6b104c5..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V6/f77aa103-5a09-409c-ad72-7992b6049f94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V6", - "id": "T145/ZEUS-8B-V6", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7838 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3759 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V7/0afdaa1d-c1e7-4283-a2b3-f459c09df4a9.json b/data/hfopenllm_v2/T145/ZEUS-8B-V7/0afdaa1d-c1e7-4283-a2b3-f459c09df4a9.json deleted file mode 100644 index 4c23b2855..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V7/0afdaa1d-c1e7-4283-a2b3-f459c09df4a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V7", - "id": "T145/ZEUS-8B-V7", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V8/044ed79b-0c54-4a7a-94ba-a3f999adeb0d.json b/data/hfopenllm_v2/T145/ZEUS-8B-V8/044ed79b-0c54-4a7a-94ba-a3f999adeb0d.json deleted file mode 100644 index dac50a4bc..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V8/044ed79b-0c54-4a7a-94ba-a3f999adeb0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V8", - "id": "T145/ZEUS-8B-V8", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7914 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5065 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4214 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V9/ac6b884d-62ea-4ff5-8eee-cfce08869030.json b/data/hfopenllm_v2/T145/ZEUS-8B-V9/ac6b884d-62ea-4ff5-8eee-cfce08869030.json deleted file mode 100644 index 73c3bb0b4..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V9/ac6b884d-62ea-4ff5-8eee-cfce08869030.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V9", - "id": "T145/ZEUS-8B-V9", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5551 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2137 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3949 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/qwen-2.5-3B-merge-test/8ffa696e-adef-4808-ba0e-bb04921a433d.json b/data/hfopenllm_v2/T145/qwen-2.5-3B-merge-test/8ffa696e-adef-4808-ba0e-bb04921a433d.json deleted file mode 100644 index 2f5ff3f18..000000000 --- a/data/hfopenllm_v2/T145/qwen-2.5-3B-merge-test/8ffa696e-adef-4808-ba0e-bb04921a433d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_qwen-2.5-3B-merge-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-2.5-3B-merge-test", - "id": "T145/qwen-2.5-3B-merge-test", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5751 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4842 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3202 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m-hf/8a2cfa62-5f13-447e-8d0f-2503e4962ac5.json b/data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m-hf/8a2cfa62-5f13-447e-8d0f-2503e4962ac5.json deleted file mode 100644 index 2a4ac4794..000000000 --- a/data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m-hf/8a2cfa62-5f13-447e-8d0f-2503e4962ac5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/THUDM_glm-4-9b-chat-1m-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "glm-4-9b-chat-1m-hf", - "id": "THUDM/glm-4-9b-chat-1m-hf", - "developer": "THUDM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GlmForCausalLM", - "params_billions": 9.484 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5341 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3689 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1814 - } - } 
- ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m/4f24fc46-3686-41fa-bf25-a0e39b252cc9.json b/data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m/4f24fc46-3686-41fa-bf25-a0e39b252cc9.json deleted file mode 100644 index cbe32d26e..000000000 --- a/data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m/4f24fc46-3686-41fa-bf25-a0e39b252cc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/THUDM_glm-4-9b-chat-1m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "glm-4-9b-chat-1m", - "id": "THUDM/glm-4-9b-chat-1m", - "developer": "THUDM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "ChatGLMModel", - "params_billions": 9.484 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/THUDM/glm-4-9b-chat-hf/b1375cb4-b0d5-4cb4-ad43-394ebd1a481f.json b/data/hfopenllm_v2/THUDM/glm-4-9b-chat-hf/b1375cb4-b0d5-4cb4-ad43-394ebd1a481f.json deleted file mode 100644 index f54072bac..000000000 --- 
a/data/hfopenllm_v2/THUDM/glm-4-9b-chat-hf/b1375cb4-b0d5-4cb4-ad43-394ebd1a481f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/THUDM_glm-4-9b-chat-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "glm-4-9b-chat-hf", - "id": "THUDM/glm-4-9b-chat-hf", - "developer": "THUDM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GlmForCausalLM", - "params_billions": 9.4 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6513 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2774 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/THUDM/glm-4-9b-chat/4ce062da-acfc-4684-95c2-679cbe5a697b.json b/data/hfopenllm_v2/THUDM/glm-4-9b-chat/4ce062da-acfc-4684-95c2-679cbe5a697b.json deleted file mode 100644 index 01dceb960..000000000 --- a/data/hfopenllm_v2/THUDM/glm-4-9b-chat/4ce062da-acfc-4684-95c2-679cbe5a697b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/THUDM_glm-4-9b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "glm-4-9b-chat", - "id": "THUDM/glm-4-9b-chat", - "developer": "THUDM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "ChatGLMModelM", - "params_billions": 9.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/THUDM/glm-4-9b/3d785765-befa-4e53-8672-769f7bb87dcd.json b/data/hfopenllm_v2/THUDM/glm-4-9b/3d785765-befa-4e53-8672-769f7bb87dcd.json deleted file mode 100644 index f7c95a078..000000000 --- a/data/hfopenllm_v2/THUDM/glm-4-9b/3d785765-befa-4e53-8672-769f7bb87dcd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/THUDM_glm-4-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "glm-4-9b", - "id": "THUDM/glm-4-9b", - "developer": "THUDM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", 
- "architecture": "ChatGLMModelM", - "params_billions": 9.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1426 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5528 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TIGER-Lab/AceCodeRM-7B/ab0d3a24-19db-4d00-892e-bcb7c0f2f30f.json b/data/hfopenllm_v2/TIGER-Lab/AceCodeRM-7B/ab0d3a24-19db-4d00-892e-bcb7c0f2f30f.json deleted file mode 100644 index a74035d6b..000000000 --- a/data/hfopenllm_v2/TIGER-Lab/AceCodeRM-7B/ab0d3a24-19db-4d00-892e-bcb7c0f2f30f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TIGER-Lab_AceCodeRM-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceCodeRM-7B", - "id": "TIGER-Lab/AceCodeRM-7B", - "developer": "TIGER-Lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalRM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5855 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4773 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3467 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4192 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3361 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-7B-Ins-Rule/31f0b186-1805-42ff-86cf-d8455a66d538.json b/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-7B-Ins-Rule/31f0b186-1805-42ff-86cf-d8455a66d538.json deleted file mode 100644 index 0ea92826f..000000000 --- a/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-7B-Ins-Rule/31f0b186-1805-42ff-86cf-d8455a66d538.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TIGER-Lab_AceCoder-Qwen2.5-7B-Ins-Rule/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceCoder-Qwen2.5-7B-Ins-Rule", - "id": "TIGER-Lab/AceCoder-Qwen2.5-7B-Ins-Rule", - "developer": "TIGER-Lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7424 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5404 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4322 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Base-Rule/ed6b3e7e-d294-420d-b9b9-460a52cd0239.json b/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Base-Rule/ed6b3e7e-d294-420d-b9b9-460a52cd0239.json deleted file mode 100644 index 60cf1a5b7..000000000 --- a/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Base-Rule/ed6b3e7e-d294-420d-b9b9-460a52cd0239.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TIGER-Lab_AceCoder-Qwen2.5-Coder-7B-Base-Rule/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceCoder-Qwen2.5-Coder-7B-Base-Rule", - "id": "TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Base-Rule", - "developer": "TIGER-Lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3745 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Ins-Rule/91dec0c0-9854-4790-a0a5-e17d19636f17.json b/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Ins-Rule/91dec0c0-9854-4790-a0a5-e17d19636f17.json deleted file mode 100644 index e2e2f2697..000000000 --- a/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Ins-Rule/91dec0c0-9854-4790-a0a5-e17d19636f17.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TIGER-Lab_AceCoder-Qwen2.5-Coder-7B-Ins-Rule/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceCoder-Qwen2.5-Coder-7B-Ins-Rule", - "id": "TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Ins-Rule", - "developer": "TIGER-Lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6222 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5089 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4046 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3428 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TIGER-Lab/MAmmoTH2-7B-Plus/599616fb-26c1-47e3-a98b-9ad922a95c08.json b/data/hfopenllm_v2/TIGER-Lab/MAmmoTH2-7B-Plus/599616fb-26c1-47e3-a98b-9ad922a95c08.json deleted file mode 100644 index 0a1e9887a..000000000 --- a/data/hfopenllm_v2/TIGER-Lab/MAmmoTH2-7B-Plus/599616fb-26c1-47e3-a98b-9ad922a95c08.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TIGER-Lab_MAmmoTH2-7B-Plus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MAmmoTH2-7B-Plus", - "id": "TIGER-Lab/MAmmoTH2-7B-Plus", - "developer": "TIGER-Lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TIGER-Lab/Qwen2.5-Math-7B-CFT/aeee4365-c34d-46b9-8c98-29976010bb62.json b/data/hfopenllm_v2/TIGER-Lab/Qwen2.5-Math-7B-CFT/aeee4365-c34d-46b9-8c98-29976010bb62.json deleted file mode 100644 index 92ab6326a..000000000 --- a/data/hfopenllm_v2/TIGER-Lab/Qwen2.5-Math-7B-CFT/aeee4365-c34d-46b9-8c98-29976010bb62.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TIGER-Lab_Qwen2.5-Math-7B-CFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-7B-CFT", - "id": "TIGER-Lab/Qwen2.5-Math-7B-CFT", - "developer": "TIGER-Lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5574 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3887 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TTTXXX01/Mistral-7B-Base-SimPO2-5e-7/1ec68708-94c9-4561-bb99-7f211d7a9950.json b/data/hfopenllm_v2/TTTXXX01/Mistral-7B-Base-SimPO2-5e-7/1ec68708-94c9-4561-bb99-7f211d7a9950.json deleted file mode 100644 index f4ed2d887..000000000 --- a/data/hfopenllm_v2/TTTXXX01/Mistral-7B-Base-SimPO2-5e-7/1ec68708-94c9-4561-bb99-7f211d7a9950.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TTTXXX01_Mistral-7B-Base-SimPO2-5e-7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SimPO2-5e-7", - "id": "TTTXXX01/Mistral-7B-Base-SimPO2-5e-7", - "developer": "TTTXXX01", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2766 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Tarek07/Progenitor-V1.1-LLaMa-70B/0b53e7b4-0e91-40a2-911b-cd0d415e9fad.json b/data/hfopenllm_v2/Tarek07/Progenitor-V1.1-LLaMa-70B/0b53e7b4-0e91-40a2-911b-cd0d415e9fad.json deleted file mode 100644 index 80dd34f48..000000000 --- a/data/hfopenllm_v2/Tarek07/Progenitor-V1.1-LLaMa-70B/0b53e7b4-0e91-40a2-911b-cd0d415e9fad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Tarek07_Progenitor-V1.1-LLaMa-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Progenitor-V1.1-LLaMa-70B", - "id": "Tarek07/Progenitor-V1.1-LLaMa-70B", - "developer": "Tarek07", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6906 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6971 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3573 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Tarek07/Thalassic-Alpha-LLaMa-70B/91bcd646-fe3d-458b-a426-a6a8863d69a0.json b/data/hfopenllm_v2/Tarek07/Thalassic-Alpha-LLaMa-70B/91bcd646-fe3d-458b-a426-a6a8863d69a0.json deleted file mode 100644 index f5ea7f4d8..000000000 --- a/data/hfopenllm_v2/Tarek07/Thalassic-Alpha-LLaMa-70B/91bcd646-fe3d-458b-a426-a6a8863d69a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Tarek07_Thalassic-Alpha-LLaMa-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Thalassic-Alpha-LLaMa-70B", - "id": "Tarek07/Thalassic-Alpha-LLaMa-70B", - "developer": "Tarek07", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7003 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4438 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4802 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TeeZee/DoubleBagel-57B-v1.0/2e0458cc-e092-4770-bd80-00dff169d754.json b/data/hfopenllm_v2/TeeZee/DoubleBagel-57B-v1.0/2e0458cc-e092-4770-bd80-00dff169d754.json deleted file mode 100644 index b1e13e640..000000000 --- a/data/hfopenllm_v2/TeeZee/DoubleBagel-57B-v1.0/2e0458cc-e092-4770-bd80-00dff169d754.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TeeZee_DoubleBagel-57B-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DoubleBagel-57B-v1.0", - "id": "TeeZee/DoubleBagel-57B-v1.0", - "developer": "TeeZee", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 56.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3251 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1478 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa-2.0/d56ef415-0edf-4fde-8277-ae44b4bb4ed2.json b/data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa-2.0/d56ef415-0edf-4fde-8277-ae44b4bb4ed2.json deleted file mode 100644 index 2039e5d97..000000000 --- a/data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa-2.0/d56ef415-0edf-4fde-8277-ae44b4bb4ed2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Telugu-LLM-Labs_Indic-gemma-2b-finetuned-sft-Navarasa-2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Indic-gemma-2b-finetuned-sft-Navarasa-2.0", - "id": "Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa-2.0", - "developer": "Telugu-LLM-Labs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2103 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3899 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1279 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-7b-finetuned-sft-Navarasa-2.0/a0a1beb8-ee9a-4e88-b939-6e0104ed76a7.json b/data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-7b-finetuned-sft-Navarasa-2.0/a0a1beb8-ee9a-4e88-b939-6e0104ed76a7.json deleted file mode 100644 index a9a9aa606..000000000 --- a/data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-7b-finetuned-sft-Navarasa-2.0/a0a1beb8-ee9a-4e88-b939-6e0104ed76a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Telugu-LLM-Labs_Indic-gemma-7b-finetuned-sft-Navarasa-2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Indic-gemma-7b-finetuned-sft-Navarasa-2.0", - "id": "Telugu-LLM-Labs/Indic-gemma-7b-finetuned-sft-Navarasa-2.0", - "developer": "Telugu-LLM-Labs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3237 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4023 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4083 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B-Instruct/f9b7c3ee-ea8b-42f0-a55a-6171d4e3d0ea.json b/data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B-Instruct/f9b7c3ee-ea8b-42f0-a55a-6171d4e3d0ea.json deleted file mode 100644 index 2c3b0edf0..000000000 --- a/data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B-Instruct/f9b7c3ee-ea8b-42f0-a55a-6171d4e3d0ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TencentARC_LLaMA-Pro-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA-Pro-8B-Instruct", - "id": "TencentARC/LLaMA-Pro-8B-Instruct", - "developer": "TencentARC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.357 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.419 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B/2c8c6c6a-ce95-4d11-a33a-d547859fee11.json 
b/data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B/2c8c6c6a-ce95-4d11-a33a-d547859fee11.json deleted file mode 100644 index 5af85f4e1..000000000 --- a/data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B/2c8c6c6a-ce95-4d11-a33a-d547859fee11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TencentARC_LLaMA-Pro-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA-Pro-8B", - "id": "TencentARC/LLaMA-Pro-8B", - "developer": "TencentARC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.357 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1811 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TencentARC/MetaMath-Mistral-Pro/47858744-3378-4ed4-9101-8acbc3a53cda.json b/data/hfopenllm_v2/TencentARC/MetaMath-Mistral-Pro/47858744-3378-4ed4-9101-8acbc3a53cda.json deleted file mode 100644 index 666b6ee5b..000000000 --- a/data/hfopenllm_v2/TencentARC/MetaMath-Mistral-Pro/47858744-3378-4ed4-9101-8acbc3a53cda.json +++ /dev/null @@ -1,132 +0,0 @@ 
-{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TencentARC_MetaMath-Mistral-Pro/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MetaMath-Mistral-Pro", - "id": "TencentARC/MetaMath-Mistral-Pro", - "developer": "TencentARC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.987 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2472 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TencentARC/Mistral_Pro_8B_v0.1/2aaeaaa7-89ed-4666-b0a5-8c1320ec4ec5.json b/data/hfopenllm_v2/TencentARC/Mistral_Pro_8B_v0.1/2aaeaaa7-89ed-4666-b0a5-8c1320ec4ec5.json deleted file mode 100644 index 4307e8288..000000000 --- a/data/hfopenllm_v2/TencentARC/Mistral_Pro_8B_v0.1/2aaeaaa7-89ed-4666-b0a5-8c1320ec4ec5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TencentARC_Mistral_Pro_8B_v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral_Pro_8B_v0.1", - "id": "TencentARC/Mistral_Pro_8B_v0.1", - "developer": "TencentARC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.987 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2765 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Cydonia-22B-v1.2/23ae6a72-5a1f-4961-8662-feb4d8ad8a26.json b/data/hfopenllm_v2/TheDrummer/Cydonia-22B-v1.2/23ae6a72-5a1f-4961-8662-feb4d8ad8a26.json deleted file mode 100644 index 66b44eb6d..000000000 --- a/data/hfopenllm_v2/TheDrummer/Cydonia-22B-v1.2/23ae6a72-5a1f-4961-8662-feb4d8ad8a26.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Cydonia-22B-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cydonia-22B-v1.2", - "id": "TheDrummer/Cydonia-22B-v1.2", - "developer": "TheDrummer", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5635 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5809 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Gemmasutra-9B-v1/312ec315-6175-4f99-8741-97d97eb26b47.json b/data/hfopenllm_v2/TheDrummer/Gemmasutra-9B-v1/312ec315-6175-4f99-8741-97d97eb26b47.json deleted file mode 100644 index d22fd945e..000000000 --- a/data/hfopenllm_v2/TheDrummer/Gemmasutra-9B-v1/312ec315-6175-4f99-8741-97d97eb26b47.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Gemmasutra-9B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemmasutra-9B-v1", - "id": "TheDrummer/Gemmasutra-9B-v1", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5887 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Gemmasutra-Mini-2B-v1/7869bbe3-fd17-4e6d-9546-94d3df5e83ef.json b/data/hfopenllm_v2/TheDrummer/Gemmasutra-Mini-2B-v1/7869bbe3-fd17-4e6d-9546-94d3df5e83ef.json deleted file mode 100644 index ebce47bf5..000000000 --- a/data/hfopenllm_v2/TheDrummer/Gemmasutra-Mini-2B-v1/7869bbe3-fd17-4e6d-9546-94d3df5e83ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Gemmasutra-Mini-2B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemmasutra-Mini-2B-v1", - "id": "TheDrummer/Gemmasutra-Mini-2B-v1", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.2549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2055 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Llama-3SOME-8B-v2/68c9fb85-f90e-442f-aa96-458dabe30b39.json b/data/hfopenllm_v2/TheDrummer/Llama-3SOME-8B-v2/68c9fb85-f90e-442f-aa96-458dabe30b39.json deleted file mode 100644 index 4ed973176..000000000 --- a/data/hfopenllm_v2/TheDrummer/Llama-3SOME-8B-v2/68c9fb85-f90e-442f-aa96-458dabe30b39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Llama-3SOME-8B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3SOME-8B-v2", - "id": "TheDrummer/Llama-3SOME-8B-v2", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4508 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3833 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Ministrations-8B-v1/6891d1dd-0e1a-42e8-9206-64a4c71854f9.json b/data/hfopenllm_v2/TheDrummer/Ministrations-8B-v1/6891d1dd-0e1a-42e8-9206-64a4c71854f9.json deleted file mode 100644 index f103c0ed1..000000000 --- a/data/hfopenllm_v2/TheDrummer/Ministrations-8B-v1/6891d1dd-0e1a-42e8-9206-64a4c71854f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Ministrations-8B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ministrations-8B-v1", - "id": "TheDrummer/Ministrations-8B-v1", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.02 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2822 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4877 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1843 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Rocinante-12B-v1/c62eb6b3-2a3d-45bd-acdf-bad717e51766.json b/data/hfopenllm_v2/TheDrummer/Rocinante-12B-v1/c62eb6b3-2a3d-45bd-acdf-bad717e51766.json deleted file mode 100644 index f5272f067..000000000 --- a/data/hfopenllm_v2/TheDrummer/Rocinante-12B-v1/c62eb6b3-2a3d-45bd-acdf-bad717e51766.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Rocinante-12B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rocinante-12B-v1", - "id": "TheDrummer/Rocinante-12B-v1", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6076 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5065 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3477 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v1/55d4a6ae-44e5-4a1b-9509-299fbc6c3a36.json b/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v1/55d4a6ae-44e5-4a1b-9509-299fbc6c3a36.json deleted file mode 100644 index d5d96478e..000000000 --- a/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v1/55d4a6ae-44e5-4a1b-9509-299fbc6c3a36.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Tiger-Gemma-9B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tiger-Gemma-9B-v1", - "id": "TheDrummer/Tiger-Gemma-9B-v1", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7282 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5704 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1835 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v2/227e3e19-29d6-414f-b538-9f6f89d47677.json b/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v2/227e3e19-29d6-414f-b538-9f6f89d47677.json deleted file mode 100644 index 8fccf8d00..000000000 --- a/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v2/227e3e19-29d6-414f-b538-9f6f89d47677.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Tiger-Gemma-9B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tiger-Gemma-9B-v2", - "id": "TheDrummer/Tiger-Gemma-9B-v2", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5617 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v3/e922ac2c-e8d0-48f2-99fc-da70c925136c.json b/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v3/e922ac2c-e8d0-48f2-99fc-da70c925136c.json deleted file mode 100644 index 4a7777424..000000000 --- a/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v3/e922ac2c-e8d0-48f2-99fc-da70c925136c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Tiger-Gemma-9B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tiger-Gemma-9B-v3", - "id": "TheDrummer/Tiger-Gemma-9B-v3", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5812 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1624 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4059 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrunkenSnail/Daughter-of-Rhodia-12B/59f93c1c-3712-4ee2-a3d2-999e5acc2ee5.json b/data/hfopenllm_v2/TheDrunkenSnail/Daughter-of-Rhodia-12B/59f93c1c-3712-4ee2-a3d2-999e5acc2ee5.json deleted file mode 100644 index c592a549e..000000000 --- a/data/hfopenllm_v2/TheDrunkenSnail/Daughter-of-Rhodia-12B/59f93c1c-3712-4ee2-a3d2-999e5acc2ee5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrunkenSnail_Daughter-of-Rhodia-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Daughter-of-Rhodia-12B", - "id": "TheDrunkenSnail/Daughter-of-Rhodia-12B", - "developer": "TheDrunkenSnail", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6904 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5179 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4348 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrunkenSnail/Mother-of-Rhodia-12B/a98dcf1e-6abb-402b-9e0c-da7c23b74bde.json b/data/hfopenllm_v2/TheDrunkenSnail/Mother-of-Rhodia-12B/a98dcf1e-6abb-402b-9e0c-da7c23b74bde.json deleted file mode 100644 index e5301b44e..000000000 --- a/data/hfopenllm_v2/TheDrunkenSnail/Mother-of-Rhodia-12B/a98dcf1e-6abb-402b-9e0c-da7c23b74bde.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrunkenSnail_Mother-of-Rhodia-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mother-of-Rhodia-12B", - "id": "TheDrunkenSnail/Mother-of-Rhodia-12B", - "developer": "TheDrunkenSnail", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4948 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrunkenSnail/Son-of-Rhodia/a889f561-0d8a-4345-9131-0a897ec215ac.json b/data/hfopenllm_v2/TheDrunkenSnail/Son-of-Rhodia/a889f561-0d8a-4345-9131-0a897ec215ac.json deleted file mode 100644 index b5fd619fc..000000000 --- a/data/hfopenllm_v2/TheDrunkenSnail/Son-of-Rhodia/a889f561-0d8a-4345-9131-0a897ec215ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrunkenSnail_Son-of-Rhodia/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Son-of-Rhodia", - "id": "TheDrunkenSnail/Son-of-Rhodia", - "developer": "TheDrunkenSnail", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7046 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3608 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheHierophant/Underground-Cognitive-V0.3-test/6402facc-6258-43a4-a0fd-78e21765c504.json 
b/data/hfopenllm_v2/TheHierophant/Underground-Cognitive-V0.3-test/6402facc-6258-43a4-a0fd-78e21765c504.json deleted file mode 100644 index 53650d960..000000000 --- a/data/hfopenllm_v2/TheHierophant/Underground-Cognitive-V0.3-test/6402facc-6258-43a4-a0fd-78e21765c504.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheHierophant_Underground-Cognitive-V0.3-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Underground-Cognitive-V0.3-test", - "id": "TheHierophant/Underground-Cognitive-V0.3-test", - "developer": "TheHierophant", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3318 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheTsar1209/nemo-carpmuscle-v0.1/29fbd2e0-e08a-48f4-905e-d2aa54886915.json b/data/hfopenllm_v2/TheTsar1209/nemo-carpmuscle-v0.1/29fbd2e0-e08a-48f4-905e-d2aa54886915.json deleted file mode 100644 index 2f11b0d16..000000000 --- 
a/data/hfopenllm_v2/TheTsar1209/nemo-carpmuscle-v0.1/29fbd2e0-e08a-48f4-905e-d2aa54886915.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheTsar1209_nemo-carpmuscle-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nemo-carpmuscle-v0.1", - "id": "TheTsar1209/nemo-carpmuscle-v0.1", - "developer": "TheTsar1209", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2276 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-r-v0.3/313e0379-d3ea-4f5a-8e06-4b0a94317487.json b/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-r-v0.3/313e0379-d3ea-4f5a-8e06-4b0a94317487.json deleted file mode 100644 index cd556b29c..000000000 --- a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-r-v0.3/313e0379-d3ea-4f5a-8e06-4b0a94317487.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/TheTsar1209_qwen-carpmuscle-r-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-carpmuscle-r-v0.3", - "id": "TheTsar1209/qwen-carpmuscle-r-v0.3", - "developer": "TheTsar1209", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4455 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6227 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4278 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.1/f326fbd0-5f92-4324-a587-1f08cf7da208.json b/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.1/f326fbd0-5f92-4324-a587-1f08cf7da208.json deleted file mode 100644 index ee6712ad0..000000000 --- a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.1/f326fbd0-5f92-4324-a587-1f08cf7da208.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheTsar1209_qwen-carpmuscle-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-carpmuscle-v0.1", - "id": "TheTsar1209/qwen-carpmuscle-v0.1", - "developer": "TheTsar1209", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5622 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6434 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2628 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.2/d61310e9-5267-4a87-8e24-ae25172cd64e.json b/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.2/d61310e9-5267-4a87-8e24-ae25172cd64e.json deleted file mode 100644 index 13326c754..000000000 --- a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.2/d61310e9-5267-4a87-8e24-ae25172cd64e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheTsar1209_qwen-carpmuscle-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-carpmuscle-v0.2", - "id": "TheTsar1209/qwen-carpmuscle-v0.2", - "developer": "TheTsar1209", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5257 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6387 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2832 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.3/60953e5e-523d-43c0-ad00-f746308030b1.json b/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.3/60953e5e-523d-43c0-ad00-f746308030b1.json deleted file mode 100644 index 246e41951..000000000 --- a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.3/60953e5e-523d-43c0-ad00-f746308030b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheTsar1209_qwen-carpmuscle-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-carpmuscle-v0.3", - "id": "TheTsar1209/qwen-carpmuscle-v0.3", - "developer": "TheTsar1209", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6152 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3134 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4132 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5062 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4.1/5afd8861-d7cb-45cd-af1b-6db966cb56e0.json b/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4.1/5afd8861-d7cb-45cd-af1b-6db966cb56e0.json deleted file mode 100644 index 255cd662b..000000000 --- a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4.1/5afd8861-d7cb-45cd-af1b-6db966cb56e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheTsar1209_qwen-carpmuscle-v0.4.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-carpmuscle-v0.4.1", - "id": "TheTsar1209/qwen-carpmuscle-v0.4.1", - "developer": "TheTsar1209", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2779 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4/c3972df1-4414-4c71-b473-fb9459cf085b.json b/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4/c3972df1-4414-4c71-b473-fb9459cf085b.json deleted file mode 100644 index 9edb7f769..000000000 --- a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4/c3972df1-4414-4c71-b473-fb9459cf085b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheTsar1209_qwen-carpmuscle-v0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-carpmuscle-v0.4", - "id": "TheTsar1209/qwen-carpmuscle-v0.4", - "developer": "TheTsar1209", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7202 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6454 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2772 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Tijmen2/cosmosage-v3/b89d54b7-2329-4608-b9f6-07017e63f1cd.json b/data/hfopenllm_v2/Tijmen2/cosmosage-v3/b89d54b7-2329-4608-b9f6-07017e63f1cd.json deleted file mode 100644 index 39d760183..000000000 --- a/data/hfopenllm_v2/Tijmen2/cosmosage-v3/b89d54b7-2329-4608-b9f6-07017e63f1cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Tijmen2_cosmosage-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cosmosage-v3", - "id": "Tijmen2/cosmosage-v3", - "developer": "Tijmen2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4551 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2486 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.1/50389350-af23-41ba-af46-5ffe338ff9d2.json b/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.1/50389350-af23-41ba-af46-5ffe338ff9d2.json deleted file mode 100644 index a2c39dd23..000000000 --- a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.1/50389350-af23-41ba-af46-5ffe338ff9d2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TinyLlama_TinyLlama-1.1B-Chat-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama-1.1B-Chat-v0.1", - "id": "TinyLlama/TinyLlama-1.1B-Chat-v0.1", - "developer": "TinyLlama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1479 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3592 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.5/b8f8f045-2306-43ad-8fa0-6a8bdb494db6.json b/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.5/b8f8f045-2306-43ad-8fa0-6a8bdb494db6.json deleted file mode 100644 index f0a1f9188..000000000 --- a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.5/b8f8f045-2306-43ad-8fa0-6a8bdb494db6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TinyLlama_TinyLlama-1.1B-Chat-v0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama-1.1B-Chat-v0.5", - "id": "TinyLlama/TinyLlama-1.1B-Chat-v0.5", - "developer": "TinyLlama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1634 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1096 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.6/7cd59011-75d7-4497-956c-322d5d609c5f.json b/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.6/7cd59011-75d7-4497-956c-322d5d609c5f.json deleted file mode 100644 index 07b269a48..000000000 --- a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.6/7cd59011-75d7-4497-956c-322d5d609c5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TinyLlama_TinyLlama-1.1B-Chat-v0.6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama-1.1B-Chat-v0.6", - "id": "TinyLlama/TinyLlama-1.1B-Chat-v0.6", - "developer": "TinyLlama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1574 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3067 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1149 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v1.0/1313d865-9c5b-45d2-ad64-629c65f07f2c.json b/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v1.0/1313d865-9c5b-45d2-ad64-629c65f07f2c.json deleted file mode 100644 index 03ae524fb..000000000 --- a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v1.0/1313d865-9c5b-45d2-ad64-629c65f07f2c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TinyLlama_TinyLlama-1.1B-Chat-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama-1.1B-Chat-v1.0", - "id": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "developer": "TinyLlama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0596 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1101 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/0efc2583-bf21-4b60-96cc-716928768eb1.json b/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/0efc2583-bf21-4b60-96cc-716928768eb1.json deleted file mode 100644 index 36550ed25..000000000 --- a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/0efc2583-bf21-4b60-96cc-716928768eb1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TinyLlama_TinyLlama-1.1B-intermediate-step-1431k-3T/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama-1.1B-intermediate-step-1431k-3T", - "id": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", - "developer": "TinyLlama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3071 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TinyLlama/TinyLlama_v1.1/be0a2737-19a0-4401-998a-a03663467133.json b/data/hfopenllm_v2/TinyLlama/TinyLlama_v1.1/be0a2737-19a0-4401-998a-a03663467133.json deleted file mode 100644 index 0ccc783b3..000000000 --- a/data/hfopenllm_v2/TinyLlama/TinyLlama_v1.1/be0a2737-19a0-4401-998a-a03663467133.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TinyLlama_TinyLlama_v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama_v1.1", - "id": "TinyLlama/TinyLlama_v1.1", - "developer": "TinyLlama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2001 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3024 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1049 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ToastyPigeon/Sto-vo-kor-12B/71720e07-2de0-4402-bdfd-102150c61765.json b/data/hfopenllm_v2/ToastyPigeon/Sto-vo-kor-12B/71720e07-2de0-4402-bdfd-102150c61765.json deleted file mode 100644 index 65bce7ce5..000000000 --- a/data/hfopenllm_v2/ToastyPigeon/Sto-vo-kor-12B/71720e07-2de0-4402-bdfd-102150c61765.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ToastyPigeon_Sto-vo-kor-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sto-vo-kor-12B", - "id": "ToastyPigeon/Sto-vo-kor-12B", - "developer": "ToastyPigeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5501 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5065 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Trappu/Magnum-Picaro-0.7-v2-12b/38c84c69-5cdb-4f24-820d-4b39c5b118ff.json b/data/hfopenllm_v2/Trappu/Magnum-Picaro-0.7-v2-12b/38c84c69-5cdb-4f24-820d-4b39c5b118ff.json deleted file mode 100644 index c9c3a7878..000000000 --- a/data/hfopenllm_v2/Trappu/Magnum-Picaro-0.7-v2-12b/38c84c69-5cdb-4f24-820d-4b39c5b118ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Trappu_Magnum-Picaro-0.7-v2-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnum-Picaro-0.7-v2-12b", - "id": "Trappu/Magnum-Picaro-0.7-v2-12b", - "developer": "Trappu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Trappu/Nemo-Picaro-12B/de9d274d-f213-4037-9711-3e9d3dbbcc96.json b/data/hfopenllm_v2/Trappu/Nemo-Picaro-12B/de9d274d-f213-4037-9711-3e9d3dbbcc96.json deleted file mode 100644 index 0162cd45d..000000000 --- 
a/data/hfopenllm_v2/Trappu/Nemo-Picaro-12B/de9d274d-f213-4037-9711-3e9d3dbbcc96.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Trappu_Nemo-Picaro-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemo-Picaro-12B", - "id": "Trappu/Nemo-Picaro-12B", - "developer": "Trappu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2577 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4726 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3605 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Tremontaine/L3-12B-Lunaris-v1/92381da4-b9d1-43c4-a5c9-59f375017e11.json b/data/hfopenllm_v2/Tremontaine/L3-12B-Lunaris-v1/92381da4-b9d1-43c4-a5c9-59f375017e11.json deleted file mode 100644 index 8a5408f5a..000000000 --- a/data/hfopenllm_v2/Tremontaine/L3-12B-Lunaris-v1/92381da4-b9d1-43c4-a5c9-59f375017e11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Tremontaine_L3-12B-Lunaris-v1/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-12B-Lunaris-v1", - "id": "Tremontaine/L3-12B-Lunaris-v1", - "developer": "Tremontaine", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 11.52 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6909 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Annunaki-12b/44ab6a50-027d-47df-a518-5aa944eb2a61.json b/data/hfopenllm_v2/Triangle104/Annunaki-12b/44ab6a50-027d-47df-a518-5aa944eb2a61.json deleted file mode 100644 index d0a2060be..000000000 --- a/data/hfopenllm_v2/Triangle104/Annunaki-12b/44ab6a50-027d-47df-a518-5aa944eb2a61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Annunaki-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Annunaki-12b", - "id": 
"Triangle104/Annunaki-12b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5499 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3721 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/BigTalker-Lite-8B/2a1947d7-74e0-43d0-931d-b2862348e90a.json b/data/hfopenllm_v2/Triangle104/BigTalker-Lite-8B/2a1947d7-74e0-43d0-931d-b2862348e90a.json deleted file mode 100644 index c521e8873..000000000 --- a/data/hfopenllm_v2/Triangle104/BigTalker-Lite-8B/2a1947d7-74e0-43d0-931d-b2862348e90a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_BigTalker-Lite-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BigTalker-Lite-8B", - "id": "Triangle104/BigTalker-Lite-8B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ 
- { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3689 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Chatty-Harry_V2.0/3677b71c-387d-4182-b15d-c3525bc7bc36.json b/data/hfopenllm_v2/Triangle104/Chatty-Harry_V2.0/3677b71c-387d-4182-b15d-c3525bc7bc36.json deleted file mode 100644 index 2703e6bd2..000000000 --- a/data/hfopenllm_v2/Triangle104/Chatty-Harry_V2.0/3677b71c-387d-4182-b15d-c3525bc7bc36.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Chatty-Harry_V2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chatty-Harry_V2.0", - "id": "Triangle104/Chatty-Harry_V2.0", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3326 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4078 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Chatty-Harry_V3.0/6b125a8e-5b53-48ca-8875-926249879f39.json b/data/hfopenllm_v2/Triangle104/Chatty-Harry_V3.0/6b125a8e-5b53-48ca-8875-926249879f39.json deleted file mode 100644 index 58fe5380d..000000000 --- a/data/hfopenllm_v2/Triangle104/Chatty-Harry_V3.0/6b125a8e-5b53-48ca-8875-926249879f39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Chatty-Harry_V3.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chatty-Harry_V3.0", - "id": "Triangle104/Chatty-Harry_V3.0", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3675 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Chronos-Prism_V1.0/af851d4b-69d4-49a9-a160-a180146c3963.json b/data/hfopenllm_v2/Triangle104/Chronos-Prism_V1.0/af851d4b-69d4-49a9-a160-a180146c3963.json deleted file mode 100644 index a35b89646..000000000 --- a/data/hfopenllm_v2/Triangle104/Chronos-Prism_V1.0/af851d4b-69d4-49a9-a160-a180146c3963.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Chronos-Prism_V1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chronos-Prism_V1.0", - "id": "Triangle104/Chronos-Prism_V1.0", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3259 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5554 - } - }, 
- { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4263 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1/7aa6ce37-c0e4-48ce-b9db-f158ac47d366.json b/data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1/7aa6ce37-c0e4-48ce-b9db-f158ac47d366.json deleted file mode 100644 index bb6dc9f21..000000000 --- a/data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1/7aa6ce37-c0e4-48ce-b9db-f158ac47d366.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DS-Distilled-Hermes-Llama-3.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DS-Distilled-Hermes-Llama-3.1", - "id": "Triangle104/DS-Distilled-Hermes-Llama-3.1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3229 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5117 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - 
}, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2931 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1_TIES/1bce093e-27c0-41ad-aad6-b656f6773ed5.json b/data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1_TIES/1bce093e-27c0-41ad-aad6-b656f6773ed5.json deleted file mode 100644 index 5e892b1b2..000000000 --- a/data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1_TIES/1bce093e-27c0-41ad-aad6-b656f6773ed5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DS-Distilled-Hermes-Llama-3.1_TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DS-Distilled-Hermes-Llama-3.1_TIES", - "id": "Triangle104/DS-Distilled-Hermes-Llama-3.1_TIES", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1364 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1104 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-10B-Harmony/5c6cffab-ef72-4e12-808c-c26ee8ec6999.json b/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-10B-Harmony/5c6cffab-ef72-4e12-808c-c26ee8ec6999.json deleted file mode 100644 index 8e46d73b5..000000000 --- a/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-10B-Harmony/5c6cffab-ef72-4e12-808c-c26ee8ec6999.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DS-R1-Distill-Q2.5-10B-Harmony/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DS-R1-Distill-Q2.5-10B-Harmony", - "id": "Triangle104/DS-R1-Distill-Q2.5-10B-Harmony", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 10.366 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1751 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2106 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3128 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-14B-Harmony_V0.1/e288a874-f750-4a90-be07-616094c220cf.json b/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-14B-Harmony_V0.1/e288a874-f750-4a90-be07-616094c220cf.json deleted file mode 100644 index 350f712ec..000000000 --- a/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-14B-Harmony_V0.1/e288a874-f750-4a90-be07-616094c220cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DS-R1-Distill-Q2.5-14B-Harmony_V0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DS-R1-Distill-Q2.5-14B-Harmony_V0.1", - "id": "Triangle104/DS-R1-Distill-Q2.5-14B-Harmony_V0.1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5783 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5567 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4601 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-7B-RP/0607da8d-3f4e-468a-91a6-b975261a87c0.json b/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-7B-RP/0607da8d-3f4e-468a-91a6-b975261a87c0.json deleted file mode 100644 index 0817ae888..000000000 --- a/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-7B-RP/0607da8d-3f4e-468a-91a6-b975261a87c0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DS-R1-Distill-Q2.5-7B-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DS-R1-Distill-Q2.5-7B-RP", - "id": "Triangle104/DS-R1-Distill-Q2.5-7B-RP", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4683 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2891 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DS-R1-Llama-8B-Harmony/be2cc2fd-c8e7-4421-b8c8-d3b937272d0d.json b/data/hfopenllm_v2/Triangle104/DS-R1-Llama-8B-Harmony/be2cc2fd-c8e7-4421-b8c8-d3b937272d0d.json deleted file mode 100644 index c407ea12b..000000000 --- a/data/hfopenllm_v2/Triangle104/DS-R1-Llama-8B-Harmony/be2cc2fd-c8e7-4421-b8c8-d3b937272d0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DS-R1-Llama-8B-Harmony/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DS-R1-Llama-8B-Harmony", - "id": "Triangle104/DS-R1-Llama-8B-Harmony", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4282 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2744 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DSR1-Distill-Llama-Lit-8B/15ffe64e-72fd-4e65-8632-babf137a386d.json b/data/hfopenllm_v2/Triangle104/DSR1-Distill-Llama-Lit-8B/15ffe64e-72fd-4e65-8632-babf137a386d.json deleted file mode 100644 index db0609b3f..000000000 --- a/data/hfopenllm_v2/Triangle104/DSR1-Distill-Llama-Lit-8B/15ffe64e-72fd-4e65-8632-babf137a386d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DSR1-Distill-Llama-Lit-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DSR1-Distill-Llama-Lit-8B", - "id": "Triangle104/DSR1-Distill-Llama-Lit-8B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1885 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DSR1-Distill-Qwen-7B-RP/ce1c0d4f-f5a3-49e7-ab77-65ff51bbd0ca.json b/data/hfopenllm_v2/Triangle104/DSR1-Distill-Qwen-7B-RP/ce1c0d4f-f5a3-49e7-ab77-65ff51bbd0ca.json deleted file mode 100644 index 78fbf1df3..000000000 --- a/data/hfopenllm_v2/Triangle104/DSR1-Distill-Qwen-7B-RP/ce1c0d4f-f5a3-49e7-ab77-65ff51bbd0ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DSR1-Distill-Qwen-7B-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DSR1-Distill-Qwen-7B-RP", - "id": "Triangle104/DSR1-Distill-Qwen-7B-RP", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3609 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4804 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3028 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Dark-Chivalry_V1.0/b5afab38-13ba-4abd-9d04-a433c41061c5.json b/data/hfopenllm_v2/Triangle104/Dark-Chivalry_V1.0/b5afab38-13ba-4abd-9d04-a433c41061c5.json deleted file mode 100644 index 48178e07b..000000000 --- a/data/hfopenllm_v2/Triangle104/Dark-Chivalry_V1.0/b5afab38-13ba-4abd-9d04-a433c41061c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Dark-Chivalry_V1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dark-Chivalry_V1.0", - "id": "Triangle104/Dark-Chivalry_V1.0", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4974 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4182 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B/a862c2a5-f66b-4d09-ac57-6cbe565f9f35.json 
b/data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B/a862c2a5-f66b-4d09-ac57-6cbe565f9f35.json deleted file mode 100644 index b8c909e09..000000000 --- a/data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B/a862c2a5-f66b-4d09-ac57-6cbe565f9f35.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Distilled-DarkPlanet-Allades-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Distilled-DarkPlanet-Allades-8B", - "id": "Triangle104/Distilled-DarkPlanet-Allades-8B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4634 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2901 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B_TIES/d8254f6c-8110-44d3-800e-101fc731d779.json b/data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B_TIES/d8254f6c-8110-44d3-800e-101fc731d779.json deleted file mode 100644 index 
ab8e49bea..000000000 --- a/data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B_TIES/d8254f6c-8110-44d3-800e-101fc731d779.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Distilled-DarkPlanet-Allades-8B_TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Distilled-DarkPlanet-Allades-8B_TIES", - "id": "Triangle104/Distilled-DarkPlanet-Allades-8B_TIES", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3892 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5042 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3868 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Distilled-Whiskey-8b/ccbcd5a7-2b98-4d90-ace1-3ad5971a5f18.json b/data/hfopenllm_v2/Triangle104/Distilled-Whiskey-8b/ccbcd5a7-2b98-4d90-ace1-3ad5971a5f18.json deleted file mode 100644 index 56ce9bac6..000000000 --- a/data/hfopenllm_v2/Triangle104/Distilled-Whiskey-8b/ccbcd5a7-2b98-4d90-ace1-3ad5971a5f18.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Distilled-Whiskey-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Distilled-Whiskey-8b", - "id": "Triangle104/Distilled-Whiskey-8b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2545 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Dolphin3-Llama3.2-Smart/c208b19b-4ecf-4fad-b931-54f65d4b711b.json b/data/hfopenllm_v2/Triangle104/Dolphin3-Llama3.2-Smart/c208b19b-4ecf-4fad-b931-54f65d4b711b.json deleted file mode 100644 index b54a4d6b2..000000000 --- a/data/hfopenllm_v2/Triangle104/Dolphin3-Llama3.2-Smart/c208b19b-4ecf-4fad-b931-54f65d4b711b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Dolphin3-Llama3.2-Smart/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dolphin3-Llama3.2-Smart", - "id": "Triangle104/Dolphin3-Llama3.2-Smart", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4137 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2195 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Gemmadevi-Stock-10B/debaf4a0-c734-47ea-bea0-2ddc65dc397d.json b/data/hfopenllm_v2/Triangle104/Gemmadevi-Stock-10B/debaf4a0-c734-47ea-bea0-2ddc65dc397d.json deleted file mode 100644 index edaf3ab86..000000000 --- a/data/hfopenllm_v2/Triangle104/Gemmadevi-Stock-10B/debaf4a0-c734-47ea-bea0-2ddc65dc397d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Gemmadevi-Stock-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemmadevi-Stock-10B", - "id": "Triangle104/Gemmadevi-Stock-10B", - 
"developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1582 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT-Summary/0eeb5962-ccc0-407b-92e6-7cf17c00941f.json b/data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT-Summary/0eeb5962-ccc0-407b-92e6-7cf17c00941f.json deleted file mode 100644 index 2948054fa..000000000 --- a/data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT-Summary/0eeb5962-ccc0-407b-92e6-7cf17c00941f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Hermes-Llama-3.2-CoT-Summary/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-Llama-3.2-CoT-Summary", - "id": "Triangle104/Hermes-Llama-3.2-CoT-Summary", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 
- } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.483 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2901 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT/4b60e863-482c-4f91-8cd1-6c993d3c5988.json b/data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT/4b60e863-482c-4f91-8cd1-6c993d3c5988.json deleted file mode 100644 index 156711b1f..000000000 --- a/data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT/4b60e863-482c-4f91-8cd1-6c993d3c5988.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Hermes-Llama-3.2-CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-Llama-3.2-CoT", - "id": "Triangle104/Hermes-Llama-3.2-CoT", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4616 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0952 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2947 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Hermes3-L3.1-DirtyHarry-8B/f5f0bc72-427d-4703-aab1-1bb1bea73895.json b/data/hfopenllm_v2/Triangle104/Hermes3-L3.1-DirtyHarry-8B/f5f0bc72-427d-4703-aab1-1bb1bea73895.json deleted file mode 100644 index 92f5b47d3..000000000 --- a/data/hfopenllm_v2/Triangle104/Hermes3-L3.1-DirtyHarry-8B/f5f0bc72-427d-4703-aab1-1bb1bea73895.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Hermes3-L3.1-DirtyHarry-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes3-L3.1-DirtyHarry-8B", - "id": "Triangle104/Hermes3-L3.1-DirtyHarry-8B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3242 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Herodotos-14B/aae7f543-7b5b-435f-a506-e3ab901a8c5a.json b/data/hfopenllm_v2/Triangle104/Herodotos-14B/aae7f543-7b5b-435f-a506-e3ab901a8c5a.json deleted file mode 100644 index 1a742c4be..000000000 --- a/data/hfopenllm_v2/Triangle104/Herodotos-14B/aae7f543-7b5b-435f-a506-e3ab901a8c5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Herodotos-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Herodotos-14B", - "id": "Triangle104/Herodotos-14B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Herodotos-14B_V0.1/6e6ff4c3-3cfd-4790-80c4-544d9cbe47e2.json b/data/hfopenllm_v2/Triangle104/Herodotos-14B_V0.1/6e6ff4c3-3cfd-4790-80c4-544d9cbe47e2.json deleted file mode 100644 index cc2ae0230..000000000 --- a/data/hfopenllm_v2/Triangle104/Herodotos-14B_V0.1/6e6ff4c3-3cfd-4790-80c4-544d9cbe47e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Herodotos-14B_V0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Herodotos-14B_V0.1", - "id": "Triangle104/Herodotos-14B_V0.1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.224 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink/3ee76278-89d4-44fb-a449-717534b00161.json b/data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink/3ee76278-89d4-44fb-a449-717534b00161.json deleted file mode 100644 index c76643578..000000000 --- a/data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink/3ee76278-89d4-44fb-a449-717534b00161.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_L3.1-8B-Dusky-Ink/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-8B-Dusky-Ink", - "id": "Triangle104/L3.1-8B-Dusky-Ink", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.453 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink_v0.r1/fa2854d3-9e2f-4f79-ac8c-e1cb5a638745.json b/data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink_v0.r1/fa2854d3-9e2f-4f79-ac8c-e1cb5a638745.json deleted file mode 100644 index 021c2c4e6..000000000 --- a/data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink_v0.r1/fa2854d3-9e2f-4f79-ac8c-e1cb5a638745.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_L3.1-8B-Dusky-Ink_v0.r1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-8B-Dusky-Ink_v0.r1", - "id": "Triangle104/L3.1-8B-Dusky-Ink_v0.r1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1985 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4337 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3206 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesBlackroot/9ddaa721-bf3a-416a-9be8-291188793cc9.json b/data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesBlackroot/9ddaa721-bf3a-416a-9be8-291188793cc9.json deleted file mode 100644 index e187b6fe9..000000000 --- a/data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesBlackroot/9ddaa721-bf3a-416a-9be8-291188793cc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_LThreePointOne-8B-HermesBlackroot/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LThreePointOne-8B-HermesBlackroot", - "id": "Triangle104/LThreePointOne-8B-HermesBlackroot", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1792 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4998 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3586 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesInk/d659077d-7261-4c69-862c-d61be21662a2.json b/data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesInk/d659077d-7261-4c69-862c-d61be21662a2.json deleted file mode 100644 index d949801cc..000000000 --- a/data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesInk/d659077d-7261-4c69-862c-d61be21662a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_LThreePointOne-8B-HermesInk/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LThreePointOne-8B-HermesInk", - "id": "Triangle104/LThreePointOne-8B-HermesInk", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5223 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1722 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3467 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Llama3.1-Allades-Lit-8b/e87ba227-c55e-4666-949d-b45913f8336b.json b/data/hfopenllm_v2/Triangle104/Llama3.1-Allades-Lit-8b/e87ba227-c55e-4666-949d-b45913f8336b.json deleted file mode 100644 index 8839950e8..000000000 --- a/data/hfopenllm_v2/Triangle104/Llama3.1-Allades-Lit-8b/e87ba227-c55e-4666-949d-b45913f8336b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Llama3.1-Allades-Lit-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-Allades-Lit-8b", - "id": "Triangle104/Llama3.1-Allades-Lit-8b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2461 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4183 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } 
- }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2724 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Llama3.1-cc-Lit-8b/077f683a-af6f-4a71-b599-b9b269546b7c.json b/data/hfopenllm_v2/Triangle104/Llama3.1-cc-Lit-8b/077f683a-af6f-4a71-b599-b9b269546b7c.json deleted file mode 100644 index 1c20850c6..000000000 --- a/data/hfopenllm_v2/Triangle104/Llama3.1-cc-Lit-8b/077f683a-af6f-4a71-b599-b9b269546b7c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Llama3.1-cc-Lit-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-cc-Lit-8b", - "id": "Triangle104/Llama3.1-cc-Lit-8b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2993 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3848 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3854 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Minerva-1.5b/54808b08-d10d-4a06-ab60-8d99039311b8.json b/data/hfopenllm_v2/Triangle104/Minerva-1.5b/54808b08-d10d-4a06-ab60-8d99039311b8.json deleted file mode 100644 index fc5d9a10e..000000000 --- a/data/hfopenllm_v2/Triangle104/Minerva-1.5b/54808b08-d10d-4a06-ab60-8d99039311b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Minerva-1.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minerva-1.5b", - "id": "Triangle104/Minerva-1.5b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3655 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2698 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Triangle104/Minerva-1.5b_V0.2/138e6fdb-7092-4ee6-be82-7bb86c1fc759.json b/data/hfopenllm_v2/Triangle104/Minerva-1.5b_V0.2/138e6fdb-7092-4ee6-be82-7bb86c1fc759.json deleted file mode 100644 index 5d8751a69..000000000 --- a/data/hfopenllm_v2/Triangle104/Minerva-1.5b_V0.2/138e6fdb-7092-4ee6-be82-7bb86c1fc759.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Minerva-1.5b_V0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minerva-1.5b_V0.2", - "id": "Triangle104/Minerva-1.5b_V0.2", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3083 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3989 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Minerva-10b/1b27423f-62cc-4189-a293-5af84ef1f2c8.json b/data/hfopenllm_v2/Triangle104/Minerva-10b/1b27423f-62cc-4189-a293-5af84ef1f2c8.json deleted file mode 100644 index 90b1e2629..000000000 --- 
a/data/hfopenllm_v2/Triangle104/Minerva-10b/1b27423f-62cc-4189-a293-5af84ef1f2c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Minerva-10b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minerva-10b", - "id": "Triangle104/Minerva-10b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 10.067 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3627 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2318 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Minerva-14b-V0.1/f5468512-d2c7-4486-9d31-bef61225af52.json b/data/hfopenllm_v2/Triangle104/Minerva-14b-V0.1/f5468512-d2c7-4486-9d31-bef61225af52.json deleted file mode 100644 index 4c579dddf..000000000 --- a/data/hfopenllm_v2/Triangle104/Minerva-14b-V0.1/f5468512-d2c7-4486-9d31-bef61225af52.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Minerva-14b-V0.1/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minerva-14b-V0.1", - "id": "Triangle104/Minerva-14b-V0.1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Minerva-14b/0e0ec1a9-76aa-4d7e-9c0e-946d6b000a6a.json b/data/hfopenllm_v2/Triangle104/Minerva-14b/0e0ec1a9-76aa-4d7e-9c0e-946d6b000a6a.json deleted file mode 100644 index 814ca6f10..000000000 --- a/data/hfopenllm_v2/Triangle104/Minerva-14b/0e0ec1a9-76aa-4d7e-9c0e-946d6b000a6a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Minerva-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minerva-14b", - "id": 
"Triangle104/Minerva-14b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3468 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6301 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Minerva-7b/07b87b98-0d61-4479-937f-7447565b4631.json b/data/hfopenllm_v2/Triangle104/Minerva-7b/07b87b98-0d61-4479-937f-7447565b4631.json deleted file mode 100644 index 6e62d362b..000000000 --- a/data/hfopenllm_v2/Triangle104/Minerva-7b/07b87b98-0d61-4479-937f-7447565b4631.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Minerva-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minerva-7b", - "id": "Triangle104/Minerva-7b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4444 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Minerva-8b/85b11b91-d686-49e9-8db0-971dd7cafb75.json b/data/hfopenllm_v2/Triangle104/Minerva-8b/85b11b91-d686-49e9-8db0-971dd7cafb75.json deleted file mode 100644 index fa931f160..000000000 --- a/data/hfopenllm_v2/Triangle104/Minerva-8b/85b11b91-d686-49e9-8db0-971dd7cafb75.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Minerva-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minerva-8b", - "id": "Triangle104/Minerva-8b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4669 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Mistral-Redemption-Arc/21bac032-a092-4afa-8d29-ebdefb3a0650.json b/data/hfopenllm_v2/Triangle104/Mistral-Redemption-Arc/21bac032-a092-4afa-8d29-ebdefb3a0650.json deleted file mode 100644 index 5d71ad3d3..000000000 --- a/data/hfopenllm_v2/Triangle104/Mistral-Redemption-Arc/21bac032-a092-4afa-8d29-ebdefb3a0650.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Mistral-Redemption-Arc/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Redemption-Arc", - "id": "Triangle104/Mistral-Redemption-Arc", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6255 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Mistral-Small-24b-Harmony/29e3a687-429f-4f33-ae5f-48db85127364.json b/data/hfopenllm_v2/Triangle104/Mistral-Small-24b-Harmony/29e3a687-429f-4f33-ae5f-48db85127364.json deleted file mode 100644 index 9760eb31d..000000000 --- a/data/hfopenllm_v2/Triangle104/Mistral-Small-24b-Harmony/29e3a687-429f-4f33-ae5f-48db85127364.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Mistral-Small-24b-Harmony/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-24b-Harmony", - "id": "Triangle104/Mistral-Small-24b-Harmony", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1687 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6434 - } - }, - { 
- "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1911 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4276 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.1/d98493a6-f237-4565-8508-9e4cc3188d2d.json b/data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.1/d98493a6-f237-4565-8508-9e4cc3188d2d.json deleted file mode 100644 index 6cd483d58..000000000 --- a/data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.1/d98493a6-f237-4565-8508-9e4cc3188d2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Pans_Gutenbergum_V0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pans_Gutenbergum_V0.1", - "id": "Triangle104/Pans_Gutenbergum_V0.1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3097 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5541 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1057 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.2/2def6fbd-7488-4e9f-a822-2405d4f7a315.json b/data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.2/2def6fbd-7488-4e9f-a822-2405d4f7a315.json deleted file mode 100644 index 9dd3b30ff..000000000 --- a/data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.2/2def6fbd-7488-4e9f-a822-2405d4f7a315.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Pans_Gutenbergum_V0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pans_Gutenbergum_V0.2", - "id": "Triangle104/Pans_Gutenbergum_V0.2", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3215 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3585 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Pantheon_ChatWaifu_V0.2/819143d4-9538-48b9-b7af-128bc15c518a.json b/data/hfopenllm_v2/Triangle104/Pantheon_ChatWaifu_V0.2/819143d4-9538-48b9-b7af-128bc15c518a.json deleted file mode 100644 index 643b17008..000000000 --- a/data/hfopenllm_v2/Triangle104/Pantheon_ChatWaifu_V0.2/819143d4-9538-48b9-b7af-128bc15c518a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Pantheon_ChatWaifu_V0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pantheon_ChatWaifu_V0.2", - "id": "Triangle104/Pantheon_ChatWaifu_V0.2", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2683 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5532 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4755 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3442 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Phi-4-AbliteratedRP/c29d47af-a9de-4edb-acac-6763c0d44ca3.json b/data/hfopenllm_v2/Triangle104/Phi-4-AbliteratedRP/c29d47af-a9de-4edb-acac-6763c0d44ca3.json deleted file mode 100644 index b786d9979..000000000 --- a/data/hfopenllm_v2/Triangle104/Phi-4-AbliteratedRP/c29d47af-a9de-4edb-acac-6763c0d44ca3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Phi-4-AbliteratedRP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-AbliteratedRP", - "id": "Triangle104/Phi-4-AbliteratedRP", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4923 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6709 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5308 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Phi4-RP-o1-Ablit/22bf3fb7-9235-4a57-b8fd-c85b12047b0e.json b/data/hfopenllm_v2/Triangle104/Phi4-RP-o1-Ablit/22bf3fb7-9235-4a57-b8fd-c85b12047b0e.json deleted file mode 100644 index 82381bf4b..000000000 --- a/data/hfopenllm_v2/Triangle104/Phi4-RP-o1-Ablit/22bf3fb7-9235-4a57-b8fd-c85b12047b0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Phi4-RP-o1-Ablit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4-RP-o1-Ablit", - "id": "Triangle104/Phi4-RP-o1-Ablit", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0239 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.663 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.4754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Phi4-RP-o1/2bea7014-460d-470b-918f-468b58d70fd6.json b/data/hfopenllm_v2/Triangle104/Phi4-RP-o1/2bea7014-460d-470b-918f-468b58d70fd6.json deleted file mode 100644 index 49827c7aa..000000000 --- a/data/hfopenllm_v2/Triangle104/Phi4-RP-o1/2bea7014-460d-470b-918f-468b58d70fd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Phi4-RP-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4-RP-o1", - "id": "Triangle104/Phi4-RP-o1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6653 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4756 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Porpoise-R1-Llama3.2-3b/3927a5dd-002b-441a-b769-ba68547cd5f3.json b/data/hfopenllm_v2/Triangle104/Porpoise-R1-Llama3.2-3b/3927a5dd-002b-441a-b769-ba68547cd5f3.json deleted file mode 100644 index 341da9ea4..000000000 --- a/data/hfopenllm_v2/Triangle104/Porpoise-R1-Llama3.2-3b/3927a5dd-002b-441a-b769-ba68547cd5f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Porpoise-R1-Llama3.2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Porpoise-R1-Llama3.2-3b", - "id": "Triangle104/Porpoise-R1-Llama3.2-3b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3824 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3576 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2117 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Triangle104/Q2.5-14B-Instruct-1M-Harmony/476fc734-dedd-4192-aa59-eb2f9dabf16b.json b/data/hfopenllm_v2/Triangle104/Q2.5-14B-Instruct-1M-Harmony/476fc734-dedd-4192-aa59-eb2f9dabf16b.json deleted file mode 100644 index b32ca4aa3..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-14B-Instruct-1M-Harmony/476fc734-dedd-4192-aa59-eb2f9dabf16b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-14B-Instruct-1M-Harmony/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-14B-Instruct-1M-Harmony", - "id": "Triangle104/Q2.5-14B-Instruct-1M-Harmony", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-AthensCOT/817e2fbe-0866-489f-b987-391228a68c53.json b/data/hfopenllm_v2/Triangle104/Q2.5-AthensCOT/817e2fbe-0866-489f-b987-391228a68c53.json 
deleted file mode 100644 index 46f11cded..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-AthensCOT/817e2fbe-0866-489f-b987-391228a68c53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-AthensCOT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-AthensCOT", - "id": "Triangle104/Q2.5-AthensCOT", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4573 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5542 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2915 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-CodeR1-3B/f25f5eb1-ff22-4be3-a639-a9d25207078f.json b/data/hfopenllm_v2/Triangle104/Q2.5-CodeR1-3B/f25f5eb1-ff22-4be3-a639-a9d25207078f.json deleted file mode 100644 index 61a0bc9d8..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-CodeR1-3B/f25f5eb1-ff22-4be3-a639-a9d25207078f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Triangle104_Q2.5-CodeR1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-CodeR1-3B", - "id": "Triangle104/Q2.5-CodeR1-3B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3588 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4661 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2979 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-EVACOT-7b/f71d1c31-184b-46be-a288-bdc92f0ebe09.json b/data/hfopenllm_v2/Triangle104/Q2.5-EVACOT-7b/f71d1c31-184b-46be-a288-bdc92f0ebe09.json deleted file mode 100644 index 2603a2922..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-EVACOT-7b/f71d1c31-184b-46be-a288-bdc92f0ebe09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-EVACOT-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-EVACOT-7b", - "id": "Triangle104/Q2.5-EVACOT-7b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5784 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2825 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4499 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-EvaHumane-RP/0d9547b3-7bef-4815-9c44-7d714fe81bbb.json b/data/hfopenllm_v2/Triangle104/Q2.5-EvaHumane-RP/0d9547b3-7bef-4815-9c44-7d714fe81bbb.json deleted file mode 100644 index 356d6f7ee..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-EvaHumane-RP/0d9547b3-7bef-4815-9c44-7d714fe81bbb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-EvaHumane-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-EvaHumane-RP", - "id": "Triangle104/Q2.5-EvaHumane-RP", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5328 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4276 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-Humane-RP/22dbc5a2-0ff6-4566-9bfd-e5ce314be597.json b/data/hfopenllm_v2/Triangle104/Q2.5-Humane-RP/22dbc5a2-0ff6-4566-9bfd-e5ce314be597.json deleted file mode 100644 index 99a18a092..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-Humane-RP/22dbc5a2-0ff6-4566-9bfd-e5ce314be597.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-Humane-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-Humane-RP", - "id": "Triangle104/Q2.5-Humane-RP", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5649 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3391 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4492 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-Instruct-1M_Harmony/afedb249-f1a5-42d6-b6c0-54b2cc303f64.json b/data/hfopenllm_v2/Triangle104/Q2.5-Instruct-1M_Harmony/afedb249-f1a5-42d6-b6c0-54b2cc303f64.json deleted file mode 100644 index cd9816ee7..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-Instruct-1M_Harmony/afedb249-f1a5-42d6-b6c0-54b2cc303f64.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-Instruct-1M_Harmony/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-Instruct-1M_Harmony", - "id": "Triangle104/Q2.5-Instruct-1M_Harmony", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6038 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5373 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3323 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-R1-3B/61b1bf5e-6aa4-4e90-af2c-dcf5fc9903f2.json b/data/hfopenllm_v2/Triangle104/Q2.5-R1-3B/61b1bf5e-6aa4-4e90-af2c-dcf5fc9903f2.json deleted file mode 100644 index 7e4fafbbe..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-R1-3B/61b1bf5e-6aa4-4e90-af2c-dcf5fc9903f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-R1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-R1-3B", - "id": "Triangle104/Q2.5-R1-3B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2674 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3813 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-R1-7B/c0adc04c-1e02-4891-a5a1-1fab0ddf18ca.json b/data/hfopenllm_v2/Triangle104/Q2.5-R1-7B/c0adc04c-1e02-4891-a5a1-1fab0ddf18ca.json deleted file mode 100644 index 7d190bdde..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-R1-7B/c0adc04c-1e02-4891-a5a1-1fab0ddf18ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-R1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-R1-7B", - "id": "Triangle104/Q2.5-R1-7B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1346 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Robo-Gutenberg_V1.0/cc57e6f0-ab55-4ab9-983c-63d74632d016.json b/data/hfopenllm_v2/Triangle104/Robo-Gutenberg_V1.0/cc57e6f0-ab55-4ab9-983c-63d74632d016.json deleted file mode 100644 index 96cf9ac6e..000000000 --- a/data/hfopenllm_v2/Triangle104/Robo-Gutenberg_V1.0/cc57e6f0-ab55-4ab9-983c-63d74632d016.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Robo-Gutenberg_V1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Robo-Gutenberg_V1.0", - "id": "Triangle104/Robo-Gutenberg_V1.0", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6008 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6537 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4744 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.0/0d3c5fdb-c4a5-4436-b9d4-f0f42cb4db96.json b/data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.0/0d3c5fdb-c4a5-4436-b9d4-f0f42cb4db96.json deleted file mode 100644 index 670f07b44..000000000 --- a/data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.0/0d3c5fdb-c4a5-4436-b9d4-f0f42cb4db96.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Rocinante-Prism_V2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rocinante-Prism_V2.0", - "id": "Triangle104/Rocinante-Prism_V2.0", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2616 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5361 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.1/a6ec2934-e9fd-481d-8f00-932603bc6e0a.json b/data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.1/a6ec2934-e9fd-481d-8f00-932603bc6e0a.json deleted file mode 100644 index 5d588967a..000000000 --- a/data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.1/a6ec2934-e9fd-481d-8f00-932603bc6e0a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Rocinante-Prism_V2.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rocinante-Prism_V2.1", - "id": "Triangle104/Rocinante-Prism_V2.1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2558 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5333 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", 
- "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/RomboHermes3-R1-Llama3.2-3b/e2553c93-60df-4126-9e64-ecd4a5003389.json b/data/hfopenllm_v2/Triangle104/RomboHermes3-R1-Llama3.2-3b/e2553c93-60df-4126-9e64-ecd4a5003389.json deleted file mode 100644 index 1d3c45558..000000000 --- a/data/hfopenllm_v2/Triangle104/RomboHermes3-R1-Llama3.2-3b/e2553c93-60df-4126-9e64-ecd4a5003389.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_RomboHermes3-R1-Llama3.2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RomboHermes3-R1-Llama3.2-3b", - "id": "Triangle104/RomboHermes3-R1-Llama3.2-3b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3007 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3657 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2957 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Rombos-Novasky-7B_V1c/e7c2fb42-e82a-4dac-9cc3-a9f41ab54e0f.json b/data/hfopenllm_v2/Triangle104/Rombos-Novasky-7B_V1c/e7c2fb42-e82a-4dac-9cc3-a9f41ab54e0f.json deleted file mode 100644 index d45631435..000000000 --- a/data/hfopenllm_v2/Triangle104/Rombos-Novasky-7B_V1c/e7c2fb42-e82a-4dac-9cc3-a9f41ab54e0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Rombos-Novasky-7B_V1c/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-Novasky-7B_V1c", - "id": "Triangle104/Rombos-Novasky-7B_V1c", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4349 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4465 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Set-70b/a807ee8c-509e-4b6d-a414-df24444d8a0a.json b/data/hfopenllm_v2/Triangle104/Set-70b/a807ee8c-509e-4b6d-a414-df24444d8a0a.json deleted file mode 100644 index c29b4d9d5..000000000 --- a/data/hfopenllm_v2/Triangle104/Set-70b/a807ee8c-509e-4b6d-a414-df24444d8a0a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Set-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Set-70b", - "id": "Triangle104/Set-70b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7643 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7014 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4463 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4696 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5442 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Tsunami-th/Tsunami-0.5-7B-Instruct/2199024b-7944-4950-8335-32a536efad02.json b/data/hfopenllm_v2/Tsunami-th/Tsunami-0.5-7B-Instruct/2199024b-7944-4950-8335-32a536efad02.json deleted file mode 100644 index 8d21eff8a..000000000 --- a/data/hfopenllm_v2/Tsunami-th/Tsunami-0.5-7B-Instruct/2199024b-7944-4950-8335-32a536efad02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Tsunami-th_Tsunami-0.5-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tsunami-0.5-7B-Instruct", - "id": "Tsunami-th/Tsunami-0.5-7B-Instruct", - "developer": "Tsunami-th", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4257 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Tsunami-th/Tsunami-0.5x-7B-Instruct/97919c86-6161-4548-95b9-d44263a29f8a.json b/data/hfopenllm_v2/Tsunami-th/Tsunami-0.5x-7B-Instruct/97919c86-6161-4548-95b9-d44263a29f8a.json deleted file mode 
100644 index 2f1a0d0d7..000000000 --- a/data/hfopenllm_v2/Tsunami-th/Tsunami-0.5x-7B-Instruct/97919c86-6161-4548-95b9-d44263a29f8a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Tsunami-th_Tsunami-0.5x-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tsunami-0.5x-7B-Instruct", - "id": "Tsunami-th/Tsunami-0.5x-7B-Instruct", - "developer": "Tsunami-th", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7099 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5593 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4667 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4458 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-14B-Instruct/c40c1a46-2e30-4cf1-bcf3-a316a793fbcd.json b/data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-14B-Instruct/c40c1a46-2e30-4cf1-bcf3-a316a793fbcd.json deleted file mode 100644 index 640901714..000000000 --- a/data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-14B-Instruct/c40c1a46-2e30-4cf1-bcf3-a316a793fbcd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/Tsunami-th_Tsunami-1.0-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tsunami-1.0-14B-Instruct", - "id": "Tsunami-th/Tsunami-1.0-14B-Instruct", - "developer": "Tsunami-th", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7829 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6439 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5249 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-7B-Instruct/c1294268-b5f5-4d64-b91a-147f58a21a47.json b/data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-7B-Instruct/c1294268-b5f5-4d64-b91a-147f58a21a47.json deleted file mode 100644 index 25c1c6f74..000000000 --- a/data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-7B-Instruct/c1294268-b5f5-4d64-b91a-147f58a21a47.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Tsunami-th_Tsunami-1.0-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tsunami-1.0-7B-Instruct", - "id": "Tsunami-th/Tsunami-1.0-7B-Instruct", - "developer": "Tsunami-th", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7309 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5491 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4335 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4493 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter1/2b029e6d-a0b8-4b6c-b62d-144b8dc4f739.json b/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter1/2b029e6d-a0b8-4b6c-b62d-144b8dc4f739.json deleted file mode 100644 index b9787022d..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter1/2b029e6d-a0b8-4b6c-b62d-144b8dc4f739.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-9B-It-SPPO-Iter1", - "id": "UCLA-AGI/Gemma-2-9B-It-SPPO-Iter1", - 
"developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4099 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3907 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter2/b926ca6c-60c9-4353-9671-0453b46d0222.json b/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter2/b926ca6c-60c9-4353-9671-0453b46d0222.json deleted file mode 100644 index 51f77c00b..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter2/b926ca6c-60c9-4353-9671-0453b46d0222.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-9B-It-SPPO-Iter2", - "id": "UCLA-AGI/Gemma-2-9B-It-SPPO-Iter2", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3/44db30b4-2010-4f96-a39e-9ccc8568374f.json b/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3/44db30b4-2010-4f96-a39e-9ccc8568374f.json deleted file mode 100644 index 6ac45353a..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3/44db30b4-2010-4f96-a39e-9ccc8568374f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-9B-It-SPPO-Iter3", - "id": "UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter1/2210d673-d417-46be-aeca-de48cd846e01.json b/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter1/2210d673-d417-46be-aeca-de48cd846e01.json deleted file mode 100644 index 336128396..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter1/2210d673-d417-46be-aeca-de48cd846e01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SPPO-Iter1", - "id": "UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter1", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7299 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5058 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3568 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter2/892d27cc-dfb3-40c7-ae0f-a7cd06784808.json b/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter2/892d27cc-dfb3-40c7-ae0f-a7cd06784808.json deleted file mode 100644 index 22ac5bc50..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter2/892d27cc-dfb3-40c7-ae0f-a7cd06784808.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SPPO-Iter2", - "id": "UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter2", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6989 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5089 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3692 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/49b3f293-721d-4d44-9748-88d1ce275050.json b/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/49b3f293-721d-4d44-9748-88d1ce275050.json deleted file mode 100644 index 973c80332..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/49b3f293-721d-4d44-9748-88d1ce275050.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SPPO-Iter3", - "id": "UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6834 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508 - } - }, - { - "evaluation_name": "MATH Level 5", 
- "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/70fb41fe-46af-49e3-8270-5882e12f710f.json b/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/70fb41fe-46af-49e3-8270-5882e12f710f.json deleted file mode 100644 index 9aab7af07..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/70fb41fe-46af-49e3-8270-5882e12f710f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SPPO-Iter3", - "id": "UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6703 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter1/13e2489f-9d96-4f68-8e22-c937604c2145.json b/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter1/13e2489f-9d96-4f68-8e22-c937604c2145.json deleted file mode 100644 index 44afb2750..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter1/13e2489f-9d96-4f68-8e22-c937604c2145.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral7B-PairRM-SPPO-Iter1", - "id": "UCLA-AGI/Mistral7B-PairRM-SPPO-Iter1", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4468 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3992 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2695 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter2/0c386ea0-4706-4a6f-994c-b6ee21dbce92.json b/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter2/0c386ea0-4706-4a6f-994c-b6ee21dbce92.json deleted file mode 100644 index b7fd6fe39..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter2/0c386ea0-4706-4a6f-994c-b6ee21dbce92.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral7B-PairRM-SPPO-Iter2", - "id": "UCLA-AGI/Mistral7B-PairRM-SPPO-Iter2", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4085 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2677 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter3/a8d5a193-6c87-4b5b-8ea3-b3ab78e73104.json b/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter3/a8d5a193-6c87-4b5b-8ea3-b3ab78e73104.json deleted file mode 100644 index 34544a27f..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter3/a8d5a193-6c87-4b5b-8ea3-b3ab78e73104.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral7B-PairRM-SPPO-Iter3", - "id": "UCLA-AGI/Mistral7B-PairRM-SPPO-Iter3", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO/4018f4bd-492a-4814-9a7a-1f0c376f2d2e.json b/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO/4018f4bd-492a-4814-9a7a-1f0c376f2d2e.json deleted file mode 100644 index d29d93827..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO/4018f4bd-492a-4814-9a7a-1f0c376f2d2e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Mistral7B-PairRM-SPPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral7B-PairRM-SPPO", - "id": "UCLA-AGI/Mistral7B-PairRM-SPPO", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4355 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4439 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2621 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UKzExecution/LlamaExecutor-8B-3.0.5/568072cb-118d-41af-bfe8-fa14cb4c7348.json b/data/hfopenllm_v2/UKzExecution/LlamaExecutor-8B-3.0.5/568072cb-118d-41af-bfe8-fa14cb4c7348.json deleted file mode 100644 index d992c32d0..000000000 --- a/data/hfopenllm_v2/UKzExecution/LlamaExecutor-8B-3.0.5/568072cb-118d-41af-bfe8-fa14cb4c7348.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UKzExecution_LlamaExecutor-8B-3.0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LlamaExecutor-8B-3.0.5", - "id": "UKzExecution/LlamaExecutor-8B-3.0.5", - "developer": "UKzExecution", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5006 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Unbabel/TowerInstruct-Mistral-7B-v0.2/a6d08766-8c36-41bf-8bbc-acdfdc3f8e23.json b/data/hfopenllm_v2/Unbabel/TowerInstruct-Mistral-7B-v0.2/a6d08766-8c36-41bf-8bbc-acdfdc3f8e23.json deleted file mode 100644 index 21f0ff869..000000000 --- a/data/hfopenllm_v2/Unbabel/TowerInstruct-Mistral-7B-v0.2/a6d08766-8c36-41bf-8bbc-acdfdc3f8e23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Unbabel_TowerInstruct-Mistral-7B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TowerInstruct-Mistral-7B-v0.2", - "id": "Unbabel/TowerInstruct-Mistral-7B-v0.2", - "developer": "Unbabel", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2843 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1968 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Undi95/MG-FinalMix-72B/2504fed5-c8a1-4ffc-8ce5-9559aa8c4325.json b/data/hfopenllm_v2/Undi95/MG-FinalMix-72B/2504fed5-c8a1-4ffc-8ce5-9559aa8c4325.json deleted file mode 100644 index c0dc3e377..000000000 --- a/data/hfopenllm_v2/Undi95/MG-FinalMix-72B/2504fed5-c8a1-4ffc-8ce5-9559aa8c4325.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Undi95_MG-FinalMix-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MG-FinalMix-72B", - "id": "Undi95/MG-FinalMix-72B", - "developer": "Undi95", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8014 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6973 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3973 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4823 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5427 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Undi95/Phi4-abliterated/359dde31-d9dc-4c22-b829-77df652dcc73.json b/data/hfopenllm_v2/Undi95/Phi4-abliterated/359dde31-d9dc-4c22-b829-77df652dcc73.json 
deleted file mode 100644 index 8b09b9cab..000000000 --- a/data/hfopenllm_v2/Undi95/Phi4-abliterated/359dde31-d9dc-4c22-b829-77df652dcc73.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Undi95_Phi4-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4-abliterated", - "id": "Undi95/Phi4-abliterated", - "developer": "Undi95", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6618 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6809 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/V3N0M/Jenna-Tiny-2.0/34a79823-b993-402a-89a7-538e126ee02a.json b/data/hfopenllm_v2/V3N0M/Jenna-Tiny-2.0/34a79823-b993-402a-89a7-538e126ee02a.json deleted file mode 100644 index 05681b21a..000000000 --- a/data/hfopenllm_v2/V3N0M/Jenna-Tiny-2.0/34a79823-b993-402a-89a7-538e126ee02a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/V3N0M_Jenna-Tiny-2.0/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jenna-Tiny-2.0", - "id": "V3N0M/Jenna-Tiny-2.0", - "developer": "V3N0M", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.631 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2309 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3148 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-70b-Instruct/f392c5c3-9bee-4111-9a22-6a1b706fd2ad.json b/data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-70b-Instruct/f392c5c3-9bee-4111-9a22-6a1b706fd2ad.json deleted file mode 100644 index a2a4784dd..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-70b-Instruct/f392c5c3-9bee-4111-9a22-6a1b706fd2ad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_Llama-3-SauerkrautLM-70b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-SauerkrautLM-70b-Instruct", - "id": "VAGOsolutions/Llama-3-SauerkrautLM-70b-Instruct", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6663 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct/73bbdd22-4e5f-496b-b39f-290d8e0d2aa4.json b/data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct/73bbdd22-4e5f-496b-b39f-290d8e0d2aa4.json deleted file mode 100644 index af84212e1..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct/73bbdd22-4e5f-496b-b39f-290d8e0d2aa4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_Llama-3-SauerkrautLM-8b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-SauerkrautLM-8b-Instruct", - "id": 
"VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4943 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-70b-Instruct/72a66eae-9c94-40e3-b3c9-211303e5cba8.json b/data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-70b-Instruct/72a66eae-9c94-40e3-b3c9-211303e5cba8.json deleted file mode 100644 index c22f37f6c..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-70b-Instruct/72a66eae-9c94-40e3-b3c9-211303e5cba8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_Llama-3.1-SauerkrautLM-70b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-SauerkrautLM-70b-Instruct", - "id": "VAGOsolutions/Llama-3.1-SauerkrautLM-70b-Instruct", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8656 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7006 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4711 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5335 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-8b-Instruct/ef7390b5-599b-4354-805b-9486e4ce34fa.json b/data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-8b-Instruct/ef7390b5-599b-4354-805b-9486e4ce34fa.json deleted file mode 100644 index c83915153..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-8b-Instruct/ef7390b5-599b-4354-805b-9486e4ce34fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_Llama-3.1-SauerkrautLM-8b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-SauerkrautLM-8b-Instruct", - "id": "VAGOsolutions/Llama-3.1-SauerkrautLM-8b-Instruct", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5115 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1941 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-1.5b/57f964c3-0504-4b60-9539-ce0e369816ea.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-1.5b/57f964c3-0504-4b60-9539-ce0e369816ea.json deleted file mode 100644 index f33769597..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-1.5b/57f964c3-0504-4b60-9539-ce0e369816ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-1.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-1.5b", - "id": "VAGOsolutions/SauerkrautLM-1.5b", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3704 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2151 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-HerO/4e6c0336-5d94-4417-a194-92a4d6f38481.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-HerO/4e6c0336-5d94-4417-a194-92a4d6f38481.json deleted file mode 100644 index dde4ae4ac..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-HerO/4e6c0336-5d94-4417-a194-92a4d6f38481.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-7b-HerO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-7b-HerO", - "id": "VAGOsolutions/SauerkrautLM-7b-HerO", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5346 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4904 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3046 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-LaserChat/fe38dea8-92f4-4fb2-afdf-c5932d7c9e27.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-LaserChat/fe38dea8-92f4-4fb2-afdf-c5932d7c9e27.json deleted file mode 100644 index e0f9d6aca..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-LaserChat/fe38dea8-92f4-4fb2-afdf-c5932d7c9e27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-7b-LaserChat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-7b-LaserChat", - "id": "VAGOsolutions/SauerkrautLM-7b-LaserChat", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5988 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4543 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-2b/5ced7497-5a05-40d2-80cb-cae63ca62022.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-2b/5ced7497-5a05-40d2-80cb-cae63ca62022.json deleted file mode 100644 index 66467bc2c..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-2b/5ced7497-5a05-40d2-80cb-cae63ca62022.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-Gemma-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-Gemma-2b", - "id": "VAGOsolutions/SauerkrautLM-Gemma-2b", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 
5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1469 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-7b/52a66aaa-193a-48ca-b693-4dcab811eaa3.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-7b/52a66aaa-193a-48ca-b693-4dcab811eaa3.json deleted file mode 100644 index f14465432..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-7b/52a66aaa-193a-48ca-b693-4dcab811eaa3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-Gemma-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-Gemma-7b", - "id": "VAGOsolutions/SauerkrautLM-Gemma-7b", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct/e0e4bcef-cb73-436b-9353-b18ade293e8b.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct/e0e4bcef-cb73-436b-9353-b18ade293e8b.json deleted file mode 100644 index 56dde9357..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct/e0e4bcef-cb73-436b-9353-b18ade293e8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-Mixtral-8x7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-Mixtral-8x7B-Instruct", - "id": "VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5602 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4204 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Nemo-12b-Instruct/1ae45791-7e47-4083-bd72-4530fa26893c.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Nemo-12b-Instruct/1ae45791-7e47-4083-bd72-4530fa26893c.json deleted file mode 100644 index fa2eb1b02..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Nemo-12b-Instruct/1ae45791-7e47-4083-bd72-4530fa26893c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-Nemo-12b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-Nemo-12b-Instruct", - "id": "VAGOsolutions/SauerkrautLM-Nemo-12b-Instruct", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5214 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4469 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3385 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Phi-3-medium/b2731f04-a9bd-4e36-a545-85be5b66f5a7.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Phi-3-medium/b2731f04-a9bd-4e36-a545-85be5b66f5a7.json deleted file mode 100644 index b7b0c2153..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Phi-3-medium/b2731f04-a9bd-4e36-a545-85be5b66f5a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-Phi-3-medium/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-Phi-3-medium", - "id": "VAGOsolutions/SauerkrautLM-Phi-3-medium", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4409 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6433 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4845 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4665 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-SOLAR-Instruct/ed6de552-d04b-4d51-8456-610e2cb41d85.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-SOLAR-Instruct/ed6de552-d04b-4d51-8456-610e2cb41d85.json deleted file mode 100644 index feb5d2dda..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-SOLAR-Instruct/ed6de552-d04b-4d51-8456-610e2cb41d85.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-SOLAR-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-SOLAR-Instruct", - "id": "VAGOsolutions/SauerkrautLM-SOLAR-Instruct", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4917 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5169 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3183 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-2b-it/3e08a589-d2b3-487b-900e-85725522a2e4.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-2b-it/3e08a589-d2b3-487b-900e-85725522a2e4.json deleted file mode 100644 index 49079f005..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-2b-it/3e08a589-d2b3-487b-900e-85725522a2e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-gemma-2-2b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-gemma-2-2b-it", - "id": "VAGOsolutions/SauerkrautLM-gemma-2-2b-it", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-9b-it/b2717503-d081-40ee-b1ed-fcadaf239049.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-9b-it/b2717503-d081-40ee-b1ed-fcadaf239049.json deleted file mode 100644 index e6e7502ff..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-9b-it/b2717503-d081-40ee-b1ed-fcadaf239049.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-gemma-2-9b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-gemma-2-9b-it", - "id": "VAGOsolutions/SauerkrautLM-gemma-2-9b-it", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3024 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4318 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-DPO/9915eb01-5c45-42b6-82a3-ad782411642f.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-DPO/9915eb01-5c45-42b6-82a3-ad782411642f.json deleted file mode 100644 index 7d69802d9..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-DPO/9915eb01-5c45-42b6-82a3-ad782411642f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-v2-14b-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-v2-14b-DPO", - "id": "VAGOsolutions/SauerkrautLM-v2-14b-DPO", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3165 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-SFT/190eb7ca-46db-4e1d-8b71-9bb20af74ede.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-SFT/190eb7ca-46db-4e1d-8b71-9bb20af74ede.json deleted file mode 100644 index ac7db90ff..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-SFT/190eb7ca-46db-4e1d-8b71-9bb20af74ede.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-v2-14b-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-v2-14b-SFT", - "id": "VAGOsolutions/SauerkrautLM-v2-14b-SFT", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5205 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B-r-v-0.1/86b9077d-9ec3-411d-84c5-326ba97742c1.json b/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B-r-v-0.1/86b9077d-9ec3-411d-84c5-326ba97742c1.json deleted file mode 100644 index 40a4383a3..000000000 --- a/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B-r-v-0.1/86b9077d-9ec3-411d-84c5-326ba97742c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VIRNECT_llama-3-Korean-8B-r-v-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Korean-8B-r-v-0.1", - "id": "VIRNECT/llama-3-Korean-8B-r-v-0.1", - "developer": "VIRNECT", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 16.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4916 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4806 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3675 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/18bfa50c-20be-4027-8ee7-f6cd1411c882.json b/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/18bfa50c-20be-4027-8ee7-f6cd1411c882.json deleted file mode 100644 index f155f09ea..000000000 --- 
a/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/18bfa50c-20be-4027-8ee7-f6cd1411c882.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VIRNECT_llama-3-Korean-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Korean-8B", - "id": "VIRNECT/llama-3-Korean-8B", - "developer": "VIRNECT", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5058 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3662 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3539 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/eb1a099a-48c7-412b-b62f-143537c41f06.json b/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/eb1a099a-48c7-412b-b62f-143537c41f06.json deleted file mode 100644 index 92a869ede..000000000 --- a/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/eb1a099a-48c7-412b-b62f-143537c41f06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VIRNECT_llama-3-Korean-8B/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Korean-8B", - "id": "VIRNECT/llama-3-Korean-8B", - "developer": "VIRNECT", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4918 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3536 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3-70B-Fireplace/e530a4b7-c2f6-4bad-bab5-2895e950ed63.json b/data/hfopenllm_v2/ValiantLabs/Llama3-70B-Fireplace/e530a4b7-c2f6-4bad-bab5-2895e950ed63.json deleted file mode 100644 index 60a42d8a2..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3-70B-Fireplace/e530a4b7-c2f6-4bad-bab5-2895e950ed63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3-70B-Fireplace/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Llama3-70B-Fireplace", - "id": "ValiantLabs/Llama3-70B-Fireplace", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7774 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4893 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3-70B-ShiningValiant2/52ad7152-feea-46a6-b2d8-20e1a70514ce.json b/data/hfopenllm_v2/ValiantLabs/Llama3-70B-ShiningValiant2/52ad7152-feea-46a6-b2d8-20e1a70514ce.json deleted file mode 100644 index 7ec6b966f..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3-70B-ShiningValiant2/52ad7152-feea-46a6-b2d8-20e1a70514ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3-70B-ShiningValiant2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-70B-ShiningValiant2", - "id": "ValiantLabs/Llama3-70B-ShiningValiant2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6122 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6338 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4898 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-70B-ShiningValiant2/a61162a6-ef3e-46f4-8aa2-241547fadea2.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-70B-ShiningValiant2/a61162a6-ef3e-46f4-8aa2-241547fadea2.json deleted file mode 100644 index 3438ef88c..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-70B-ShiningValiant2/a61162a6-ef3e-46f4-8aa2-241547fadea2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-70B-ShiningValiant2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-70B-ShiningValiant2", - "id": "ValiantLabs/Llama3.1-70B-ShiningValiant2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", 
- "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5355 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6738 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2915 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4681 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/9f208aef-8544-47c8-bb1f-a3841aff208b.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/9f208aef-8544-47c8-bb1f-a3841aff208b.json deleted file mode 100644 index e206b8681..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/9f208aef-8544-47c8-bb1f-a3841aff208b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-Cobalt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-Cobalt", - "id": "ValiantLabs/Llama3.1-8B-Cobalt", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.7168 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4911 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3512 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/da237ab6-df39-460f-9efc-e1649e1ac202.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/da237ab6-df39-460f-9efc-e1649e1ac202.json deleted file mode 100644 index aa9dee935..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/da237ab6-df39-460f-9efc-e1649e1ac202.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-Cobalt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-Cobalt", - "id": "ValiantLabs/Llama3.1-8B-Cobalt", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4947 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Enigma/c81b3193-9d01-4590-8b72-da97aa3c9dc4.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Enigma/c81b3193-9d01-4590-8b72-da97aa3c9dc4.json deleted file mode 100644 index c8bdca2cb..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Enigma/c81b3193-9d01-4590-8b72-da97aa3c9dc4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-Enigma/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-Enigma", - "id": "ValiantLabs/Llama3.1-8B-Enigma", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2681 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Esper2/1a9ffe50-69ae-48bc-b636-89431391eb37.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Esper2/1a9ffe50-69ae-48bc-b636-89431391eb37.json deleted file mode 100644 index ba7ad45fc..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Esper2/1a9ffe50-69ae-48bc-b636-89431391eb37.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-Esper2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-Esper2", - "id": "ValiantLabs/Llama3.1-8B-Esper2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.447 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3561 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2904 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/b0c67359-1da0-4f55-aa1c-f54f88038bd7.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/b0c67359-1da0-4f55-aa1c-f54f88038bd7.json deleted file mode 100644 index 2f2d8e167..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/b0c67359-1da0-4f55-aa1c-f54f88038bd7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-Fireplace2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-Fireplace2", - "id": "ValiantLabs/Llama3.1-8B-Fireplace2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5483 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2407 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/c700798b-583a-41be-94dd-382669bb495f.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/c700798b-583a-41be-94dd-382669bb495f.json deleted file mode 100644 index fca031505..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/c700798b-583a-41be-94dd-382669bb495f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-Fireplace2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-Fireplace2", - "id": "ValiantLabs/Llama3.1-8B-Fireplace2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4613 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/3c0b9735-2ef1-4f27-b94a-f246eb57b73c.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/3c0b9735-2ef1-4f27-b94a-f246eb57b73c.json deleted file mode 100644 index 2c1716490..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/3c0b9735-2ef1-4f27-b94a-f246eb57b73c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-ShiningValiant2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-ShiningValiant2", - "id": "ValiantLabs/Llama3.1-8B-ShiningValiant2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4774 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/e8c9501b-c985-4b78-a902-a1a030c72e60.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/e8c9501b-c985-4b78-a902-a1a030c72e60.json deleted file mode 100644 index b45bd8cf9..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/e8c9501b-c985-4b78-a902-a1a030c72e60.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-ShiningValiant2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-ShiningValiant2", - "id": "ValiantLabs/Llama3.1-8B-ShiningValiant2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2678 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.3959 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2927 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Enigma/df978fce-3373-4073-8c44-d6a83df1d9d1.json b/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Enigma/df978fce-3373-4073-8c44-d6a83df1d9d1.json deleted file mode 100644 index be7b22dec..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Enigma/df978fce-3373-4073-8c44-d6a83df1d9d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.2-3B-Enigma/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2-3B-Enigma", - "id": "ValiantLabs/Llama3.2-3B-Enigma", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3921 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2428 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Esper2/e46ee8d9-81af-4259-8fef-3d3113fb6168.json b/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Esper2/e46ee8d9-81af-4259-8fef-3d3113fb6168.json deleted file mode 100644 index 40bf9a025..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Esper2/e46ee8d9-81af-4259-8fef-3d3113fb6168.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.2-3B-Esper2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2-3B-Esper2", - "id": "ValiantLabs/Llama3.2-3B-Esper2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3808 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2257 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-ShiningValiant2/aa6ab404-89ef-4336-b811-7c8064e26107.json b/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-ShiningValiant2/aa6ab404-89ef-4336-b811-7c8064e26107.json deleted file mode 100644 index aad1ebcc3..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-ShiningValiant2/aa6ab404-89ef-4336-b811-7c8064e26107.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.2-3B-ShiningValiant2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2-3B-ShiningValiant2", - "id": "ValiantLabs/Llama3.2-3B-ShiningValiant2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2625 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2829 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Vikhrmodels/Vikhr-Llama3.1-8B-Instruct-R-21-09-24/a14e6c79-4a78-4c02-a7ca-35e783f32be1.json 
b/data/hfopenllm_v2/Vikhrmodels/Vikhr-Llama3.1-8B-Instruct-R-21-09-24/a14e6c79-4a78-4c02-a7ca-35e783f32be1.json deleted file mode 100644 index 3f2934658..000000000 --- a/data/hfopenllm_v2/Vikhrmodels/Vikhr-Llama3.1-8B-Instruct-R-21-09-24/a14e6c79-4a78-4c02-a7ca-35e783f32be1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Vikhrmodels_Vikhr-Llama3.1-8B-Instruct-R-21-09-24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Vikhr-Llama3.1-8B-Instruct-R-21-09-24", - "id": "Vikhrmodels/Vikhr-Llama3.1-8B-Instruct-R-21-09-24", - "developer": "Vikhrmodels", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6431 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5272 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2175 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3547 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24/ba1fb85b-bbc0-46ac-95d7-e61b91f65c2b.json b/data/hfopenllm_v2/Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24/ba1fb85b-bbc0-46ac-95d7-e61b91f65c2b.json deleted file mode 100644 
index 5191b77e3..000000000 --- a/data/hfopenllm_v2/Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24/ba1fb85b-bbc0-46ac-95d7-e61b91f65c2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Vikhrmodels_Vikhr-Nemo-12B-Instruct-R-21-09-24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Vikhr-Nemo-12B-Instruct-R-21-09-24", - "id": "Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24", - "developer": "Vikhrmodels", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5999 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5212 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1715 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/Bagel-Hermes-2x34B/f6312fc7-c7a8-45dc-a57c-91f56b4ca28a.json b/data/hfopenllm_v2/Weyaxi/Bagel-Hermes-2x34B/f6312fc7-c7a8-45dc-a57c-91f56b4ca28a.json deleted file mode 100644 index a15c3dfdc..000000000 --- a/data/hfopenllm_v2/Weyaxi/Bagel-Hermes-2x34B/f6312fc7-c7a8-45dc-a57c-91f56b4ca28a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/Weyaxi_Bagel-Hermes-2x34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bagel-Hermes-2x34B", - "id": "Weyaxi/Bagel-Hermes-2x34B", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 60.814 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5432 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4917 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/Bagel-Hermes-34B-Slerp/335f5c32-f3f0-4a16-8c9d-8f07b2aae54a.json b/data/hfopenllm_v2/Weyaxi/Bagel-Hermes-34B-Slerp/335f5c32-f3f0-4a16-8c9d-8f07b2aae54a.json deleted file mode 100644 index b2da0af11..000000000 --- a/data/hfopenllm_v2/Weyaxi/Bagel-Hermes-34B-Slerp/335f5c32-f3f0-4a16-8c9d-8f07b2aae54a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Weyaxi_Bagel-Hermes-34B-Slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bagel-Hermes-34B-Slerp", - "id": "Weyaxi/Bagel-Hermes-34B-Slerp", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4603 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5922 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4703 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/Einstein-v4-7B/b7c7a907-7ecc-4d5b-bc6f-8b8d82954b21.json b/data/hfopenllm_v2/Weyaxi/Einstein-v4-7B/b7c7a907-7ecc-4d5b-bc6f-8b8d82954b21.json deleted file mode 100644 index 42818be4e..000000000 --- a/data/hfopenllm_v2/Weyaxi/Einstein-v4-7B/b7c7a907-7ecc-4d5b-bc6f-8b8d82954b21.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Weyaxi_Einstein-v4-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Einstein-v4-7B", - "id": "Weyaxi/Einstein-v4-7B", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4708 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3849 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2259 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/Einstein-v6.1-Llama3-8B/112f01a2-f0fb-4257-86bf-61c9a184eb92.json b/data/hfopenllm_v2/Weyaxi/Einstein-v6.1-Llama3-8B/112f01a2-f0fb-4257-86bf-61c9a184eb92.json deleted file mode 100644 index f43785b03..000000000 --- a/data/hfopenllm_v2/Weyaxi/Einstein-v6.1-Llama3-8B/112f01a2-f0fb-4257-86bf-61c9a184eb92.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Weyaxi_Einstein-v6.1-Llama3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Einstein-v6.1-Llama3-8B", - "id": "Weyaxi/Einstein-v6.1-Llama3-8B", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4568 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4213 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/Einstein-v6.1-developed-by-Weyaxi-Llama3-8B/2d9410d6-7162-4811-bf7d-9de2c2b48fd2.json b/data/hfopenllm_v2/Weyaxi/Einstein-v6.1-developed-by-Weyaxi-Llama3-8B/2d9410d6-7162-4811-bf7d-9de2c2b48fd2.json deleted file mode 100644 index 4375030ce..000000000 --- a/data/hfopenllm_v2/Weyaxi/Einstein-v6.1-developed-by-Weyaxi-Llama3-8B/2d9410d6-7162-4811-bf7d-9de2c2b48fd2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Weyaxi_Einstein-v6.1-developed-by-Weyaxi-Llama3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Einstein-v6.1-developed-by-Weyaxi-Llama3-8B", - "id": "Weyaxi/Einstein-v6.1-developed-by-Weyaxi-Llama3-8B", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/Einstein-v7-Qwen2-7B/16ff8fa3-4676-473c-99ad-908ddb59d8ed.json b/data/hfopenllm_v2/Weyaxi/Einstein-v7-Qwen2-7B/16ff8fa3-4676-473c-99ad-908ddb59d8ed.json deleted file mode 100644 index 3fd32bab2..000000000 --- a/data/hfopenllm_v2/Weyaxi/Einstein-v7-Qwen2-7B/16ff8fa3-4676-473c-99ad-908ddb59d8ed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Weyaxi_Einstein-v7-Qwen2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Einstein-v7-Qwen2-7B", - "id": "Weyaxi/Einstein-v7-Qwen2-7B", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.41 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5161 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1994 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/Einstein-v8-Llama3.2-1B/9b153ac9-f95b-419b-b7f9-beccd769ddad.json b/data/hfopenllm_v2/Weyaxi/Einstein-v8-Llama3.2-1B/9b153ac9-f95b-419b-b7f9-beccd769ddad.json deleted file mode 100644 index 96e3825ec..000000000 --- a/data/hfopenllm_v2/Weyaxi/Einstein-v8-Llama3.2-1B/9b153ac9-f95b-419b-b7f9-beccd769ddad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Weyaxi_Einstein-v8-Llama3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Einstein-v8-Llama3.2-1B", - "id": "Weyaxi/Einstein-v8-Llama3.2-1B", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1862 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3018 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1161 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/SauerkrautLM-UNA-SOLAR-Instruct/8a5df3c2-eb71-4e12-b013-fb43685f2916.json b/data/hfopenllm_v2/Weyaxi/SauerkrautLM-UNA-SOLAR-Instruct/8a5df3c2-eb71-4e12-b013-fb43685f2916.json deleted file mode 100644 index e6c896811..000000000 --- a/data/hfopenllm_v2/Weyaxi/SauerkrautLM-UNA-SOLAR-Instruct/8a5df3c2-eb71-4e12-b013-fb43685f2916.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Weyaxi_SauerkrautLM-UNA-SOLAR-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-UNA-SOLAR-Instruct", - "id": "Weyaxi/SauerkrautLM-UNA-SOLAR-Instruct", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4573 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5166 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3153 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.0/35fa3213-5c08-4b19-ae76-237fdd25444e.json b/data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.0/35fa3213-5c08-4b19-ae76-237fdd25444e.json deleted file mode 100644 index ea8afc65e..000000000 --- a/data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.0/35fa3213-5c08-4b19-ae76-237fdd25444e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/WizardLMTeam_WizardLM-13B-V1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WizardLM-13B-V1.0", - "id": "WizardLMTeam/WizardLM-13B-V1.0", - "developer": "WizardLMTeam", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.185 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2913 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.2/242ce55f-1471-435e-bcd7-d28b5fc87fc4.json b/data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.2/242ce55f-1471-435e-bcd7-d28b5fc87fc4.json deleted file mode 100644 index 9777bbd90..000000000 --- a/data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.2/242ce55f-1471-435e-bcd7-d28b5fc87fc4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/WizardLMTeam_WizardLM-13B-V1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WizardLM-13B-V1.2", - "id": "WizardLMTeam/WizardLM-13B-V1.2", - "developer": "WizardLMTeam", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3392 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2519 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/WizardLMTeam/WizardLM-70B-V1.0/95f509f2-5e67-404a-968d-f7488d684e32.json b/data/hfopenllm_v2/WizardLMTeam/WizardLM-70B-V1.0/95f509f2-5e67-404a-968d-f7488d684e32.json deleted file mode 100644 index 8f43a920a..000000000 --- a/data/hfopenllm_v2/WizardLMTeam/WizardLM-70B-V1.0/95f509f2-5e67-404a-968d-f7488d684e32.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/WizardLMTeam_WizardLM-70B-V1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WizardLM-70B-V1.0", - "id": "WizardLMTeam/WizardLM-70B-V1.0", - "developer": "WizardLMTeam", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4951 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4391 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Wladastic/Mini-Think-Base-1B/bcbcdfe9-0663-417c-9a29-60906e63db8f.json b/data/hfopenllm_v2/Wladastic/Mini-Think-Base-1B/bcbcdfe9-0663-417c-9a29-60906e63db8f.json deleted file mode 100644 index ab878d450..000000000 --- a/data/hfopenllm_v2/Wladastic/Mini-Think-Base-1B/bcbcdfe9-0663-417c-9a29-60906e63db8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Wladastic_Mini-Think-Base-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mini-Think-Base-1B", - "id": "Wladastic/Mini-Think-Base-1B", - "developer": "Wladastic", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5588 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", 
- "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1772 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xclbr7/Arcanum-12b/d95a7493-2f99-4c10-8067-711c7388af7d.json b/data/hfopenllm_v2/Xclbr7/Arcanum-12b/d95a7493-2f99-4c10-8067-711c7388af7d.json deleted file mode 100644 index 7bafc055c..000000000 --- a/data/hfopenllm_v2/Xclbr7/Arcanum-12b/d95a7493-2f99-4c10-8067-711c7388af7d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xclbr7_Arcanum-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arcanum-12b", - "id": "Xclbr7/Arcanum-12b", - "developer": "Xclbr7", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2907 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.3586 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xclbr7/Hyena-12b/789848a0-6d8a-4583-93c3-a72df74d0071.json b/data/hfopenllm_v2/Xclbr7/Hyena-12b/789848a0-6d8a-4583-93c3-a72df74d0071.json deleted file mode 100644 index eff456eeb..000000000 --- a/data/hfopenllm_v2/Xclbr7/Hyena-12b/789848a0-6d8a-4583-93c3-a72df74d0071.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xclbr7_Hyena-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hyena-12b", - "id": "Xclbr7/Hyena-12b", - "developer": "Xclbr7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5457 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3984 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3439 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xclbr7/caliburn-12b/14af87df-0fc5-46e1-9d0b-c25c8b6a7ce7.json b/data/hfopenllm_v2/Xclbr7/caliburn-12b/14af87df-0fc5-46e1-9d0b-c25c8b6a7ce7.json deleted file mode 100644 index 48b753f2c..000000000 --- 
a/data/hfopenllm_v2/Xclbr7/caliburn-12b/14af87df-0fc5-46e1-9d0b-c25c8b6a7ce7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xclbr7_caliburn-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "caliburn-12b", - "id": "Xclbr7/caliburn-12b", - "developer": "Xclbr7", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3576 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4292 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3675 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xclbr7/caliburn-v2-12b/379f559f-9bfa-444f-b477-562c25b4c299.json b/data/hfopenllm_v2/Xclbr7/caliburn-v2-12b/379f559f-9bfa-444f-b477-562c25b4c299.json deleted file mode 100644 index 475721de8..000000000 --- a/data/hfopenllm_v2/Xclbr7/caliburn-v2-12b/379f559f-9bfa-444f-b477-562c25b4c299.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xclbr7_caliburn-v2-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "caliburn-v2-12b", - "id": "Xclbr7/caliburn-v2-12b", - "developer": "Xclbr7", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2967 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER-v0.2/effb6a3d-c98f-4c3a-be77-902c61cda21b.json b/data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER-v0.2/effb6a3d-c98f-4c3a-be77-902c61cda21b.json deleted file mode 100644 index b05dd8d93..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER-v0.2/effb6a3d-c98f-4c3a-be77-902c61cda21b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Llama3.2-1B-THREADRIPPER-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Llama3.2-1B-THREADRIPPER-v0.2", - "id": "Xiaojian9992024/Llama3.2-1B-THREADRIPPER-v0.2", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5318 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3528 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1745 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER/6c1c1405-afa4-412d-ba1f-49dc1cac4509.json b/data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER/6c1c1405-afa4-412d-ba1f-49dc1cac4509.json deleted file mode 100644 index cd6966cd3..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER/6c1c1405-afa4-412d-ba1f-49dc1cac4509.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Llama3.2-1B-THREADRIPPER/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2-1B-THREADRIPPER", - "id": "Xiaojian9992024/Llama3.2-1B-THREADRIPPER", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5576 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.313 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1763 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Phi-4-Megatron-Empathetic/6f4ed7c2-c775-4fd2-8600-4cea523f53e4.json b/data/hfopenllm_v2/Xiaojian9992024/Phi-4-Megatron-Empathetic/6f4ed7c2-c775-4fd2-8600-4cea523f53e4.json deleted file mode 100644 index b61bf3793..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Phi-4-Megatron-Empathetic/6f4ed7c2-c775-4fd2-8600-4cea523f53e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Phi-4-Megatron-Empathetic/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Megatron-Empathetic", - "id": "Xiaojian9992024/Phi-4-Megatron-Empathetic", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0173 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6673 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2696 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Phi-4-mini-UNOFFICAL/5fd5206b-186a-43b9-a4f4-07e75aa0293a.json b/data/hfopenllm_v2/Xiaojian9992024/Phi-4-mini-UNOFFICAL/5fd5206b-186a-43b9-a4f4-07e75aa0293a.json deleted file mode 100644 index 76e78b910..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Phi-4-mini-UNOFFICAL/5fd5206b-186a-43b9-a4f4-07e75aa0293a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Phi-4-mini-UNOFFICAL/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-mini-UNOFFICAL", - "id": "Xiaojian9992024/Phi-4-mini-UNOFFICAL", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.754 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2944 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-7B-MS-Destroyer/b707ecbf-0658-4226-803d-53456d16d54b.json b/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-7B-MS-Destroyer/b707ecbf-0658-4226-803d-53456d16d54b.json deleted file mode 100644 index ddb71d89b..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-7B-MS-Destroyer/b707ecbf-0658-4226-803d-53456d16d54b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Qwen2.5-7B-MS-Destroyer/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-MS-Destroyer", - "id": "Xiaojian9992024/Qwen2.5-7B-MS-Destroyer", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7296 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4592 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.427 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview-v0.2/dca1ee57-5e86-4532-a2f3-ac6a619ca576.json b/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview-v0.2/dca1ee57-5e86-4532-a2f3-ac6a619ca576.json deleted file mode 100644 index cb19a6f74..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview-v0.2/dca1ee57-5e86-4532-a2f3-ac6a619ca576.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Qwen2.5-Dyanka-7B-Preview-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Dyanka-7B-Preview-v0.2", - "id": "Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview-v0.2", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6702 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4467 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview/1233476a-7839-4a22-a7ca-1d0f237d8888.json b/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview/1233476a-7839-4a22-a7ca-1d0f237d8888.json deleted file mode 100644 index 61a8f0cb6..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview/1233476a-7839-4a22-a7ca-1d0f237d8888.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Qwen2.5-Dyanka-7B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Dyanka-7B-Preview", - "id": "Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.764 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.5543 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4879 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4481 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Medium-Censored/5c4bdeca-5ef8-4002-8f82-67d49b5ff722.json b/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Medium-Censored/5c4bdeca-5ef8-4002-8f82-67d49b5ff722.json deleted file mode 100644 index 6183b3899..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Medium-Censored/5c4bdeca-5ef8-4002-8f82-67d49b5ff722.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Qwen2.5-THREADRIPPER-Medium-Censored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-THREADRIPPER-Medium-Censored", - "id": "Xiaojian9992024/Qwen2.5-THREADRIPPER-Medium-Censored", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8112 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH 
Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4929 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small-AnniversaryEdition/18f5fd6c-2b79-4d48-b7e9-18845db16271.json b/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small-AnniversaryEdition/18f5fd6c-2b79-4d48-b7e9-18845db16271.json deleted file mode 100644 index 889a250b8..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small-AnniversaryEdition/18f5fd6c-2b79-4d48-b7e9-18845db16271.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Qwen2.5-THREADRIPPER-Small-AnniversaryEdition/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-THREADRIPPER-Small-AnniversaryEdition", - "id": "Xiaojian9992024/Qwen2.5-THREADRIPPER-Small-AnniversaryEdition", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small/a9039374-fa5a-4b8b-800f-5f4651cf812d.json b/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small/a9039374-fa5a-4b8b-800f-5f4651cf812d.json deleted file mode 100644 index 5724e01ef..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small/a9039374-fa5a-4b8b-800f-5f4651cf812d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Qwen2.5-THREADRIPPER-Small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-THREADRIPPER-Small", - "id": "Xiaojian9992024/Qwen2.5-THREADRIPPER-Small", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7689 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4349 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Ultra-1.5B-25.02-Exp/3f9704b4-bf25-40da-b6dc-b927c3569f40.json b/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Ultra-1.5B-25.02-Exp/3f9704b4-bf25-40da-b6dc-b927c3569f40.json deleted file mode 100644 index 35e891112..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Ultra-1.5B-25.02-Exp/3f9704b4-bf25-40da-b6dc-b927c3569f40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Qwen2.5-Ultra-1.5B-25.02-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Ultra-1.5B-25.02-Exp", - "id": "Xiaojian9992024/Qwen2.5-Ultra-1.5B-25.02-Exp", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { 
- "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2641 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Reflection-L3.2-JametMiniMix-3B/a8f858d8-a792-409f-b79d-948a19e2aa87.json b/data/hfopenllm_v2/Xiaojian9992024/Reflection-L3.2-JametMiniMix-3B/a8f858d8-a792-409f-b79d-948a19e2aa87.json deleted file mode 100644 index 163b699a4..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Reflection-L3.2-JametMiniMix-3B/a8f858d8-a792-409f-b79d-948a19e2aa87.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Reflection-L3.2-JametMiniMix-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflection-L3.2-JametMiniMix-3B", - "id": "Xiaojian9992024/Reflection-L3.2-JametMiniMix-3B", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4619 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3667 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2988 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xkev/Llama-3.2V-11B-cot/5c34a168-b8cf-436b-a3b7-a2d1feadffb9.json b/data/hfopenllm_v2/Xkev/Llama-3.2V-11B-cot/5c34a168-b8cf-436b-a3b7-a2d1feadffb9.json deleted file mode 100644 index 2b105fc64..000000000 --- a/data/hfopenllm_v2/Xkev/Llama-3.2V-11B-cot/5c34a168-b8cf-436b-a3b7-a2d1feadffb9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xkev_Llama-3.2V-11B-cot/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2V-11B-cot", - "id": "Xkev/Llama-3.2V-11B-cot", - "developer": "Xkev", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 10.67 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4959 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4159 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-1M-YOYO-V3/77092cfe-9820-45e8-94c5-31d27f1daa7c.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-1M-YOYO-V3/77092cfe-9820-45e8-94c5-31d27f1daa7c.json deleted file mode 100644 index 750d1c03c..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-1M-YOYO-V3/77092cfe-9820-45e8-94c5-31d27f1daa7c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-1M-YOYO-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-1M-YOYO-V3", - "id": "YOYO-AI/Qwen2.5-14B-1M-YOYO-V3", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8398 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.4141 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0505/cab8fed8-de68-4fa5-b4fc-d9483fc56571.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0505/cab8fed8-de68-4fa5-b4fc-d9483fc56571.json deleted file mode 100644 index 7845c81e9..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0505/cab8fed8-de68-4fa5-b4fc-d9483fc56571.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-0505/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-0505", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-0505", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0510-v2/a8103350-b208-4856-8e7b-8ea8918ba0d1.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0510-v2/a8103350-b208-4856-8e7b-8ea8918ba0d1.json deleted file mode 100644 index 7d6288961..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0510-v2/a8103350-b208-4856-8e7b-8ea8918ba0d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-0510-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-0510-v2", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-0510-v2", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5947 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4744 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of 
file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0805/e849c03c-c569-4059-8fc5-6a98cf391342.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0805/e849c03c-c569-4059-8fc5-6a98cf391342.json deleted file mode 100644 index 60cbcc72d..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0805/e849c03c-c569-4059-8fc5-6a98cf391342.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-0805/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-0805", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-0805", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005-v2/f1d8bffa-61fc-47d5-85cf-48cebcb31af5.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005-v2/f1d8bffa-61fc-47d5-85cf-48cebcb31af5.json deleted file mode 100644 index 
62d6d9013..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005-v2/f1d8bffa-61fc-47d5-85cf-48cebcb31af5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-1005-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-1005-v2", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-1005-v2", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5953 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6551 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4731 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005/97bdb352-2e9d-4cc5-8b70-55348ef3a217.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005/97bdb352-2e9d-4cc5-8b70-55348ef3a217.json deleted file mode 100644 index 0d6a7c3b6..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005/97bdb352-2e9d-4cc5-8b70-55348ef3a217.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-1005/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-1005", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-1005", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5972 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6542 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010-v2/78053a33-24c8-4e9f-8791-f127f21eec1c.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010-v2/78053a33-24c8-4e9f-8791-f127f21eec1c.json deleted file mode 100644 index ba66678f4..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010-v2/78053a33-24c8-4e9f-8791-f127f21eec1c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-1010-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-1010-v2", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-1010-v2", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5947 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4744 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/03082966-87ba-4560-a784-5d8677003500.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/03082966-87ba-4560-a784-5d8677003500.json deleted file mode 100644 index 0f3f65bb4..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/03082966-87ba-4560-a784-5d8677003500.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-1010/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-1010", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-1010", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": 
{ - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5899 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.654 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4744 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/97f26b20-db66-4a30-ba2a-c18a31081271.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/97f26b20-db66-4a30-ba2a-c18a31081271.json deleted file mode 100644 index 2b3d5ce24..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/97f26b20-db66-4a30-ba2a-c18a31081271.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-1010/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-1010", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-1010", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6406 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4181 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4944 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-SCE/85f9ccda-8c47-4fa1-9d47-e9da4730b077.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-SCE/85f9ccda-8c47-4fa1-9d47-e9da4730b077.json deleted file mode 100644 index b49788cb1..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-SCE/85f9ccda-8c47-4fa1-9d47-e9da4730b077.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-SCE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-SCE", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-SCE", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5844 - } - }, - { 
- "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4615 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4704 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p1/2a57d6f4-643b-4b30-8d67-03032d454887.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p1/2a57d6f4-643b-4b30-8d67-03032d454887.json deleted file mode 100644 index 84039bdda..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p1/2a57d6f4-643b-4b30-8d67-03032d454887.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-V4-p1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-V4-p1", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-V4-p1", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4194 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p2/d333f360-c1c3-4916-8480-4a1fc490875a.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p2/d333f360-c1c3-4916-8480-4a1fc490875a.json deleted file mode 100644 index 54ba3e37e..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p2/d333f360-c1c3-4916-8480-4a1fc490875a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-V4-p2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-V4-p2", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-V4-p2", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8048 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4/37a41261-a7b0-44b2-916f-770cdfa0ad39.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4/37a41261-a7b0-44b2-916f-770cdfa0ad39.json deleted file mode 100644 index 40718d9ee..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4/37a41261-a7b0-44b2-916f-770cdfa0ad39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-V4", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-V4", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8398 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4115 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest-V2/c46cd6cc-b56d-44c5-a03c-b49381ba3462.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest-V2/c46cd6cc-b56d-44c5-a03c-b49381ba3462.json deleted file mode 100644 index 285272c1d..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest-V2/c46cd6cc-b56d-44c5-a03c-b49381ba3462.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-latest-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-latest-V2", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-latest-V2", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7771 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6299 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4299 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest/612b6226-c25d-42e0-bcd7-be7faa844530.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest/612b6226-c25d-42e0-bcd7-be7faa844530.json deleted file mode 100644 index 045738a4d..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest/612b6226-c25d-42e0-bcd7-be7faa844530.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-latest/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-latest", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-latest", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5911 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6656 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4418 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4691 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-it-restore/2fc7a4d6-88e0-4f11-9110-dc53942870a4.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-it-restore/2fc7a4d6-88e0-4f11-9110-dc53942870a4.json deleted file mode 100644 index 69cc906a3..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-it-restore/2fc7a4d6-88e0-4f11-9110-dc53942870a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-it-restore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-it-restore", - "id": "YOYO-AI/Qwen2.5-14B-it-restore", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8209 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-7B-it-restore/34665752-58d8-48ee-81a6-f1a068c23026.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-7B-it-restore/34665752-58d8-48ee-81a6-f1a068c23026.json deleted file mode 100644 index 8d8fd9130..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-7B-it-restore/34665752-58d8-48ee-81a6-f1a068c23026.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-7B-it-restore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-it-restore", - "id": "YOYO-AI/Qwen2.5-7B-it-restore", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7531 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5407 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-Coder-14B-YOYO-1010/cc0767b5-4aaa-4418-8f68-72a721323e9c.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-Coder-14B-YOYO-1010/cc0767b5-4aaa-4418-8f68-72a721323e9c.json deleted file mode 100644 index 72afbd4ba..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-Coder-14B-YOYO-1010/cc0767b5-4aaa-4418-8f68-72a721323e9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-Coder-14B-YOYO-1010/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-14B-YOYO-1010", - "id": "YOYO-AI/Qwen2.5-Coder-14B-YOYO-1010", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6187 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4075 
- } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V2/ea507a41-1654-4515-94cc-ce2e38800c61.json b/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V2/ea507a41-1654-4515-94cc-ce2e38800c61.json deleted file mode 100644 index 6a8e45bb1..000000000 --- a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V2/ea507a41-1654-4515-94cc-ce2e38800c61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZYH-LLM-Qwen2.5-14B-V2", - "id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B-V2", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4689 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V3/c44e773f-4cca-4780-bdd4-f486e65c18e0.json b/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V3/c44e773f-4cca-4780-bdd4-f486e65c18e0.json 
deleted file mode 100644 index e0448d4a8..000000000 --- a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V3/c44e773f-4cca-4780-bdd4-f486e65c18e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZYH-LLM-Qwen2.5-14B-V3", - "id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B-V3", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8578 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6359 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4881 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V4/f8a46bda-d53b-484e-8832-7939f7d0762d.json b/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V4/f8a46bda-d53b-484e-8832-7939f7d0762d.json deleted file mode 100644 index 4f4bd31c8..000000000 --- a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V4/f8a46bda-d53b-484e-8832-7939f7d0762d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZYH-LLM-Qwen2.5-14B-V4", - "id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B-V4", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8365 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B/c3968a2d-4a9a-4f62-8bea-a3b4b6dcd378.json b/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B/c3968a2d-4a9a-4f62-8bea-a3b4b6dcd378.json deleted file mode 100644 index 7cecb6db8..000000000 --- a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B/c3968a2d-4a9a-4f62-8bea-a3b4b6dcd378.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_ZYH-LLM-Qwen2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZYH-LLM-Qwen2.5-14B", - "id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6644 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4116 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5351 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Yash21/TinyYi-7B-Test/da18242c-d6bb-4a0a-a2f9-2e42099f4e8a.json b/data/hfopenllm_v2/Yash21/TinyYi-7B-Test/da18242c-d6bb-4a0a-a2f9-2e42099f4e8a.json deleted file mode 100644 index f956d0abc..000000000 --- a/data/hfopenllm_v2/Yash21/TinyYi-7B-Test/da18242c-d6bb-4a0a-a2f9-2e42099f4e8a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Yash21_TinyYi-7B-Test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyYi-7B-Test", - "id": "Yash21/TinyYi-7B-Test", - "developer": "Yash21", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.291 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1091 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/1PARAMMYL-8B-ModelStock/ac078124-85d9-4715-bf7c-1428b1063732.json b/data/hfopenllm_v2/Youlln/1PARAMMYL-8B-ModelStock/ac078124-85d9-4715-bf7c-1428b1063732.json deleted file mode 100644 index 0dfc3106e..000000000 --- a/data/hfopenllm_v2/Youlln/1PARAMMYL-8B-ModelStock/ac078124-85d9-4715-bf7c-1428b1063732.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_1PARAMMYL-8B-ModelStock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "1PARAMMYL-8B-ModelStock", - "id": "Youlln/1PARAMMYL-8B-ModelStock", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5216 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1488 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/2PRYMMAL-Yi1.5-6B-SLERP/9c1dcd75-8491-4890-ac6f-000868099a3e.json b/data/hfopenllm_v2/Youlln/2PRYMMAL-Yi1.5-6B-SLERP/9c1dcd75-8491-4890-ac6f-000868099a3e.json deleted file mode 100644 index 9c066ab38..000000000 --- a/data/hfopenllm_v2/Youlln/2PRYMMAL-Yi1.5-6B-SLERP/9c1dcd75-8491-4890-ac6f-000868099a3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_2PRYMMAL-Yi1.5-6B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2PRYMMAL-Yi1.5-6B-SLERP", - "id": "Youlln/2PRYMMAL-Yi1.5-6B-SLERP", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2826 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4665 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4756 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.317 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/3PRYMMAL-PHI3-3B-SLERP/7850fc57-49c7-4124-b7c6-e1e7bb2bc726.json b/data/hfopenllm_v2/Youlln/3PRYMMAL-PHI3-3B-SLERP/7850fc57-49c7-4124-b7c6-e1e7bb2bc726.json deleted file mode 100644 index d99b33828..000000000 --- a/data/hfopenllm_v2/Youlln/3PRYMMAL-PHI3-3B-SLERP/7850fc57-49c7-4124-b7c6-e1e7bb2bc726.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_3PRYMMAL-PHI3-3B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "3PRYMMAL-PHI3-3B-SLERP", - "id": "Youlln/3PRYMMAL-PHI3-3B-SLERP", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1715 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4648 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4002 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/4PRYMMAL-GEMMA2-9B-SLERP/8f38374e-f373-4639-9278-24441ebd0325.json b/data/hfopenllm_v2/Youlln/4PRYMMAL-GEMMA2-9B-SLERP/8f38374e-f373-4639-9278-24441ebd0325.json deleted file mode 100644 index 676bdf378..000000000 --- a/data/hfopenllm_v2/Youlln/4PRYMMAL-GEMMA2-9B-SLERP/8f38374e-f373-4639-9278-24441ebd0325.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_4PRYMMAL-GEMMA2-9B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "4PRYMMAL-GEMMA2-9B-SLERP", - "id": "Youlln/4PRYMMAL-GEMMA2-9B-SLERP", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2714 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5923 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-12B/c007938e-3427-4896-8493-1500abdfbd2b.json b/data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-12B/c007938e-3427-4896-8493-1500abdfbd2b.json deleted file mode 100644 index 7f4f92c00..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-12B/c007938e-3427-4896-8493-1500abdfbd2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-MIRAGE-1-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-MIRAGE-1-12B", - "id": "Youlln/ECE-MIRAGE-1-12B", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 15.21 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3011 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3219 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-15B/df81dc0d-6c72-49e9-862b-02e9b6642cb6.json b/data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-15B/df81dc0d-6c72-49e9-862b-02e9b6642cb6.json deleted file mode 100644 index 331e6d3a6..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-15B/df81dc0d-6c72-49e9-862b-02e9b6642cb6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-MIRAGE-1-15B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-MIRAGE-1-15B", - "id": "Youlln/ECE-MIRAGE-1-15B", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 15.21 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3011 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3219 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3-MUSR/46c96d8e-568c-48f8-a74b-9dd4b4195037.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3-MUSR/46c96d8e-568c-48f8-a74b-9dd4b4195037.json deleted file mode 100644 index 89fe56fd5..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3-MUSR/46c96d8e-568c-48f8-a74b-9dd4b4195037.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL-0.5B-FT-V3-MUSR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V3-MUSR", - "id": "Youlln/ECE-PRYMMAL-0.5B-FT-V3-MUSR", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3041 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1645 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3/1f4f7181-8a81-49f4-9e81-925d5d69a37c.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3/1f4f7181-8a81-49f4-9e81-925d5d69a37c.json deleted file mode 100644 index 19716554b..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3/1f4f7181-8a81-49f4-9e81-925d5d69a37c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL-0.5B-FT-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V3", - "id": "Youlln/ECE-PRYMMAL-0.5B-FT-V3", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1642 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1161 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V4-MUSR/3ea343b6-93f6-4c61-a164-3db95d13cbdf.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V4-MUSR/3ea343b6-93f6-4c61-a164-3db95d13cbdf.json deleted file mode 100644 index 8d60c846e..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V4-MUSR/3ea343b6-93f6-4c61-a164-3db95d13cbdf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL-0.5B-FT-V4-MUSR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V4-MUSR", - "id": "Youlln/ECE-PRYMMAL-0.5B-FT-V4-MUSR", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1138 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3038 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1321 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V2/a9ea8bb5-05fc-4da3-8e00-f53ab8ea6af5.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V2/a9ea8bb5-05fc-4da3-8e00-f53ab8ea6af5.json deleted file mode 100644 index 76582f768..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V2/a9ea8bb5-05fc-4da3-8e00-f53ab8ea6af5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL-0.5B-SLERP-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-SLERP-V2", - "id": "Youlln/ECE-PRYMMAL-0.5B-SLERP-V2", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1612 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V3/0ea74ce5-43c9-43eb-92bc-3d928062d9e0.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V3/0ea74ce5-43c9-43eb-92bc-3d928062d9e0.json deleted file mode 100644 index cf85a25de..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V3/0ea74ce5-43c9-43eb-92bc-3d928062d9e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL-0.5B-SLERP-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-SLERP-V3", - "id": "Youlln/ECE-PRYMMAL-0.5B-SLERP-V3", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.167 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2938 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1087 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V1/6896faa7-7204-4091-8f4e-9cc0b53d673a.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V1/6896faa7-7204-4091-8f4e-9cc0b53d673a.json deleted file mode 100644 index 
af72a7152..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V1/6896faa7-7204-4091-8f4e-9cc0b53d673a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL-YL-1B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-1B-SLERP-V1", - "id": "Youlln/ECE-PRYMMAL-YL-1B-SLERP-V1", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3251 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V2/88064453-fd8c-4bd9-adf1-39f43972bec1.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V2/88064453-fd8c-4bd9-adf1-39f43972bec1.json deleted file mode 100644 index b5b27cb03..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V2/88064453-fd8c-4bd9-adf1-39f43972bec1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Youlln_ECE-PRYMMAL-YL-1B-SLERP-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-1B-SLERP-V2", - "id": "Youlln/ECE-PRYMMAL-YL-1B-SLERP-V2", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3251 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-7B-SLERP-V4/a18ade45-acba-4059-b969-445e529a82e2.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-7B-SLERP-V4/a18ade45-acba-4059-b969-445e529a82e2.json deleted file mode 100644 index b630ec36d..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-7B-SLERP-V4/a18ade45-acba-4059-b969-445e529a82e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL-YL-7B-SLERP-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-7B-SLERP-V4", - "id": "Youlln/ECE-PRYMMAL-YL-7B-SLERP-V4", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.251 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.377 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3745 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5-FT/6c0e4132-71e7-44af-95fc-83b0a6be2a82.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5-FT/6c0e4132-71e7-44af-95fc-83b0a6be2a82.json deleted file mode 100644 index 67a332158..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5-FT/6c0e4132-71e7-44af-95fc-83b0a6be2a82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL0.5-FT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL0.5-FT", - "id": "Youlln/ECE-PRYMMAL0.5-FT", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { 
- "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1851 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3132 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1477 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5B-Youri/5d9ab422-4f4f-460d-bd39-51266b43d7e5.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5B-Youri/5d9ab422-4f4f-460d-bd39-51266b43d7e5.json deleted file mode 100644 index 21bba30b8..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5B-Youri/5d9ab422-4f4f-460d-bd39-51266b43d7e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL0.5B-Youri/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL0.5B-Youri", - "id": "Youlln/ECE-PRYMMAL0.5B-Youri", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1446 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL1B-FT-V1/cda03c45-0782-40cc-a17d-67d808657b83.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL1B-FT-V1/cda03c45-0782-40cc-a17d-67d808657b83.json deleted file mode 100644 index 1633dc538..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL1B-FT-V1/cda03c45-0782-40cc-a17d-67d808657b83.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL1B-FT-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL1B-FT-V1", - "id": "Youlln/ECE-PRYMMAL1B-FT-V1", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2144 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-Qwen0.5B-FT-V2/50f5451b-41c4-4ba5-8bee-ee8a2deb7e79.json b/data/hfopenllm_v2/Youlln/ECE-Qwen0.5B-FT-V2/50f5451b-41c4-4ba5-8bee-ee8a2deb7e79.json deleted file mode 100644 index 01985d185..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-Qwen0.5B-FT-V2/50f5451b-41c4-4ba5-8bee-ee8a2deb7e79.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-Qwen0.5B-FT-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-Qwen0.5B-FT-V2", - "id": "Youlln/ECE-Qwen0.5B-FT-V2", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2526 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1666 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE.EIFFEIL.ia-0.5B-SLERP/cf758994-6e94-434d-bf68-74cca188b5e8.json b/data/hfopenllm_v2/Youlln/ECE.EIFFEIL.ia-0.5B-SLERP/cf758994-6e94-434d-bf68-74cca188b5e8.json deleted file mode 100644 index 69c6a2377..000000000 --- a/data/hfopenllm_v2/Youlln/ECE.EIFFEIL.ia-0.5B-SLERP/cf758994-6e94-434d-bf68-74cca188b5e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE.EIFFEIL.ia-0.5B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE.EIFFEIL.ia-0.5B-SLERP", - "id": "Youlln/ECE.EIFFEIL.ia-0.5B-SLERP", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3102 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1903 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YoungPanda/qwenqwen/611f9549-0788-44e9-8125-18df06cd80d6.json b/data/hfopenllm_v2/YoungPanda/qwenqwen/611f9549-0788-44e9-8125-18df06cd80d6.json deleted file mode 100644 index 92e685942..000000000 --- a/data/hfopenllm_v2/YoungPanda/qwenqwen/611f9549-0788-44e9-8125-18df06cd80d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YoungPanda_qwenqwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwenqwen", - "id": "YoungPanda/qwenqwen", - "developer": "YoungPanda", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 14.316 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1264 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3379 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Yuma42/KangalKhan-RawRuby-7B/59cf23ba-027d-4bac-a0e1-526376396b4d.json b/data/hfopenllm_v2/Yuma42/KangalKhan-RawRuby-7B/59cf23ba-027d-4bac-a0e1-526376396b4d.json deleted file mode 100644 index 2c57a54ac..000000000 --- a/data/hfopenllm_v2/Yuma42/KangalKhan-RawRuby-7B/59cf23ba-027d-4bac-a0e1-526376396b4d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Yuma42_KangalKhan-RawRuby-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KangalKhan-RawRuby-7B", - "id": "Yuma42/KangalKhan-RawRuby-7B", - "developer": "Yuma42", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4755 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3023 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Yuma42/Llama3.1-IgneousIguana-8B/1f02bbd3-ddaf-4db6-b7f8-31bad8ffac66.json b/data/hfopenllm_v2/Yuma42/Llama3.1-IgneousIguana-8B/1f02bbd3-ddaf-4db6-b7f8-31bad8ffac66.json deleted file mode 100644 index 168e27964..000000000 --- a/data/hfopenllm_v2/Yuma42/Llama3.1-IgneousIguana-8B/1f02bbd3-ddaf-4db6-b7f8-31bad8ffac66.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Yuma42_Llama3.1-IgneousIguana-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-IgneousIguana-8B", - "id": "Yuma42/Llama3.1-IgneousIguana-8B", - "developer": "Yuma42", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8133 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3974 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Yuma42/Llama3.1-SuperHawk-8B/1e737e28-d926-43e8-9e4c-e39fa91d7977.json b/data/hfopenllm_v2/Yuma42/Llama3.1-SuperHawk-8B/1e737e28-d926-43e8-9e4c-e39fa91d7977.json deleted file mode 100644 index 75ea95504..000000000 --- a/data/hfopenllm_v2/Yuma42/Llama3.1-SuperHawk-8B/1e737e28-d926-43e8-9e4c-e39fa91d7977.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Yuma42_Llama3.1-SuperHawk-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-SuperHawk-8B", - "id": "Yuma42/Llama3.1-SuperHawk-8B", - "developer": "Yuma42", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3945 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Z1-Coder/Z1-Coder-7B/43ef8eee-5d8a-47e7-ac71-1a898421370a.json b/data/hfopenllm_v2/Z1-Coder/Z1-Coder-7B/43ef8eee-5d8a-47e7-ac71-1a898421370a.json deleted file mode 100644 index 49916e11f..000000000 --- a/data/hfopenllm_v2/Z1-Coder/Z1-Coder-7B/43ef8eee-5d8a-47e7-ac71-1a898421370a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Z1-Coder_Z1-Coder-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Z1-Coder-7B", - "id": "Z1-Coder/Z1-Coder-7B", - "developer": "Z1-Coder", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3215 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4842 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3622 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.3759 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-dpo-avg/d8d03c71-942f-4aff-8a5e-5c265c639b44.json b/data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-dpo-avg/d8d03c71-942f-4aff-8a5e-5c265c639b44.json deleted file mode 100644 index 8775cade5..000000000 --- a/data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-dpo-avg/d8d03c71-942f-4aff-8a5e-5c265c639b44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZHLiu627_zephyr-7b-gemma-dpo-avg/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-7b-gemma-dpo-avg", - "id": "ZHLiu627/zephyr-7b-gemma-dpo-avg", - "developer": "ZHLiu627", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.309 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4149 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4107 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2851 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-rpo-avg/96262938-1146-4993-92a1-a2ddb2519f8a.json 
b/data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-rpo-avg/96262938-1146-4993-92a1-a2ddb2519f8a.json deleted file mode 100644 index 399559c0f..000000000 --- a/data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-rpo-avg/96262938-1146-4993-92a1-a2ddb2519f8a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZHLiu627_zephyr-7b-gemma-rpo-avg/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-7b-gemma-rpo-avg", - "id": "ZHLiu627/zephyr-7b-gemma-rpo-avg", - "developer": "ZHLiu627", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3006 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4183 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/L3-Aspire-Heart-Matrix-8B/292d7cfb-3e3c-47d8-8cca-33507f9ff081.json b/data/hfopenllm_v2/ZeroXClem/L3-Aspire-Heart-Matrix-8B/292d7cfb-3e3c-47d8-8cca-33507f9ff081.json deleted file mode 100644 index 8a789283c..000000000 --- 
a/data/hfopenllm_v2/ZeroXClem/L3-Aspire-Heart-Matrix-8B/292d7cfb-3e3c-47d8-8cca-33507f9ff081.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_L3-Aspire-Heart-Matrix-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Aspire-Heart-Matrix-8B", - "id": "ZeroXClem/L3-Aspire-Heart-Matrix-8B", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3785 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-AthenaSky-MegaMix/3f29c10f-57ef-435b-85df-2cae30ae72fa.json b/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-AthenaSky-MegaMix/3f29c10f-57ef-435b-85df-2cae30ae72fa.json deleted file mode 100644 index a87472533..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-AthenaSky-MegaMix/3f29c10f-57ef-435b-85df-2cae30ae72fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/ZeroXClem_Llama-3.1-8B-AthenaSky-MegaMix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-AthenaSky-MegaMix", - "id": "ZeroXClem/Llama-3.1-8B-AthenaSky-MegaMix", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6301 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5163 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2795 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3504 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-RainbowLight-EtherealMix/d7f022fe-86cb-4e4e-a672-62c2dc8cffd3.json b/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-RainbowLight-EtherealMix/d7f022fe-86cb-4e4e-a672-62c2dc8cffd3.json deleted file mode 100644 index eb07cdad6..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-RainbowLight-EtherealMix/d7f022fe-86cb-4e4e-a672-62c2dc8cffd3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Llama-3.1-8B-RainbowLight-EtherealMix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-RainbowLight-EtherealMix", - "id": "ZeroXClem/Llama-3.1-8B-RainbowLight-EtherealMix", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4973 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5155 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SpecialTitanFusion/baa35c90-c494-4dff-af28-cb549e40bed8.json b/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SpecialTitanFusion/baa35c90-c494-4dff-af28-cb549e40bed8.json deleted file mode 100644 index 4a6a4d956..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SpecialTitanFusion/baa35c90-c494-4dff-af28-cb549e40bed8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Llama-3.1-8B-SpecialTitanFusion/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Llama-3.1-8B-SpecialTitanFusion", - "id": "ZeroXClem/Llama-3.1-8B-SpecialTitanFusion", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7402 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5439 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2334 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperNova-EtherealHermes/2fdc3186-6791-4550-ac4f-a1a5a5a1d514.json b/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperNova-EtherealHermes/2fdc3186-6791-4550-ac4f-a1a5a5a1d514.json deleted file mode 100644 index 554e92142..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperNova-EtherealHermes/2fdc3186-6791-4550-ac4f-a1a5a5a1d514.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Llama-3.1-8B-SuperNova-EtherealHermes/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-SuperNova-EtherealHermes", - "id": "ZeroXClem/Llama-3.1-8B-SuperNova-EtherealHermes", - "developer": 
"ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7339 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5244 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1745 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4066 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3745 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperTulu-LexiNova/f687df8b-42b5-4d94-b741-1b516d9221b2.json b/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperTulu-LexiNova/f687df8b-42b5-4d94-b741-1b516d9221b2.json deleted file mode 100644 index 561402998..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperTulu-LexiNova/f687df8b-42b5-4d94-b741-1b516d9221b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Llama-3.1-8B-SuperTulu-LexiNova/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-SuperTulu-LexiNova", - "id": "ZeroXClem/Llama-3.1-8B-SuperTulu-LexiNova", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4165 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5079 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3971 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Qwen-2.5-Aether-SlerpFusion-7B/c3a8a952-6869-4eee-a59f-4ae33ac72986.json b/data/hfopenllm_v2/ZeroXClem/Qwen-2.5-Aether-SlerpFusion-7B/c3a8a952-6869-4eee-a59f-4ae33ac72986.json deleted file mode 100644 index 352964db5..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Qwen-2.5-Aether-SlerpFusion-7B/c3a8a952-6869-4eee-a59f-4ae33ac72986.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Qwen-2.5-Aether-SlerpFusion-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-Aether-SlerpFusion-7B", - "id": "ZeroXClem/Qwen-2.5-Aether-SlerpFusion-7B", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6262 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2734 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4327 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-CelestialHarmony-1M/a7a74117-71e4-49b2-bd65-add82c9165d8.json b/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-CelestialHarmony-1M/a7a74117-71e4-49b2-bd65-add82c9165d8.json deleted file mode 100644 index 1324b28ff..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-CelestialHarmony-1M/a7a74117-71e4-49b2-bd65-add82c9165d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Qwen2.5-7B-CelestialHarmony-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-CelestialHarmony-1M", - "id": "ZeroXClem/Qwen2.5-7B-CelestialHarmony-1M", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5944 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3474 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix/04ee694c-0c89-4f25-b10f-315a24743ba2.json b/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix/04ee694c-0c89-4f25-b10f-315a24743ba2.json deleted file mode 100644 index 444e35d1b..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix/04ee694c-0c89-4f25-b10f-315a24743ba2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Qwen2.5-7B-HomerAnvita-NerdMix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-HomerAnvita-NerdMix", - "id": "ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7708 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5541 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3837 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4391 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerCreative-Mix/47fd4acb-acc3-4f12-8af5-c425d3754c38.json b/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerCreative-Mix/47fd4acb-acc3-4f12-8af5-c425d3754c38.json deleted file mode 100644 index 3f9b0472a..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerCreative-Mix/47fd4acb-acc3-4f12-8af5-c425d3754c38.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Qwen2.5-7B-HomerCreative-Mix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-HomerCreative-Mix", - "id": "ZeroXClem/Qwen2.5-7B-HomerCreative-Mix", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7835 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.5548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4447 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-Qandora-CySec/e19577f5-d1ba-45ad-8500-d18ae2b14440.json b/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-Qandora-CySec/e19577f5-d1ba-45ad-8500-d18ae2b14440.json deleted file mode 100644 index b81b82415..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-Qandora-CySec/e19577f5-d1ba-45ad-8500-d18ae2b14440.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Qwen2.5-7B-Qandora-CySec/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Qandora-CySec", - "id": "ZeroXClem/Qwen2.5-7B-Qandora-CySec", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6773 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2931 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4286 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4485 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeusLabs/L3-Aethora-15B-V2/e86443cd-453b-4ca0-8e7e-054764fe4bb9.json b/data/hfopenllm_v2/ZeusLabs/L3-Aethora-15B-V2/e86443cd-453b-4ca0-8e7e-054764fe4bb9.json deleted file mode 100644 index 575ec1a14..000000000 --- a/data/hfopenllm_v2/ZeusLabs/L3-Aethora-15B-V2/e86443cd-453b-4ca0-8e7e-054764fe4bb9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeusLabs_L3-Aethora-15B-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Aethora-15B-V2", - "id": "ZeusLabs/L3-Aethora-15B-V2", - "developer": "ZeusLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 15.01 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5011 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - 
} - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3/24cd9977-f3fb-4619-aea1-59e1a36b2a5e.json b/data/hfopenllm_v2/ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3/24cd9977-f3fb-4619-aea1-59e1a36b2a5e.json deleted file mode 100644 index 63d3d967e..000000000 --- a/data/hfopenllm_v2/ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3/24cd9977-f3fb-4619-aea1-59e1a36b2a5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZhangShenao_SELM-Llama-3-8B-Instruct-iter-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SELM-Llama-3-8B-Instruct-iter-3", - "id": "ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3", - "developer": "ZhangShenao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6903 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5046 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3845 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3783 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/aaditya/Llama3-OpenBioLLM-70B/1401f0d9-6f4c-41d2-819f-eb9487c5c1e6.json b/data/hfopenllm_v2/aaditya/Llama3-OpenBioLLM-70B/1401f0d9-6f4c-41d2-819f-eb9487c5c1e6.json deleted file mode 100644 index 82061858d..000000000 --- a/data/hfopenllm_v2/aaditya/Llama3-OpenBioLLM-70B/1401f0d9-6f4c-41d2-819f-eb9487c5c1e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aaditya_Llama3-OpenBioLLM-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-OpenBioLLM-70B", - "id": "aaditya/Llama3-OpenBioLLM-70B", - "developer": "aaditya", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7597 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1971 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", 
- "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4867 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/Dracarys-72B-Instruct/4b1f2aab-ef92-4231-9bdd-96918b26914c.json b/data/hfopenllm_v2/abacusai/Dracarys-72B-Instruct/4b1f2aab-ef92-4231-9bdd-96918b26914c.json deleted file mode 100644 index 46a1a26ce..000000000 --- a/data/hfopenllm_v2/abacusai/Dracarys-72B-Instruct/4b1f2aab-ef92-4231-9bdd-96918b26914c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_Dracarys-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dracarys-72B-Instruct", - "id": "abacusai/Dracarys-72B-Instruct", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6944 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4558 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5456 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/Liberated-Qwen1.5-14B/4956e127-14a1-405e-a0e0-76fe94ea727b.json b/data/hfopenllm_v2/abacusai/Liberated-Qwen1.5-14B/4956e127-14a1-405e-a0e0-76fe94ea727b.json deleted file mode 100644 index afc41b73e..000000000 --- a/data/hfopenllm_v2/abacusai/Liberated-Qwen1.5-14B/4956e127-14a1-405e-a0e0-76fe94ea727b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_Liberated-Qwen1.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Liberated-Qwen1.5-14B", - "id": "abacusai/Liberated-Qwen1.5-14B", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4948 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4175 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3512 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/Llama-3-Smaug-8B/90fb6e40-88f7-4ce2-ae99-308d87e69718.json b/data/hfopenllm_v2/abacusai/Llama-3-Smaug-8B/90fb6e40-88f7-4ce2-ae99-308d87e69718.json deleted file mode 100644 index aee6436ea..000000000 --- a/data/hfopenllm_v2/abacusai/Llama-3-Smaug-8B/90fb6e40-88f7-4ce2-ae99-308d87e69718.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_Llama-3-Smaug-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Smaug-8B", - "id": "abacusai/Llama-3-Smaug-8B", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4867 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4931 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3622 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3185 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/abacusai/Smaug-34B-v0.1/cdad0f08-1c60-4493-bed0-9733894b367a.json b/data/hfopenllm_v2/abacusai/Smaug-34B-v0.1/cdad0f08-1c60-4493-bed0-9733894b367a.json deleted file mode 100644 index a0a920244..000000000 --- a/data/hfopenllm_v2/abacusai/Smaug-34B-v0.1/cdad0f08-1c60-4493-bed0-9733894b367a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_Smaug-34B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Smaug-34B-v0.1", - "id": "abacusai/Smaug-34B-v0.1", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5016 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5358 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/Smaug-72B-v0.1/8e83b4f7-736f-4e03-8256-2a1fc421b04f.json b/data/hfopenllm_v2/abacusai/Smaug-72B-v0.1/8e83b4f7-736f-4e03-8256-2a1fc421b04f.json deleted file mode 100644 index d0562870a..000000000 --- 
a/data/hfopenllm_v2/abacusai/Smaug-72B-v0.1/8e83b4f7-736f-4e03-8256-2a1fc421b04f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_Smaug-72B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Smaug-72B-v0.1", - "id": "abacusai/Smaug-72B-v0.1", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 72.289 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5167 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5996 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1911 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/Smaug-Llama-3-70B-Instruct-32K/f0d6639d-8485-4bcd-b069-046a747dfbfa.json b/data/hfopenllm_v2/abacusai/Smaug-Llama-3-70B-Instruct-32K/f0d6639d-8485-4bcd-b069-046a747dfbfa.json deleted file mode 100644 index cb2b7cd68..000000000 --- a/data/hfopenllm_v2/abacusai/Smaug-Llama-3-70B-Instruct-32K/f0d6639d-8485-4bcd-b069-046a747dfbfa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/abacusai_Smaug-Llama-3-70B-Instruct-32K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Smaug-Llama-3-70B-Instruct-32K", - "id": "abacusai/Smaug-Llama-3-70B-Instruct-32K", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7761 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4765 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/Smaug-Mixtral-v0.1/d1fe36ba-04f8-4110-8c39-81d393c4cbfc.json b/data/hfopenllm_v2/abacusai/Smaug-Mixtral-v0.1/d1fe36ba-04f8-4110-8c39-81d393c4cbfc.json deleted file mode 100644 index e773b34ab..000000000 --- a/data/hfopenllm_v2/abacusai/Smaug-Mixtral-v0.1/d1fe36ba-04f8-4110-8c39-81d393c4cbfc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_Smaug-Mixtral-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Smaug-Mixtral-v0.1", - "id": "abacusai/Smaug-Mixtral-v0.1", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5554 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5162 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0952 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/Smaug-Qwen2-72B-Instruct/5a8ab5fb-ec1e-490c-b643-e3b9d49f5d34.json b/data/hfopenllm_v2/abacusai/Smaug-Qwen2-72B-Instruct/5a8ab5fb-ec1e-490c-b643-e3b9d49f5d34.json deleted file mode 100644 index 064650e3a..000000000 --- a/data/hfopenllm_v2/abacusai/Smaug-Qwen2-72B-Instruct/5a8ab5fb-ec1e-490c-b643-e3b9d49f5d34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_Smaug-Qwen2-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Smaug-Qwen2-72B-Instruct", - "id": "abacusai/Smaug-Qwen2-72B-Instruct", - "developer": "abacusai", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/bigstral-12b-32k/de944f89-d2d4-4b01-b4b5-e7cbd1d8d1ae.json b/data/hfopenllm_v2/abacusai/bigstral-12b-32k/de944f89-d2d4-4b01-b4b5-e7cbd1d8d1ae.json deleted file mode 100644 index 0c72ea1cb..000000000 --- a/data/hfopenllm_v2/abacusai/bigstral-12b-32k/de944f89-d2d4-4b01-b4b5-e7cbd1d8d1ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_bigstral-12b-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bigstral-12b-32k", - "id": "abacusai/bigstral-12b-32k", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.476 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4194 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2641 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/bigyi-15b/db96601a-2f7f-438f-915b-55fee0e0d1d1.json b/data/hfopenllm_v2/abacusai/bigyi-15b/db96601a-2f7f-438f-915b-55fee0e0d1d1.json deleted file mode 100644 index 78bf1bb7e..000000000 --- a/data/hfopenllm_v2/abacusai/bigyi-15b/db96601a-2f7f-438f-915b-55fee0e0d1d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_bigyi-15b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bigyi-15b", - "id": "abacusai/bigyi-15b", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 15.058 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2094 - } - }, - { - "evaluation_name": "BBH", 
- "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4345 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/27912f7d-7033-4b7c-b93a-af1673ce4a9b.json b/data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/27912f7d-7033-4b7c-b93a-af1673ce4a9b.json deleted file mode 100644 index 7ebb312b2..000000000 --- a/data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/27912f7d-7033-4b7c-b93a-af1673ce4a9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abhishek_autotrain-0tmgq-5tpbg/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "autotrain-0tmgq-5tpbg", - "id": "abhishek/autotrain-0tmgq-5tpbg", - "developer": "abhishek", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1957 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3135 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1151 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/da58a484-4a45-4a70-a651-031ada8023d5.json b/data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/da58a484-4a45-4a70-a651-031ada8023d5.json deleted file mode 100644 index f682d2d9d..000000000 --- a/data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/da58a484-4a45-4a70-a651-031ada8023d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abhishek_autotrain-0tmgq-5tpbg/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "autotrain-0tmgq-5tpbg", - "id": "abhishek/autotrain-0tmgq-5tpbg", - "developer": "abhishek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1952 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3584 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v1/e8bd221d-8a89-4e3c-8815-0bff27574053.json b/data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v1/e8bd221d-8a89-4e3c-8815-0bff27574053.json deleted file mode 100644 index 3051b9372..000000000 --- a/data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v1/e8bd221d-8a89-4e3c-8815-0bff27574053.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abhishek_autotrain-llama3-70b-orpo-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "autotrain-llama3-70b-orpo-v1", - "id": "abhishek/autotrain-llama3-70b-orpo-v1", - "developer": "abhishek", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5998 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v2/ffc21c2a-59fb-4ad8-88a4-930879b6eba0.json b/data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v2/ffc21c2a-59fb-4ad8-88a4-930879b6eba0.json deleted file mode 100644 index cfef2a069..000000000 --- a/data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v2/ffc21c2a-59fb-4ad8-88a4-930879b6eba0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abhishek_autotrain-llama3-70b-orpo-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "autotrain-llama3-70b-orpo-v2", - "id": "abhishek/autotrain-llama3-70b-orpo-v2", - "developer": "abhishek", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5899 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4113 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4818 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abhishek/autotrain-llama3-orpo-v2/1e506afa-0d08-45d6-9242-b06104aa67e8.json b/data/hfopenllm_v2/abhishek/autotrain-llama3-orpo-v2/1e506afa-0d08-45d6-9242-b06104aa67e8.json deleted file mode 100644 index 6b46e2ceb..000000000 --- a/data/hfopenllm_v2/abhishek/autotrain-llama3-orpo-v2/1e506afa-0d08-45d6-9242-b06104aa67e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abhishek_autotrain-llama3-orpo-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "autotrain-llama3-orpo-v2", - "id": "abhishek/autotrain-llama3-orpo-v2", - "developer": "abhishek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3159 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2218 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abhishek/autotrain-vr4a1-e5mms/7d66bb93-cb2f-4be6-b133-1f0325be58e1.json b/data/hfopenllm_v2/abhishek/autotrain-vr4a1-e5mms/7d66bb93-cb2f-4be6-b133-1f0325be58e1.json deleted file mode 100644 index c2368e06c..000000000 --- a/data/hfopenllm_v2/abhishek/autotrain-vr4a1-e5mms/7d66bb93-cb2f-4be6-b133-1f0325be58e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abhishek_autotrain-vr4a1-e5mms/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "autotrain-vr4a1-e5mms", - "id": "abhishek/autotrain-vr4a1-e5mms", - "developer": "abhishek", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 16.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2142 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5001 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3891 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3667 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abideen/MedPhi-4-14B-v1/936f3c5f-7817-4118-96c8-e4061d4560fb.json b/data/hfopenllm_v2/abideen/MedPhi-4-14B-v1/936f3c5f-7817-4118-96c8-e4061d4560fb.json deleted file mode 100644 index 73811a080..000000000 --- a/data/hfopenllm_v2/abideen/MedPhi-4-14B-v1/936f3c5f-7817-4118-96c8-e4061d4560fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abideen_MedPhi-4-14B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MedPhi-4-14B-v1", - "id": "abideen/MedPhi-4-14B-v1", - "developer": "abideen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6897 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2931 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5338 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/adamo1139/Yi-34B-200K-AEZAKMI-v2/7d36ceed-2a1b-4b20-88ae-0a609cc161e9.json b/data/hfopenllm_v2/adamo1139/Yi-34B-200K-AEZAKMI-v2/7d36ceed-2a1b-4b20-88ae-0a609cc161e9.json deleted file mode 100644 index a4e2aa16d..000000000 --- a/data/hfopenllm_v2/adamo1139/Yi-34B-200K-AEZAKMI-v2/7d36ceed-2a1b-4b20-88ae-0a609cc161e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/adamo1139_Yi-34B-200K-AEZAKMI-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-34B-200K-AEZAKMI-v2", - "id": "adamo1139/Yi-34B-200K-AEZAKMI-v2", - "developer": "adamo1139", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4513 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/77cace56-503f-4531-a4eb-0178a68cc283.json b/data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/77cace56-503f-4531-a4eb-0178a68cc283.json deleted file mode 100644 index e2903971c..000000000 --- a/data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/77cace56-503f-4531-a4eb-0178a68cc283.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/adriszmar_QAIMath-Qwen2.5-7B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QAIMath-Qwen2.5-7B-TIES", - "id": "adriszmar/QAIMath-Qwen2.5-7B-TIES", - "developer": "adriszmar", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3963 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1066 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/9e49b710-2413-42f3-8943-bc9dbf68cb3c.json 
b/data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/9e49b710-2413-42f3-8943-bc9dbf68cb3c.json deleted file mode 100644 index dbd70e39f..000000000 --- a/data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/9e49b710-2413-42f3-8943-bc9dbf68cb3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/adriszmar_QAIMath-Qwen2.5-7B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QAIMath-Qwen2.5-7B-TIES", - "id": "adriszmar/QAIMath-Qwen2.5-7B-TIES", - "developer": "adriszmar", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1746 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1087 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/aevalone/distill_qw_test/9a5b3564-97df-4661-a171-37322386ac4d.json b/data/hfopenllm_v2/aevalone/distill_qw_test/9a5b3564-97df-4661-a171-37322386ac4d.json deleted file mode 100644 index 4e684cee1..000000000 --- a/data/hfopenllm_v2/aevalone/distill_qw_test/9a5b3564-97df-4661-a171-37322386ac4d.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aevalone_distill_qw_test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "distill_qw_test", - "id": "aevalone/distill_qw_test", - "developer": "aevalone", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7409 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5246 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Gemma2-9B-AdvancedFuse/0fc0450d-cdf1-44b5-a809-202d1dd6b5e3.json b/data/hfopenllm_v2/agentlans/Gemma2-9B-AdvancedFuse/0fc0450d-cdf1-44b5-a809-202d1dd6b5e3.json deleted file mode 100644 index 65f6c605e..000000000 --- a/data/hfopenllm_v2/agentlans/Gemma2-9B-AdvancedFuse/0fc0450d-cdf1-44b5-a809-202d1dd6b5e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Gemma2-9B-AdvancedFuse/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2-9B-AdvancedFuse", - "id": "agentlans/Gemma2-9B-AdvancedFuse", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1543 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5859 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1005 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Llama-3.2-1B-Instruct-CrashCourse12K/7f06c78c-f95e-4e50-aa57-da0579adcdae.json b/data/hfopenllm_v2/agentlans/Llama-3.2-1B-Instruct-CrashCourse12K/7f06c78c-f95e-4e50-aa57-da0579adcdae.json deleted file mode 100644 index 9800d3c2e..000000000 --- a/data/hfopenllm_v2/agentlans/Llama-3.2-1B-Instruct-CrashCourse12K/7f06c78c-f95e-4e50-aa57-da0579adcdae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Llama-3.2-1B-Instruct-CrashCourse12K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Llama-3.2-1B-Instruct-CrashCourse12K", - "id": "agentlans/Llama-3.2-1B-Instruct-CrashCourse12K", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Llama3.1-8B-drill/06e55e47-9995-4fa2-877a-c728e9f9f1a1.json b/data/hfopenllm_v2/agentlans/Llama3.1-8B-drill/06e55e47-9995-4fa2-877a-c728e9f9f1a1.json deleted file mode 100644 index 3dd94e869..000000000 --- a/data/hfopenllm_v2/agentlans/Llama3.1-8B-drill/06e55e47-9995-4fa2-877a-c728e9f9f1a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Llama3.1-8B-drill/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-drill", - "id": "agentlans/Llama3.1-8B-drill", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5016 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1715 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish-Instruct/39af1e0a-d1e3-4372-bc18-d07f3dff09f0.json b/data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish-Instruct/39af1e0a-d1e3-4372-bc18-d07f3dff09f0.json deleted file mode 100644 index 8cbe2b1db..000000000 --- a/data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish-Instruct/39af1e0a-d1e3-4372-bc18-d07f3dff09f0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Llama3.1-Daredevilish-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-Daredevilish-Instruct", - "id": "agentlans/Llama3.1-Daredevilish-Instruct", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7926 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1722 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3877 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish/f32d59d6-8ab9-4b7d-ad9d-f62ce6d559bd.json b/data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish/f32d59d6-8ab9-4b7d-ad9d-f62ce6d559bd.json deleted file mode 100644 index 8fc5cbbdf..000000000 --- a/data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish/f32d59d6-8ab9-4b7d-ad9d-f62ce6d559bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Llama3.1-Daredevilish/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-Daredevilish", - "id": "agentlans/Llama3.1-Daredevilish", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.6292 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5013 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Llama3.1-LexiHermes-SuperStorm/7ddc3aef-c6c5-4d04-8473-3b3bba219d7f.json b/data/hfopenllm_v2/agentlans/Llama3.1-LexiHermes-SuperStorm/7ddc3aef-c6c5-4d04-8473-3b3bba219d7f.json deleted file mode 100644 index 1c22faf44..000000000 --- a/data/hfopenllm_v2/agentlans/Llama3.1-LexiHermes-SuperStorm/7ddc3aef-c6c5-4d04-8473-3b3bba219d7f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Llama3.1-LexiHermes-SuperStorm/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-LexiHermes-SuperStorm", - "id": "agentlans/Llama3.1-LexiHermes-SuperStorm", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7835 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3963 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse-CrashCourse12K/ce80ac07-22d2-4883-ac6c-40b080e00b81.json b/data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse-CrashCourse12K/ce80ac07-22d2-4883-ac6c-40b080e00b81.json deleted file mode 100644 index 806df32bb..000000000 --- a/data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse-CrashCourse12K/ce80ac07-22d2-4883-ac6c-40b080e00b81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Llama3.1-SuperDeepFuse-CrashCourse12K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-SuperDeepFuse-CrashCourse12K", - "id": "agentlans/Llama3.1-SuperDeepFuse-CrashCourse12K", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7187 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.5216 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse/cbece170-f872-485f-a6c2-5db17ced73bc.json b/data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse/cbece170-f872-485f-a6c2-5db17ced73bc.json deleted file mode 100644 index 0f8c139a6..000000000 --- a/data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse/cbece170-f872-485f-a6c2-5db17ced73bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Llama3.1-SuperDeepFuse/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-SuperDeepFuse", - "id": "agentlans/Llama3.1-SuperDeepFuse", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5049 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - 
}, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Qwen2.5-0.5B-Instruct-CrashCourse-dropout/c1fd751b-c6c3-4350-9618-f4b4840e1b69.json b/data/hfopenllm_v2/agentlans/Qwen2.5-0.5B-Instruct-CrashCourse-dropout/c1fd751b-c6c3-4350-9618-f4b4840e1b69.json deleted file mode 100644 index 8da33be6f..000000000 --- a/data/hfopenllm_v2/agentlans/Qwen2.5-0.5B-Instruct-CrashCourse-dropout/c1fd751b-c6c3-4350-9618-f4b4840e1b69.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Qwen2.5-0.5B-Instruct-CrashCourse-dropout/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-Instruct-CrashCourse-dropout", - "id": "agentlans/Qwen2.5-0.5B-Instruct-CrashCourse-dropout", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1608 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ahmeda335/13_outOf_32_pruned_layers_llama3.1-8b/bfd28b91-3a72-4417-b52b-804d2cbae12f.json b/data/hfopenllm_v2/ahmeda335/13_outOf_32_pruned_layers_llama3.1-8b/bfd28b91-3a72-4417-b52b-804d2cbae12f.json deleted file mode 100644 index d883b1cb1..000000000 --- a/data/hfopenllm_v2/ahmeda335/13_outOf_32_pruned_layers_llama3.1-8b/bfd28b91-3a72-4417-b52b-804d2cbae12f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ahmeda335_13_outOf_32_pruned_layers_llama3.1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "13_outOf_32_pruned_layers_llama3.1-8b", - "id": "ahmeda335/13_outOf_32_pruned_layers_llama3.1-8b", - "developer": "ahmeda335", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 5.195 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1748 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2883 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3803 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ai21labs/Jamba-v0.1/32c26cbc-3697-47a6-bd12-18187df9dda9.json b/data/hfopenllm_v2/ai21labs/Jamba-v0.1/32c26cbc-3697-47a6-bd12-18187df9dda9.json deleted file mode 100644 index 61193fc54..000000000 --- a/data/hfopenllm_v2/ai21labs/Jamba-v0.1/32c26cbc-3697-47a6-bd12-18187df9dda9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ai21labs_Jamba-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jamba-v0.1", - "id": "ai21labs/Jamba-v0.1", - "developer": "ai21labs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "JambaForCausalLM", - "params_billions": 51.57 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2026 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3602 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.359 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ai4bharat/Airavata/02280b9f-bc01-4e44-9d09-1e4ae8c0438b.json b/data/hfopenllm_v2/ai4bharat/Airavata/02280b9f-bc01-4e44-9d09-1e4ae8c0438b.json deleted file mode 100644 index 0badacdc8..000000000 --- a/data/hfopenllm_v2/ai4bharat/Airavata/02280b9f-bc01-4e44-9d09-1e4ae8c0438b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ai4bharat_Airavata/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Airavata", - "id": "ai4bharat/Airavata", - "developer": "ai4bharat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.87 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3628 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3763 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1635 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/aixonlab/Aether-12b/a57d2d49-5ccf-48f5-8035-b1d480c80f40.json b/data/hfopenllm_v2/aixonlab/Aether-12b/a57d2d49-5ccf-48f5-8035-b1d480c80f40.json deleted file mode 100644 index 0edbd3648..000000000 --- a/data/hfopenllm_v2/aixonlab/Aether-12b/a57d2d49-5ccf-48f5-8035-b1d480c80f40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aixonlab_Aether-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aether-12b", - "id": "aixonlab/Aether-12b", - "developer": "aixonlab", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2347 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5179 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.341 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/aixonlab/Grey-12b/6b5a3c69-f8dd-4952-96fc-b6e4dec1ed9d.json b/data/hfopenllm_v2/aixonlab/Grey-12b/6b5a3c69-f8dd-4952-96fc-b6e4dec1ed9d.json deleted file mode 100644 index ae2b8e520..000000000 --- a/data/hfopenllm_v2/aixonlab/Grey-12b/6b5a3c69-f8dd-4952-96fc-b6e4dec1ed9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aixonlab_Grey-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Grey-12b", - "id": "aixonlab/Grey-12b", - "developer": "aixonlab", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5699 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/aixonlab/Zara-14b-v1.2/fe0665dd-b976-4d90-b16b-6c2acfef15ff.json b/data/hfopenllm_v2/aixonlab/Zara-14b-v1.2/fe0665dd-b976-4d90-b16b-6c2acfef15ff.json deleted file mode 100644 index fff6f1b78..000000000 --- a/data/hfopenllm_v2/aixonlab/Zara-14b-v1.2/fe0665dd-b976-4d90-b16b-6c2acfef15ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aixonlab_Zara-14b-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zara-14b-v1.2", - "id": "aixonlab/Zara-14b-v1.2", - "developer": "aixonlab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6197 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6405 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4675 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-First/8c6bdc44-fd29-45e7-b161-2c8e07ef2935.json b/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-First/8c6bdc44-fd29-45e7-b161-2c8e07ef2935.json deleted file mode 100644 index d89f7563d..000000000 --- 
a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-First/8c6bdc44-fd29-45e7-b161-2c8e07ef2935.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/akhadangi_Llama3.2.1B.0.01-First/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2.1B.0.01-First", - "id": "akhadangi/Llama3.2.1B.0.01-First", - "developer": "akhadangi", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3194 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-Last/e7c70ff9-59ad-4d09-8af0-ef9cf16d1dfa.json b/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-Last/e7c70ff9-59ad-4d09-8af0-ef9cf16d1dfa.json deleted file mode 100644 index c7af9c6fd..000000000 --- a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-Last/e7c70ff9-59ad-4d09-8af0-ef9cf16d1dfa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/akhadangi_Llama3.2.1B.0.01-Last/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2.1B.0.01-Last", - "id": "akhadangi/Llama3.2.1B.0.01-Last", - "developer": "akhadangi", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0917 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3159 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3206 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1227 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-First/26c4c993-ae49-42a0-be0a-f157be9f7d58.json b/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-First/26c4c993-ae49-42a0-be0a-f157be9f7d58.json deleted file mode 100644 index 79629d7c0..000000000 --- a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-First/26c4c993-ae49-42a0-be0a-f157be9f7d58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/akhadangi_Llama3.2.1B.0.1-First/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2.1B.0.1-First", - "id": "akhadangi/Llama3.2.1B.0.1-First", - "developer": "akhadangi", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1001 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.312 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-Last/19adf124-c120-4e97-80cf-49c40a66eb81.json b/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-Last/19adf124-c120-4e97-80cf-49c40a66eb81.json deleted file mode 100644 index 75815fcac..000000000 --- a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-Last/19adf124-c120-4e97-80cf-49c40a66eb81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/akhadangi_Llama3.2.1B.0.1-Last/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2.1B.0.1-Last", - "id": "akhadangi/Llama3.2.1B.0.1-Last", - "developer": "akhadangi", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.095 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.BaseFiT/66bc5d38-8d25-4934-bce8-41ce4ea0e385.json b/data/hfopenllm_v2/akhadangi/Llama3.2.1B.BaseFiT/66bc5d38-8d25-4934-bce8-41ce4ea0e385.json deleted file mode 100644 index 37dbf8a34..000000000 --- a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.BaseFiT/66bc5d38-8d25-4934-bce8-41ce4ea0e385.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/akhadangi_Llama3.2.1B.BaseFiT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2.1B.BaseFiT", - "id": "akhadangi/Llama3.2.1B.BaseFiT", - "developer": "akhadangi", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3175 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/541eafe5-807e-44b0-b652-a0752210fc71.json b/data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/541eafe5-807e-44b0-b652-a0752210fc71.json deleted file mode 100644 index a6b87b5af..000000000 --- a/data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/541eafe5-807e-44b0-b652-a0752210fc71.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/akjindal53244_Llama-3.1-Storm-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Storm-8B", - "id": "akjindal53244/Llama-3.1-Storm-8B", - "developer": "akjindal53244", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.8051 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1722 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4028 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3803 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/845a2484-9f17-4c0e-b06b-6250992298bc.json b/data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/845a2484-9f17-4c0e-b06b-6250992298bc.json deleted file mode 100644 index 8d84dffa8..000000000 --- a/data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/845a2484-9f17-4c0e-b06b-6250992298bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/akjindal53244_Llama-3.1-Storm-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Storm-8B", - "id": "akjindal53244/Llama-3.1-Storm-8B", - "developer": "akjindal53244", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1624 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4028 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/alcholjung/llama3_medical_tuned/e62b6b26-5f3c-42c9-9541-bb8b23caee66.json b/data/hfopenllm_v2/alcholjung/llama3_medical_tuned/e62b6b26-5f3c-42c9-9541-bb8b23caee66.json deleted file mode 100644 index e59f216fd..000000000 --- a/data/hfopenllm_v2/alcholjung/llama3_medical_tuned/e62b6b26-5f3c-42c9-9541-bb8b23caee66.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/alcholjung_llama3_medical_tuned/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3_medical_tuned", - "id": "alcholjung/llama3_medical_tuned", - "developer": "alcholjung", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 16.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4513 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.466 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-DPO/ec773b66-24fd-4b6f-ac9c-ebcd355e4be7.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-DPO/ec773b66-24fd-4b6f-ac9c-ebcd355e4be7.json deleted file mode 100644 index a5182fb56..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-DPO/ec773b66-24fd-4b6f-ac9c-ebcd355e4be7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-70B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-70B-DPO", - "id": "allenai/Llama-3.1-Tulu-3-70B-DPO", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8282 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6146 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4494 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4923 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4633 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-SFT/a70b8356-94ce-4f0d-b44a-2215076eed5e.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-SFT/a70b8356-94ce-4f0d-b44a-2215076eed5e.json deleted file mode 100644 index 1684989ec..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-SFT/a70b8356-94ce-4f0d-b44a-2215076eed5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-70B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-70B-SFT", - "id": "allenai/Llama-3.1-Tulu-3-70B-SFT", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8051 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5951 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5026 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/b182807d-587e-4702-bf30-dab11983b8db.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/b182807d-587e-4702-bf30-dab11983b8db.json deleted file mode 100644 index 80ba532bf..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/b182807d-587e-4702-bf30-dab11983b8db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-70B", - "id": "allenai/Llama-3.1-Tulu-3-70B", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8291 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4502 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4948 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/c1f0944a-c44c-42e9-90ba-a847509cbd66.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/c1f0944a-c44c-42e9-90ba-a847509cbd66.json deleted file mode 100644 index 2e028f4fd..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/c1f0944a-c44c-42e9-90ba-a847509cbd66.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-70B", - "id": "allenai/Llama-3.1-Tulu-3-70B", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6157 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4988 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4656 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-DPO/64bb8530-7071-402e-ba9b-1d15ecbe275c.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-DPO/64bb8530-7071-402e-ba9b-1d15ecbe275c.json deleted file mode 100644 index 012b3e9c5..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-DPO/64bb8530-7071-402e-ba9b-1d15ecbe275c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-8B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-8B-DPO", - "id": "allenai/Llama-3.1-Tulu-3-8B-DPO", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8029 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2364 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2898 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-RM/4f1fc265-f8b7-47e6-a9e6-cfa61b89ad4a.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-RM/4f1fc265-f8b7-47e6-a9e6-cfa61b89ad4a.json deleted file mode 100644 index 9397e2c60..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-RM/4f1fc265-f8b7-47e6-a9e6-cfa61b89ad4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-8B-RM/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-8B-RM", - "id": "allenai/Llama-3.1-Tulu-3-8B-RM", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForSequenceClassification", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.167 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3764 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1082 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-SFT/1420df5c-690e-4b01-b99c-c21c793689ae.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-SFT/1420df5c-690e-4b01-b99c-c21c793689ae.json deleted file mode 100644 index f10b3e75a..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-SFT/1420df5c-690e-4b01-b99c-c21c793689ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-8B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-8B-SFT", - "id": "allenai/Llama-3.1-Tulu-3-8B-SFT", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/aa9d0b0e-cb3f-452e-bc85-f7cf172d2b8b.json 
b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/aa9d0b0e-cb3f-452e-bc85-f7cf172d2b8b.json deleted file mode 100644 index 32b0d8b0b..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/aa9d0b0e-cb3f-452e-bc85-f7cf172d2b8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-8B", - "id": "allenai/Llama-3.1-Tulu-3-8B", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8255 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4061 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4175 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2821 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/dfabd777-8620-40e3-b19c-a9227f57b638.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/dfabd777-8620-40e3-b19c-a9227f57b638.json deleted file mode 100644 index fe1b7eb80..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/dfabd777-8620-40e3-b19c-a9227f57b638.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-8B", - "id": "allenai/Llama-3.1-Tulu-3-8B", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8267 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4175 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMo-1.7-7B-hf/08fe3877-ab04-426a-9e27-72ec4ff8ffc3.json b/data/hfopenllm_v2/allenai/OLMo-1.7-7B-hf/08fe3877-ab04-426a-9e27-72ec4ff8ffc3.json deleted file mode 100644 index 6810a21ef..000000000 --- a/data/hfopenllm_v2/allenai/OLMo-1.7-7B-hf/08fe3877-ab04-426a-9e27-72ec4ff8ffc3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMo-1.7-7B-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo-1.7-7B-hf", - "id": "allenai/OLMo-1.7-7B-hf", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Unknown", - "params_billions": 0.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1569 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMo-1B-hf/4b264bb0-bd7e-4b15-9591-50b5a521f100.json b/data/hfopenllm_v2/allenai/OLMo-1B-hf/4b264bb0-bd7e-4b15-9591-50b5a521f100.json deleted file mode 100644 index 667c017f1..000000000 --- a/data/hfopenllm_v2/allenai/OLMo-1B-hf/4b264bb0-bd7e-4b15-9591-50b5a521f100.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMo-1B-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo-1B-hf", - "id": "allenai/OLMo-1B-hf", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "OlmoForCausalLM", - 
"params_billions": 1.177 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2182 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3052 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1174 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMo-2-1124-7B-Instruct/a8cfe336-0c3e-401c-a1e9-d951e64918ec.json b/data/hfopenllm_v2/allenai/OLMo-2-1124-7B-Instruct/a8cfe336-0c3e-401c-a1e9-d951e64918ec.json deleted file mode 100644 index a7c81df8a..000000000 --- a/data/hfopenllm_v2/allenai/OLMo-2-1124-7B-Instruct/a8cfe336-0c3e-401c-a1e9-d951e64918ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMo-2-1124-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo-2-1124-7B-Instruct", - "id": "allenai/OLMo-2-1124-7B-Instruct", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Olmo2ForCausalLM", - "params_billions": 7.299 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7244 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1488 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3508 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMo-7B-Instruct-hf/5e66c653-41b1-46de-b677-ffd8426ba5ec.json b/data/hfopenllm_v2/allenai/OLMo-7B-Instruct-hf/5e66c653-41b1-46de-b677-ffd8426ba5ec.json deleted file mode 100644 index d0864d416..000000000 --- a/data/hfopenllm_v2/allenai/OLMo-7B-Instruct-hf/5e66c653-41b1-46de-b677-ffd8426ba5ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMo-7B-Instruct-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo-7B-Instruct-hf", - "id": "allenai/OLMo-7B-Instruct-hf", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "OlmoForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3765 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1785 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMo-7B-hf/9f0f0914-1f7a-468e-8a2e-7ae122fd064d.json b/data/hfopenllm_v2/allenai/OLMo-7B-hf/9f0f0914-1f7a-468e-8a2e-7ae122fd064d.json deleted file mode 100644 index f93b86dbe..000000000 --- a/data/hfopenllm_v2/allenai/OLMo-7B-hf/9f0f0914-1f7a-468e-8a2e-7ae122fd064d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMo-7B-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo-7B-hf", - "id": "allenai/OLMo-7B-hf", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "OlmoForCausalLM", - "params_billions": 6.888 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3279 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0125-Instruct/cc64a143-4f1e-42ee-ade1-fafc4b316336.json b/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0125-Instruct/cc64a143-4f1e-42ee-ade1-fafc4b316336.json deleted file mode 100644 index d3d1048ff..000000000 --- a/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0125-Instruct/cc64a143-4f1e-42ee-ade1-fafc4b316336.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMoE-1B-7B-0125-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMoE-1B-7B-0125-Instruct", - "id": "allenai/OLMoE-1B-7B-0125-Instruct", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "OlmoeForCausalLM", - "params_billions": 6.919 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3636 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1915 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924-Instruct/cf322e64-2682-4a9a-a48f-c4ec47b852f2.json b/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924-Instruct/cf322e64-2682-4a9a-a48f-c4ec47b852f2.json deleted file mode 100644 index 9edb56ae9..000000000 --- a/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924-Instruct/cf322e64-2682-4a9a-a48f-c4ec47b852f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMoE-1B-7B-0924-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMoE-1B-7B-0924-Instruct", - "id": "allenai/OLMoE-1B-7B-0924-Instruct", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "OlmoeForCausalLM", - "params_billions": 6.919 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3902 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3848 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1876 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924/30b32261-b24a-49e3-ba57-172dc1d03ba0.json b/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924/30b32261-b24a-49e3-ba57-172dc1d03ba0.json deleted file mode 100644 index 9a8c2b352..000000000 --- a/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924/30b32261-b24a-49e3-ba57-172dc1d03ba0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMoE-1B-7B-0924/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMoE-1B-7B-0924", - "id": "allenai/OLMoE-1B-7B-0924", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "OlmoeForCausalLM", - "params_billions": 6.919 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2185 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.174 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Chocolatine-24B/0681c01d-23f3-4b8b-9516-a5cc41761fc4.json b/data/hfopenllm_v2/allknowingroger/Chocolatine-24B/0681c01d-23f3-4b8b-9516-a5cc41761fc4.json deleted file mode 100644 index 159a92fad..000000000 --- a/data/hfopenllm_v2/allknowingroger/Chocolatine-24B/0681c01d-23f3-4b8b-9516-a5cc41761fc4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Chocolatine-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-24B", - "id": "allknowingroger/Chocolatine-24B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 24.184 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1958 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6191 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4566 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-2.6B/7693ed8a-f76d-482b-92c1-f11810e522ca.json b/data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-2.6B/7693ed8a-f76d-482b-92c1-f11810e522ca.json deleted file mode 100644 index 7d68b1a32..000000000 --- a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-2.6B/7693ed8a-f76d-482b-92c1-f11810e522ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Gemma2Slerp1-2.6B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2Slerp1-2.6B", - "id": "allknowingroger/Gemma2Slerp1-2.6B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2689 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-27B/f8dc0128-c606-490a-b965-59d5377dd778.json b/data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-27B/f8dc0128-c606-490a-b965-59d5377dd778.json deleted file mode 100644 index 8516f0711..000000000 --- a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-27B/f8dc0128-c606-490a-b965-59d5377dd778.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Gemma2Slerp1-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2Slerp1-27B", - "id": "allknowingroger/Gemma2Slerp1-27B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7186 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2583 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-2.6B/844547f7-658f-41dd-ab4c-dc0569030e59.json b/data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-2.6B/844547f7-658f-41dd-ab4c-dc0569030e59.json deleted file mode 100644 index 58ea93670..000000000 --- a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-2.6B/844547f7-658f-41dd-ab4c-dc0569030e59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Gemma2Slerp2-2.6B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2Slerp2-2.6B", - "id": "allknowingroger/Gemma2Slerp2-2.6B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5747 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4468 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2696 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-27B/75c291b5-6d60-4bde-8621-f865196a6ecc.json b/data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-27B/75c291b5-6d60-4bde-8621-f865196a6ecc.json deleted file mode 100644 index 921a675ca..000000000 --- a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-27B/75c291b5-6d60-4bde-8621-f865196a6ecc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Gemma2Slerp2-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2Slerp2-27B", - "id": "allknowingroger/Gemma2Slerp2-27B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7546 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2787 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4623 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp3-27B/36d54b12-594f-47fe-9637-a9b740416c5c.json b/data/hfopenllm_v2/allknowingroger/Gemma2Slerp3-27B/36d54b12-594f-47fe-9637-a9b740416c5c.json deleted file mode 100644 index 
59baa2f21..000000000 --- a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp3-27B/36d54b12-594f-47fe-9637-a9b740416c5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Gemma2Slerp3-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2Slerp3-27B", - "id": "allknowingroger/Gemma2Slerp3-27B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7426 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2742 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4641 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp4-27B/57733383-9573-463d-a467-068d2685014c.json b/data/hfopenllm_v2/allknowingroger/Gemma2Slerp4-27B/57733383-9573-463d-a467-068d2685014c.json deleted file mode 100644 index 5c534391b..000000000 --- a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp4-27B/57733383-9573-463d-a467-068d2685014c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/allknowingroger_Gemma2Slerp4-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2Slerp4-27B", - "id": "allknowingroger/Gemma2Slerp4-27B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7497 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/GemmaSlerp-9B/eda1ac9a-98e1-496f-bdeb-1e256b52c14a.json b/data/hfopenllm_v2/allknowingroger/GemmaSlerp-9B/eda1ac9a-98e1-496f-bdeb-1e256b52c14a.json deleted file mode 100644 index edc562705..000000000 --- a/data/hfopenllm_v2/allknowingroger/GemmaSlerp-9B/eda1ac9a-98e1-496f-bdeb-1e256b52c14a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_GemmaSlerp-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GemmaSlerp-9B", - "id": "allknowingroger/GemmaSlerp-9B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5921 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/GemmaSlerp2-9B/00b8bfda-c6b1-4e1f-b68c-bff7335e2dff.json b/data/hfopenllm_v2/allknowingroger/GemmaSlerp2-9B/00b8bfda-c6b1-4e1f-b68c-bff7335e2dff.json deleted file mode 100644 index 126811a1c..000000000 --- a/data/hfopenllm_v2/allknowingroger/GemmaSlerp2-9B/00b8bfda-c6b1-4e1f-b68c-bff7335e2dff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_GemmaSlerp2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GemmaSlerp2-9B", - "id": "allknowingroger/GemmaSlerp2-9B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": 
{ - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7281 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/GemmaSlerp4-10B/0a3b9ad6-b853-471d-a292-413b30273034.json b/data/hfopenllm_v2/allknowingroger/GemmaSlerp4-10B/0a3b9ad6-b853-471d-a292-413b30273034.json deleted file mode 100644 index 4749c39dc..000000000 --- a/data/hfopenllm_v2/allknowingroger/GemmaSlerp4-10B/0a3b9ad6-b853-471d-a292-413b30273034.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_GemmaSlerp4-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GemmaSlerp4-10B", - "id": "allknowingroger/GemmaSlerp4-10B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.454 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/GemmaSlerp5-10B/d61c3ace-e353-4c0b-9472-c9a1928809cc.json b/data/hfopenllm_v2/allknowingroger/GemmaSlerp5-10B/d61c3ace-e353-4c0b-9472-c9a1928809cc.json deleted file mode 100644 index 16732735f..000000000 --- a/data/hfopenllm_v2/allknowingroger/GemmaSlerp5-10B/d61c3ace-e353-4c0b-9472-c9a1928809cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_GemmaSlerp5-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GemmaSlerp5-10B", - "id": "allknowingroger/GemmaSlerp5-10B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.7353 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4608 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/GemmaStock1-27B/2293a19a-b650-436d-9448-1b641e63d407.json b/data/hfopenllm_v2/allknowingroger/GemmaStock1-27B/2293a19a-b650-436d-9448-1b641e63d407.json deleted file mode 100644 index c77b2f811..000000000 --- a/data/hfopenllm_v2/allknowingroger/GemmaStock1-27B/2293a19a-b650-436d-9448-1b641e63d407.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_GemmaStock1-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GemmaStock1-27B", - "id": "allknowingroger/GemmaStock1-27B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6566 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2636 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/HomerSlerp1-7B/c15b977c-c781-4b17-ac9f-25c77602c875.json b/data/hfopenllm_v2/allknowingroger/HomerSlerp1-7B/c15b977c-c781-4b17-ac9f-25c77602c875.json deleted file mode 100644 index a693fa309..000000000 --- a/data/hfopenllm_v2/allknowingroger/HomerSlerp1-7B/c15b977c-c781-4b17-ac9f-25c77602c875.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_HomerSlerp1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HomerSlerp1-7B", - "id": "allknowingroger/HomerSlerp1-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5518 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4359 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4504 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/HomerSlerp2-7B/42c191be-c0ae-4170-8b6f-565053ae7d9c.json b/data/hfopenllm_v2/allknowingroger/HomerSlerp2-7B/42c191be-c0ae-4170-8b6f-565053ae7d9c.json deleted file mode 100644 index a4cdc4f0b..000000000 --- a/data/hfopenllm_v2/allknowingroger/HomerSlerp2-7B/42c191be-c0ae-4170-8b6f-565053ae7d9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_HomerSlerp2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HomerSlerp2-7B", - "id": "allknowingroger/HomerSlerp2-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5649 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2968 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/HomerSlerp3-7B/f5cb910d-6e5b-404a-a751-d5cb90668150.json b/data/hfopenllm_v2/allknowingroger/HomerSlerp3-7B/f5cb910d-6e5b-404a-a751-d5cb90668150.json deleted file mode 100644 index 776bb888f..000000000 --- a/data/hfopenllm_v2/allknowingroger/HomerSlerp3-7B/f5cb910d-6e5b-404a-a751-d5cb90668150.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_HomerSlerp3-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HomerSlerp3-7B", - "id": "allknowingroger/HomerSlerp3-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5598 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3021 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4535 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/HomerSlerp4-7B/de806e4c-dbf8-48cc-a0d8-033a61dfc777.json b/data/hfopenllm_v2/allknowingroger/HomerSlerp4-7B/de806e4c-dbf8-48cc-a0d8-033a61dfc777.json deleted file mode 100644 index 304635237..000000000 --- a/data/hfopenllm_v2/allknowingroger/HomerSlerp4-7B/de806e4c-dbf8-48cc-a0d8-033a61dfc777.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_HomerSlerp4-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HomerSlerp4-7B", - "id": "allknowingroger/HomerSlerp4-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4374 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5571 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4472 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/LimyQstar-7B-slerp/59150b73-b05a-451e-ba3f-696d04effe05.json b/data/hfopenllm_v2/allknowingroger/LimyQstar-7B-slerp/59150b73-b05a-451e-ba3f-696d04effe05.json deleted file mode 100644 index 992931ce3..000000000 --- a/data/hfopenllm_v2/allknowingroger/LimyQstar-7B-slerp/59150b73-b05a-451e-ba3f-696d04effe05.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_LimyQstar-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LimyQstar-7B-slerp", - "id": "allknowingroger/LimyQstar-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3491 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5024 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Llama3.1-60B/84926b81-360a-480c-b240-f154ec7fe0ba.json b/data/hfopenllm_v2/allknowingroger/Llama3.1-60B/84926b81-360a-480c-b240-f154ec7fe0ba.json deleted file mode 100644 index 37c1e779c..000000000 --- a/data/hfopenllm_v2/allknowingroger/Llama3.1-60B/84926b81-360a-480c-b240-f154ec7fe0ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Llama3.1-60B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-60B", - "id": "allknowingroger/Llama3.1-60B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 61.997 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1815 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3242 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", 
- "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Marco-01-slerp1-7B/8e6edb04-302b-4dfc-b38f-94b437c921a8.json b/data/hfopenllm_v2/allknowingroger/Marco-01-slerp1-7B/8e6edb04-302b-4dfc-b38f-94b437c921a8.json deleted file mode 100644 index d13ebcc1a..000000000 --- a/data/hfopenllm_v2/allknowingroger/Marco-01-slerp1-7B/8e6edb04-302b-4dfc-b38f-94b437c921a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Marco-01-slerp1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Marco-01-slerp1-7B", - "id": "allknowingroger/Marco-01-slerp1-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4681 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5541 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4483 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Meme-7B-slerp/db92c564-1cf9-43db-9e25-1f450c7b1e7f.json b/data/hfopenllm_v2/allknowingroger/Meme-7B-slerp/db92c564-1cf9-43db-9e25-1f450c7b1e7f.json deleted file mode 100644 index 2747ba0f5..000000000 --- a/data/hfopenllm_v2/allknowingroger/Meme-7B-slerp/db92c564-1cf9-43db-9e25-1f450c7b1e7f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Meme-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meme-7B-slerp", - "id": "allknowingroger/Meme-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5164 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4661 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4223 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ministral-8B-slerp/e3796243-cbba-4ec2-ad7c-89547ad24342.json 
b/data/hfopenllm_v2/allknowingroger/Ministral-8B-slerp/e3796243-cbba-4ec2-ad7c-89547ad24342.json deleted file mode 100644 index ae54b512b..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ministral-8B-slerp/e3796243-cbba-4ec2-ad7c-89547ad24342.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ministral-8B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ministral-8B-slerp", - "id": "allknowingroger/Ministral-8B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4285 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3119 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MistralPhi3-11B/1479be90-df8f-4e1d-b9db-03e84000187a.json b/data/hfopenllm_v2/allknowingroger/MistralPhi3-11B/1479be90-df8f-4e1d-b9db-03e84000187a.json deleted file mode 100644 index 97d6fd6ec..000000000 --- 
a/data/hfopenllm_v2/allknowingroger/MistralPhi3-11B/1479be90-df8f-4e1d-b9db-03e84000187a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MistralPhi3-11B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MistralPhi3-11B", - "id": "allknowingroger/MistralPhi3-11B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.234 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1943 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6234 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4267 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Mistralmash1-7B-s/d2e6c48c-1c18-45a6-ba1a-b335325c980c.json b/data/hfopenllm_v2/allknowingroger/Mistralmash1-7B-s/d2e6c48c-1c18-45a6-ba1a-b335325c980c.json deleted file mode 100644 index c3b192913..000000000 --- a/data/hfopenllm_v2/allknowingroger/Mistralmash1-7B-s/d2e6c48c-1c18-45a6-ba1a-b335325c980c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/allknowingroger_Mistralmash1-7B-s/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistralmash1-7B-s", - "id": "allknowingroger/Mistralmash1-7B-s", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4267 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3293 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Mistralmash2-7B-s/f843e45a-f66b-4091-a964-75583c2d7fc5.json b/data/hfopenllm_v2/allknowingroger/Mistralmash2-7B-s/f843e45a-f66b-4091-a964-75583c2d7fc5.json deleted file mode 100644 index 5d5a5cc33..000000000 --- a/data/hfopenllm_v2/allknowingroger/Mistralmash2-7B-s/f843e45a-f66b-4091-a964-75583c2d7fc5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Mistralmash2-7B-s/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistralmash2-7B-s", - "id": "allknowingroger/Mistralmash2-7B-s", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4102 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MixTAO-19B-pass/cbc3cd41-e187-4c4f-b207-37bceab423a4.json b/data/hfopenllm_v2/allknowingroger/MixTAO-19B-pass/cbc3cd41-e187-4c4f-b207-37bceab423a4.json deleted file mode 100644 index 860bc85e9..000000000 --- a/data/hfopenllm_v2/allknowingroger/MixTAO-19B-pass/cbc3cd41-e187-4c4f-b207-37bceab423a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MixTAO-19B-pass/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MixTAO-19B-pass", - "id": "allknowingroger/MixTAO-19B-pass", - "developer": "allknowingroger", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 19.188 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5128 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4783 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MixTaoTruthful-13B-slerp/0f124566-5e94-4233-9a3f-5ff9cfdf160c.json b/data/hfopenllm_v2/allknowingroger/MixTaoTruthful-13B-slerp/0f124566-5e94-4233-9a3f-5ff9cfdf160c.json deleted file mode 100644 index 68ee45cec..000000000 --- a/data/hfopenllm_v2/allknowingroger/MixTaoTruthful-13B-slerp/0f124566-5e94-4233-9a3f-5ff9cfdf160c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MixTaoTruthful-13B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MixTaoTruthful-13B-slerp", - "id": "allknowingroger/MixTaoTruthful-13B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4292 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiCalm-7B-slerp/98fabba8-7d70-4a1f-b03c-37e1a9ac94e8.json b/data/hfopenllm_v2/allknowingroger/MultiCalm-7B-slerp/98fabba8-7d70-4a1f-b03c-37e1a9ac94e8.json deleted file mode 100644 index 717e85ceb..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiCalm-7B-slerp/98fabba8-7d70-4a1f-b03c-37e1a9ac94e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiCalm-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiCalm-7B-slerp", - "id": "allknowingroger/MultiCalm-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5122 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3033 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash-12B-slerp/91522dad-529b-477c-8372-793f631e14b7.json b/data/hfopenllm_v2/allknowingroger/MultiMash-12B-slerp/91522dad-529b-477c-8372-793f631e14b7.json deleted file mode 100644 index 7be2befa5..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash-12B-slerp/91522dad-529b-477c-8372-793f631e14b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash-12B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash-12B-slerp", - "id": "allknowingroger/MultiMash-12B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3974 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4438 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash10-13B-slerp/cec22734-493c-4d11-ba86-6c7ae2005124.json b/data/hfopenllm_v2/allknowingroger/MultiMash10-13B-slerp/cec22734-493c-4d11-ba86-6c7ae2005124.json deleted file mode 100644 index 400c2095d..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash10-13B-slerp/cec22734-493c-4d11-ba86-6c7ae2005124.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash10-13B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash10-13B-slerp", - "id": "allknowingroger/MultiMash10-13B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4163 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5186 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4318 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash11-13B-slerp/704a6e19-0d86-42a5-b8f5-05a5856e9c29.json b/data/hfopenllm_v2/allknowingroger/MultiMash11-13B-slerp/704a6e19-0d86-42a5-b8f5-05a5856e9c29.json deleted file mode 100644 index d47ffa3a1..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash11-13B-slerp/704a6e19-0d86-42a5-b8f5-05a5856e9c29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash11-13B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash11-13B-slerp", - "id": "allknowingroger/MultiMash11-13B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 
5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3085 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash2-12B-slerp/bc54349d-59e0-4ae4-94f9-3f5ae98261f4.json b/data/hfopenllm_v2/allknowingroger/MultiMash2-12B-slerp/bc54349d-59e0-4ae4-94f9-3f5ae98261f4.json deleted file mode 100644 index ee50b15c0..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash2-12B-slerp/bc54349d-59e0-4ae4-94f9-3f5ae98261f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash2-12B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash2-12B-slerp", - "id": "allknowingroger/MultiMash2-12B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4261 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5134 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3043 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash5-12B-slerp/d20d533a-758b-477c-b4eb-073adaed640e.json b/data/hfopenllm_v2/allknowingroger/MultiMash5-12B-slerp/d20d533a-758b-477c-b4eb-073adaed640e.json deleted file mode 100644 index 195752d1b..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash5-12B-slerp/d20d533a-758b-477c-b4eb-073adaed640e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash5-12B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash5-12B-slerp", - "id": "allknowingroger/MultiMash5-12B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4142 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5145 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3028 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash6-12B-slerp/f7c9ad0d-3fea-4bec-8ac3-46f01a3449fb.json b/data/hfopenllm_v2/allknowingroger/MultiMash6-12B-slerp/f7c9ad0d-3fea-4bec-8ac3-46f01a3449fb.json deleted file mode 100644 index a7902f000..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash6-12B-slerp/f7c9ad0d-3fea-4bec-8ac3-46f01a3449fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash6-12B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash6-12B-slerp", - "id": "allknowingroger/MultiMash6-12B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4306 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3091 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash7-12B-slerp/9db1f823-e068-4a39-a5cc-b9c588099427.json b/data/hfopenllm_v2/allknowingroger/MultiMash7-12B-slerp/9db1f823-e068-4a39-a5cc-b9c588099427.json deleted file mode 100644 index 7142478be..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash7-12B-slerp/9db1f823-e068-4a39-a5cc-b9c588099427.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash7-12B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash7-12B-slerp", - "id": "allknowingroger/MultiMash7-12B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4213 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash8-13B-slerp/23818b45-bf5f-48a2-982f-1e2a0d35aac8.json b/data/hfopenllm_v2/allknowingroger/MultiMash8-13B-slerp/23818b45-bf5f-48a2-982f-1e2a0d35aac8.json deleted file mode 100644 index 55e001fb8..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash8-13B-slerp/23818b45-bf5f-48a2-982f-1e2a0d35aac8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash8-13B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash8-13B-slerp", - "id": "allknowingroger/MultiMash8-13B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash9-13B-slerp/de6eda66-b8f5-4b23-89e1-44bbac600953.json b/data/hfopenllm_v2/allknowingroger/MultiMash9-13B-slerp/de6eda66-b8f5-4b23-89e1-44bbac600953.json deleted file mode 100644 index b3fb05f58..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash9-13B-slerp/de6eda66-b8f5-4b23-89e1-44bbac600953.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash9-13B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash9-13B-slerp", - "id": "allknowingroger/MultiMash9-13B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMerge-7B-slerp/632974c2-57e2-41f9-8c00-671e07e7594b.json b/data/hfopenllm_v2/allknowingroger/MultiMerge-7B-slerp/632974c2-57e2-41f9-8c00-671e07e7594b.json deleted file mode 100644 index a78094980..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMerge-7B-slerp/632974c2-57e2-41f9-8c00-671e07e7594b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMerge-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMerge-7B-slerp", - "id": "allknowingroger/MultiMerge-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3948 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Multimash3-12B-slerp/e86dcf4f-6282-4aa6-b645-00f93a2e9077.json b/data/hfopenllm_v2/allknowingroger/Multimash3-12B-slerp/e86dcf4f-6282-4aa6-b645-00f93a2e9077.json deleted file mode 100644 index 67777cbfb..000000000 --- a/data/hfopenllm_v2/allknowingroger/Multimash3-12B-slerp/e86dcf4f-6282-4aa6-b645-00f93a2e9077.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Multimash3-12B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Multimash3-12B-slerp", - "id": "allknowingroger/Multimash3-12B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4344 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/allknowingroger/Multimerge-19B-pass/b20be5c9-9720-4076-b587-728549dd19af.json b/data/hfopenllm_v2/allknowingroger/Multimerge-19B-pass/b20be5c9-9720-4076-b587-728549dd19af.json deleted file mode 100644 index 1204ac4a5..000000000 --- a/data/hfopenllm_v2/allknowingroger/Multimerge-19B-pass/b20be5c9-9720-4076-b587-728549dd19af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Multimerge-19B-pass/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Multimerge-19B-pass", - "id": "allknowingroger/Multimerge-19B-pass", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 19.188 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1773 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2892 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiverseEx26-7B-slerp/5e193803-39d1-4f12-8726-ebbe5f71563c.json b/data/hfopenllm_v2/allknowingroger/MultiverseEx26-7B-slerp/5e193803-39d1-4f12-8726-ebbe5f71563c.json deleted 
file mode 100644 index b5df1f90d..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiverseEx26-7B-slerp/5e193803-39d1-4f12-8726-ebbe5f71563c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiverseEx26-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiverseEx26-7B-slerp", - "id": "allknowingroger/MultiverseEx26-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5134 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3035 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/NeuralWestSeverus-7B-slerp/61131a6c-f412-42bf-814b-7d711a840d44.json b/data/hfopenllm_v2/allknowingroger/NeuralWestSeverus-7B-slerp/61131a6c-f412-42bf-814b-7d711a840d44.json deleted file mode 100644 index b88992a9a..000000000 --- a/data/hfopenllm_v2/allknowingroger/NeuralWestSeverus-7B-slerp/61131a6c-f412-42bf-814b-7d711a840d44.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_NeuralWestSeverus-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralWestSeverus-7B-slerp", - "id": "allknowingroger/NeuralWestSeverus-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5244 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Neuralcoven-7B-slerp/535e72b1-17e0-40e3-9d66-d31f8ec70413.json b/data/hfopenllm_v2/allknowingroger/Neuralcoven-7B-slerp/535e72b1-17e0-40e3-9d66-d31f8ec70413.json deleted file mode 100644 index 035e133f8..000000000 --- a/data/hfopenllm_v2/allknowingroger/Neuralcoven-7B-slerp/535e72b1-17e0-40e3-9d66-d31f8ec70413.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Neuralcoven-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neuralcoven-7B-slerp", - "id": "allknowingroger/Neuralcoven-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5303 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3294 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Neuralmultiverse-7B-slerp/ea15479e-24a8-4924-a754-a8567c511e61.json b/data/hfopenllm_v2/allknowingroger/Neuralmultiverse-7B-slerp/ea15479e-24a8-4924-a754-a8567c511e61.json deleted file mode 100644 index 534b2a6eb..000000000 --- a/data/hfopenllm_v2/allknowingroger/Neuralmultiverse-7B-slerp/ea15479e-24a8-4924-a754-a8567c511e61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Neuralmultiverse-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Neuralmultiverse-7B-slerp", - "id": "allknowingroger/Neuralmultiverse-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5166 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3042 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3della5-14B/5799f285-c61f-43a8-a6a6-053808cf4e8f.json b/data/hfopenllm_v2/allknowingroger/Ph3della5-14B/5799f285-c61f-43a8-a6a6-053808cf4e8f.json deleted file mode 100644 index 0a78188b8..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3della5-14B/5799f285-c61f-43a8-a6a6-053808cf4e8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3della5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3della5-14B", - "id": "allknowingroger/Ph3della5-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": 
"Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4799 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6332 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1767 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4787 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3merge-14B/36feef44-3d3b-4102-8606-ee6420bddcff.json b/data/hfopenllm_v2/allknowingroger/Ph3merge-14B/36feef44-3d3b-4102-8606-ee6420bddcff.json deleted file mode 100644 index 9d397a68f..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3merge-14B/36feef44-3d3b-4102-8606-ee6420bddcff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3merge-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3merge-14B", - "id": "allknowingroger/Ph3merge-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.619 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6381 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4334 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3merge2-14B/fd55f19a-2c22-4f29-82e0-15b02f25b9a9.json b/data/hfopenllm_v2/allknowingroger/Ph3merge2-14B/fd55f19a-2c22-4f29-82e0-15b02f25b9a9.json deleted file mode 100644 index 3229ae0a1..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3merge2-14B/fd55f19a-2c22-4f29-82e0-15b02f25b9a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3merge2-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3merge2-14B", - "id": "allknowingroger/Ph3merge2-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.619 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1706 - } - }, - { - "evaluation_name": "BBH", - "source_data": 
{ - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1723 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3merge3-14B/18e5decd-c95e-43d2-9ba2-007ba32e216f.json b/data/hfopenllm_v2/allknowingroger/Ph3merge3-14B/18e5decd-c95e-43d2-9ba2-007ba32e216f.json deleted file mode 100644 index 56d85e1a4..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3merge3-14B/18e5decd-c95e-43d2-9ba2-007ba32e216f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3merge3-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3merge3-14B", - "id": "allknowingroger/Ph3merge3-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.619 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1645 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3597 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4082 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3task1-14B/85a4996e-8c44-4e4f-9478-19a8c5513617.json b/data/hfopenllm_v2/allknowingroger/Ph3task1-14B/85a4996e-8c44-4e4f-9478-19a8c5513617.json deleted file mode 100644 index 7b39272a0..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3task1-14B/85a4996e-8c44-4e4f-9478-19a8c5513617.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3task1-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3task1-14B", - "id": "allknowingroger/Ph3task1-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4508 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3task2-14B/db6d57c8-df0b-407e-b937-67c55b513a5f.json b/data/hfopenllm_v2/allknowingroger/Ph3task2-14B/db6d57c8-df0b-407e-b937-67c55b513a5f.json deleted file mode 100644 index 32356567b..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3task2-14B/db6d57c8-df0b-407e-b937-67c55b513a5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3task2-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3task2-14B", - "id": "allknowingroger/Ph3task2-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4713 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1465 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4535 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3task3-14B/89ac933d-0a7c-40e6-8fa7-35bb6205e44b.json b/data/hfopenllm_v2/allknowingroger/Ph3task3-14B/89ac933d-0a7c-40e6-8fa7-35bb6205e44b.json deleted file mode 100644 index 817fe3f3d..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3task3-14B/89ac933d-0a7c-40e6-8fa7-35bb6205e44b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3task3-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3task3-14B", - "id": "allknowingroger/Ph3task3-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4771 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3unsloth-3B-slerp/c79e690f-3e09-4fac-9412-937a3b7ef352.json b/data/hfopenllm_v2/allknowingroger/Ph3unsloth-3B-slerp/c79e690f-3e09-4fac-9412-937a3b7ef352.json deleted file mode 100644 index ba7ad3c78..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3unsloth-3B-slerp/c79e690f-3e09-4fac-9412-937a3b7ef352.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3unsloth-3B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3unsloth-3B-slerp", - "id": "allknowingroger/Ph3unsloth-3B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1894 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5468 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Phi3mash1-17B-pass/ce74b7e3-8505-4c79-a7de-12d1e6b47155.json b/data/hfopenllm_v2/allknowingroger/Phi3mash1-17B-pass/ce74b7e3-8505-4c79-a7de-12d1e6b47155.json deleted file mode 100644 index fb008b871..000000000 --- a/data/hfopenllm_v2/allknowingroger/Phi3mash1-17B-pass/ce74b7e3-8505-4c79-a7de-12d1e6b47155.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Phi3mash1-17B-pass/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi3mash1-17B-pass", - "id": "allknowingroger/Phi3mash1-17B-pass", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 16.687 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1884 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6129 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4451 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Quen2-65B/3c562d8a-2df9-4d3f-9699-bfaee4a1ce2b.json b/data/hfopenllm_v2/allknowingroger/Quen2-65B/3c562d8a-2df9-4d3f-9699-bfaee4a1ce2b.json deleted file mode 100644 index bd16b28e4..000000000 --- a/data/hfopenllm_v2/allknowingroger/Quen2-65B/3c562d8a-2df9-4d3f-9699-bfaee4a1ce2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Quen2-65B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Quen2-65B", - "id": "allknowingroger/Quen2-65B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 63.923 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1758 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2757 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2357 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwen2.5-42B-AGI/152b0cbe-e27b-4438-8326-e67f4e70e600.json b/data/hfopenllm_v2/allknowingroger/Qwen2.5-42B-AGI/152b0cbe-e27b-4438-8326-e67f4e70e600.json deleted file mode 100644 index a56e8fc53..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwen2.5-42B-AGI/152b0cbe-e27b-4438-8326-e67f4e70e600.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwen2.5-42B-AGI/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-42B-AGI", - "id": "allknowingroger/Qwen2.5-42B-AGI", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 42.516 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1913 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2942 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.362 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1168 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task2/c733c91f-79a9-49e5-9398-3a424ee1940a.json b/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task2/c733c91f-79a9-49e5-9398-3a424ee1940a.json deleted file mode 100644 index c7409a22c..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task2/c733c91f-79a9-49e5-9398-3a424ee1940a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwen2.5-7B-task2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-task2", - "id": "allknowingroger/Qwen2.5-7B-task2", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task3/32d7b6c6-de5c-4864-a446-97dccce378c5.json b/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task3/32d7b6c6-de5c-4864-a446-97dccce378c5.json deleted file mode 100644 index 
0200a240d..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task3/32d7b6c6-de5c-4864-a446-97dccce378c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwen2.5-7B-task3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-task3", - "id": "allknowingroger/Qwen2.5-7B-task3", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5129 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2606 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4501 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task4/7b22d02b-5bfd-4243-9ad9-c858d0af55a6.json b/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task4/7b22d02b-5bfd-4243-9ad9-c858d0af55a6.json deleted file mode 100644 index fcd61a787..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task4/7b22d02b-5bfd-4243-9ad9-c858d0af55a6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/allknowingroger_Qwen2.5-7B-task4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-task4", - "id": "allknowingroger/Qwen2.5-7B-task4", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5005 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5583 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task7/99650529-55d9-42b0-b812-761a30277e5e.json b/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task7/99650529-55d9-42b0-b812-761a30277e5e.json deleted file mode 100644 index 3af393c86..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task7/99650529-55d9-42b0-b812-761a30277e5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwen2.5-7B-task7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-task7", - "id": "allknowingroger/Qwen2.5-7B-task7", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5552 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4133 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task8/81abbc2a-791b-4a39-bb46-97edfa14b9c0.json b/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task8/81abbc2a-791b-4a39-bb46-97edfa14b9c0.json deleted file mode 100644 index fcd83a7a4..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task8/81abbc2a-791b-4a39-bb46-97edfa14b9c0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwen2.5-7B-task8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-task8", - "id": "allknowingroger/Qwen2.5-7B-task8", - "developer": "allknowingroger", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4433 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwen2.5-slerp-14B/c658e535-7098-40fc-bea0-f5734d8f4ca9.json b/data/hfopenllm_v2/allknowingroger/Qwen2.5-slerp-14B/c658e535-7098-40fc-bea0-f5734d8f4ca9.json deleted file mode 100644 index 9b625885c..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwen2.5-slerp-14B/c658e535-7098-40fc-bea0-f5734d8f4ca9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwen2.5-slerp-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-slerp-14B", - "id": "allknowingroger/Qwen2.5-slerp-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4928 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4744 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/QwenSlerp12-7B/9e0656e9-9b82-4f6d-b00a-c09cf9cbc105.json b/data/hfopenllm_v2/allknowingroger/QwenSlerp12-7B/9e0656e9-9b82-4f6d-b00a-c09cf9cbc105.json deleted file mode 100644 index e6896f1a4..000000000 --- a/data/hfopenllm_v2/allknowingroger/QwenSlerp12-7B/9e0656e9-9b82-4f6d-b00a-c09cf9cbc105.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_QwenSlerp12-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp12-7B", - "id": "allknowingroger/QwenSlerp12-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5556 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2946 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4461 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/QwenSlerp4-14B/07c36058-e0e8-48ea-85f3-0a2cb2fe3443.json b/data/hfopenllm_v2/allknowingroger/QwenSlerp4-14B/07c36058-e0e8-48ea-85f3-0a2cb2fe3443.json deleted file mode 100644 index 38cf85407..000000000 --- a/data/hfopenllm_v2/allknowingroger/QwenSlerp4-14B/07c36058-e0e8-48ea-85f3-0a2cb2fe3443.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_QwenSlerp4-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp4-14B", - "id": "allknowingroger/QwenSlerp4-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6483 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.465 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5436 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/QwenSlerp5-14B/c41d8925-b56b-458e-b1a9-27dbbcaee149.json b/data/hfopenllm_v2/allknowingroger/QwenSlerp5-14B/c41d8925-b56b-458e-b1a9-27dbbcaee149.json deleted file mode 100644 index 615bfa0a9..000000000 --- a/data/hfopenllm_v2/allknowingroger/QwenSlerp5-14B/c41d8925-b56b-458e-b1a9-27dbbcaee149.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_QwenSlerp5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp5-14B", - "id": "allknowingroger/QwenSlerp5-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6357 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4675 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/QwenSlerp6-14B/9136feb4-5c3e-48b3-bc70-c7816b8b189b.json b/data/hfopenllm_v2/allknowingroger/QwenSlerp6-14B/9136feb4-5c3e-48b3-bc70-c7816b8b189b.json deleted file mode 100644 index 32bf693fb..000000000 --- a/data/hfopenllm_v2/allknowingroger/QwenSlerp6-14B/9136feb4-5c3e-48b3-bc70-c7816b8b189b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_QwenSlerp6-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp6-14B", - "id": "allknowingroger/QwenSlerp6-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6867 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.469 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/QwenStock1-14B/c395ef02-9a50-4696-aad2-bcb32ba05f67.json b/data/hfopenllm_v2/allknowingroger/QwenStock1-14B/c395ef02-9a50-4696-aad2-bcb32ba05f67.json deleted file mode 100644 index dd2bb81bf..000000000 --- a/data/hfopenllm_v2/allknowingroger/QwenStock1-14B/c395ef02-9a50-4696-aad2-bcb32ba05f67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_QwenStock1-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenStock1-14B", - "id": "allknowingroger/QwenStock1-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5634 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6528 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5418 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/QwenStock2-14B/93f47969-556a-4fd4-b7bb-4d1c861a8d71.json b/data/hfopenllm_v2/allknowingroger/QwenStock2-14B/93f47969-556a-4fd4-b7bb-4d1c861a8d71.json deleted file mode 100644 index 83694e0b6..000000000 --- a/data/hfopenllm_v2/allknowingroger/QwenStock2-14B/93f47969-556a-4fd4-b7bb-4d1c861a8d71.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_QwenStock2-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenStock2-14B", - "id": "allknowingroger/QwenStock2-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5563 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6569 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4756 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/QwenStock3-14B/349ae559-6c1f-4b2f-954c-e83cba1e603a.json b/data/hfopenllm_v2/allknowingroger/QwenStock3-14B/349ae559-6c1f-4b2f-954c-e83cba1e603a.json deleted file mode 100644 index 5d15474d8..000000000 --- a/data/hfopenllm_v2/allknowingroger/QwenStock3-14B/349ae559-6c1f-4b2f-954c-e83cba1e603a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_QwenStock3-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenStock3-14B", - "id": "allknowingroger/QwenStock3-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5615 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6565 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4756 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5428 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwenslerp2-14B/3e43c3f6-645b-4ab3-b684-b23eb67bc5d9.json b/data/hfopenllm_v2/allknowingroger/Qwenslerp2-14B/3e43c3f6-645b-4ab3-b684-b23eb67bc5d9.json deleted file mode 100644 index 74527b77f..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwenslerp2-14B/3e43c3f6-645b-4ab3-b684-b23eb67bc5d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwenslerp2-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenslerp2-14B", - "id": "allknowingroger/Qwenslerp2-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5007 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6555 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4729 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5403 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwenslerp2-7B/500c8cd4-fe4e-44f3-86b7-b0efd387ab92.json b/data/hfopenllm_v2/allknowingroger/Qwenslerp2-7B/500c8cd4-fe4e-44f3-86b7-b0efd387ab92.json deleted file mode 100644 index 0d48394c6..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwenslerp2-7B/500c8cd4-fe4e-44f3-86b7-b0efd387ab92.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwenslerp2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenslerp2-7B", - "id": "allknowingroger/Qwenslerp2-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5294 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5609 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwenslerp3-14B/340a3ebb-bc06-404f-84e7-aeccc016fd32.json b/data/hfopenllm_v2/allknowingroger/Qwenslerp3-14B/340a3ebb-bc06-404f-84e7-aeccc016fd32.json deleted file mode 100644 index 3dedb719e..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwenslerp3-14B/340a3ebb-bc06-404f-84e7-aeccc016fd32.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwenslerp3-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenslerp3-14B", - "id": "allknowingroger/Qwenslerp3-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5052 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6521 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4464 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4676 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwenslerp3-7B/a6426f88-d7cc-4e6a-a2b5-76e59a52a6de.json 
b/data/hfopenllm_v2/allknowingroger/Qwenslerp3-7B/a6426f88-d7cc-4e6a-a2b5-76e59a52a6de.json deleted file mode 100644 index c20870016..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwenslerp3-7B/a6426f88-d7cc-4e6a-a2b5-76e59a52a6de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwenslerp3-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenslerp3-7B", - "id": "allknowingroger/Qwenslerp3-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5018 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.558 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4542 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/ROGERphi-7B-slerp/bdd05c8f-b895-4c91-9a9f-a608a4259cbd.json b/data/hfopenllm_v2/allknowingroger/ROGERphi-7B-slerp/bdd05c8f-b895-4c91-9a9f-a608a4259cbd.json deleted file mode 100644 index 0ef272faf..000000000 --- 
a/data/hfopenllm_v2/allknowingroger/ROGERphi-7B-slerp/bdd05c8f-b895-4c91-9a9f-a608a4259cbd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_ROGERphi-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ROGERphi-7B-slerp", - "id": "allknowingroger/ROGERphi-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3861 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4685 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3053 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/RogerMerge-7B-slerp/0e1e45d4-2747-480d-9b1f-2b200e250271.json b/data/hfopenllm_v2/allknowingroger/RogerMerge-7B-slerp/0e1e45d4-2747-480d-9b1f-2b200e250271.json deleted file mode 100644 index e44fe42b8..000000000 --- a/data/hfopenllm_v2/allknowingroger/RogerMerge-7B-slerp/0e1e45d4-2747-480d-9b1f-2b200e250271.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/allknowingroger_RogerMerge-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RogerMerge-7B-slerp", - "id": "allknowingroger/RogerMerge-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3933 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.303 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Rombos-LLM-V2.5-Qwen-42b/00f3f9ca-ae7d-4e62-9e7e-6bd202dbed59.json b/data/hfopenllm_v2/allknowingroger/Rombos-LLM-V2.5-Qwen-42b/00f3f9ca-ae7d-4e62-9e7e-6bd202dbed59.json deleted file mode 100644 index 213c2250c..000000000 --- a/data/hfopenllm_v2/allknowingroger/Rombos-LLM-V2.5-Qwen-42b/00f3f9ca-ae7d-4e62-9e7e-6bd202dbed59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Rombos-LLM-V2.5-Qwen-42b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-42b", - "id": "allknowingroger/Rombos-LLM-V2.5-Qwen-42b", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 42.516 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Strangecoven-7B-slerp/c9e57ab2-c2a4-4935-b976-4bf24647b777.json b/data/hfopenllm_v2/allknowingroger/Strangecoven-7B-slerp/c9e57ab2-c2a4-4935-b976-4bf24647b777.json deleted file mode 100644 index a4348c83f..000000000 --- a/data/hfopenllm_v2/allknowingroger/Strangecoven-7B-slerp/c9e57ab2-c2a4-4935-b976-4bf24647b777.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Strangecoven-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Strangecoven-7B-slerp", - "id": 
"allknowingroger/Strangecoven-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3746 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5368 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Weirdslerp2-25B/c22436a2-ec60-4220-82b3-123618165eb2.json b/data/hfopenllm_v2/allknowingroger/Weirdslerp2-25B/c22436a2-ec60-4220-82b3-123618165eb2.json deleted file mode 100644 index a8975282d..000000000 --- a/data/hfopenllm_v2/allknowingroger/Weirdslerp2-25B/c22436a2-ec60-4220-82b3-123618165eb2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Weirdslerp2-25B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Weirdslerp2-25B", - "id": "allknowingroger/Weirdslerp2-25B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 25.204 - } 
- }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2874 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/WestlakeMaziyar-7B-slerp/1f990438-dd84-44d2-99f9-a10035ecd652.json b/data/hfopenllm_v2/allknowingroger/WestlakeMaziyar-7B-slerp/1f990438-dd84-44d2-99f9-a10035ecd652.json deleted file mode 100644 index 75ae48ca3..000000000 --- a/data/hfopenllm_v2/allknowingroger/WestlakeMaziyar-7B-slerp/1f990438-dd84-44d2-99f9-a10035ecd652.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_WestlakeMaziyar-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WestlakeMaziyar-7B-slerp", - "id": "allknowingroger/WestlakeMaziyar-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4838 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5245 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3078 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/YamMaths-7B-slerp/f4564f5e-3595-466e-8201-0e2a4c50ff0d.json b/data/hfopenllm_v2/allknowingroger/YamMaths-7B-slerp/f4564f5e-3595-466e-8201-0e2a4c50ff0d.json deleted file mode 100644 index ab0338d1d..000000000 --- a/data/hfopenllm_v2/allknowingroger/YamMaths-7B-slerp/f4564f5e-3595-466e-8201-0e2a4c50ff0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_YamMaths-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "YamMaths-7B-slerp", - "id": "allknowingroger/YamMaths-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - 
{ - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5156 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4384 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Yi-1.5-34B/040def3a-702d-4868-b429-39697ca36207.json b/data/hfopenllm_v2/allknowingroger/Yi-1.5-34B/040def3a-702d-4868-b429-39697ca36207.json deleted file mode 100644 index 73e379495..000000000 --- a/data/hfopenllm_v2/allknowingroger/Yi-1.5-34B/040def3a-702d-4868-b429-39697ca36207.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Yi-1.5-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-34B", - "id": "allknowingroger/Yi-1.5-34B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Yi-blossom-40B/9e24fd65-56ec-4160-b299-b34d702a3231.json b/data/hfopenllm_v2/allknowingroger/Yi-blossom-40B/9e24fd65-56ec-4160-b299-b34d702a3231.json deleted file mode 100644 index 6517b6a6e..000000000 --- a/data/hfopenllm_v2/allknowingroger/Yi-blossom-40B/9e24fd65-56ec-4160-b299-b34d702a3231.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Yi-blossom-40B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-blossom-40B", - "id": "allknowingroger/Yi-blossom-40B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 18.769 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2009 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Yibuddy-35B/216bf9f8-9521-4311-a40b-8a847271265c.json b/data/hfopenllm_v2/allknowingroger/Yibuddy-35B/216bf9f8-9521-4311-a40b-8a847271265c.json deleted file mode 100644 index a8d56323b..000000000 --- a/data/hfopenllm_v2/allknowingroger/Yibuddy-35B/216bf9f8-9521-4311-a40b-8a847271265c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Yibuddy-35B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yibuddy-35B", - "id": "allknowingroger/Yibuddy-35B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5916 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1571 
- } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Yillama-40B/45f8c4fb-3591-44df-a4f0-57093b9bae23.json b/data/hfopenllm_v2/allknowingroger/Yillama-40B/45f8c4fb-3591-44df-a4f0-57093b9bae23.json deleted file mode 100644 index 925316eac..000000000 --- a/data/hfopenllm_v2/allknowingroger/Yillama-40B/45f8c4fb-3591-44df-a4f0-57093b9bae23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Yillama-40B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yillama-40B", - "id": "allknowingroger/Yillama-40B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4063 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1981 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Yislerp-34B/d17275ef-8a32-4fcb-94f4-fb24299ba50e.json b/data/hfopenllm_v2/allknowingroger/Yislerp-34B/d17275ef-8a32-4fcb-94f4-fb24299ba50e.json deleted file mode 100644 index 8c5bc4734..000000000 --- a/data/hfopenllm_v2/allknowingroger/Yislerp-34B/d17275ef-8a32-4fcb-94f4-fb24299ba50e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Yislerp-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yislerp-34B", - "id": "allknowingroger/Yislerp-34B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3692 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6159 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4751 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Yislerp2-34B/61b79e7d-0f50-4cfe-825c-ed5b23d943f3.json b/data/hfopenllm_v2/allknowingroger/Yislerp2-34B/61b79e7d-0f50-4cfe-825c-ed5b23d943f3.json deleted file mode 100644 index 15c44b31d..000000000 --- a/data/hfopenllm_v2/allknowingroger/Yislerp2-34B/61b79e7d-0f50-4cfe-825c-ed5b23d943f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Yislerp2-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yislerp2-34B", - "id": "allknowingroger/Yislerp2-34B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3999 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6246 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2296 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.453 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4724 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Yunconglong-13B-slerp/113c3507-b738-4b06-ada8-da93b19c6ae2.json b/data/hfopenllm_v2/allknowingroger/Yunconglong-13B-slerp/113c3507-b738-4b06-ada8-da93b19c6ae2.json deleted file mode 100644 index ca824d478..000000000 --- a/data/hfopenllm_v2/allknowingroger/Yunconglong-13B-slerp/113c3507-b738-4b06-ada8-da93b19c6ae2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Yunconglong-13B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yunconglong-13B-slerp", - "id": "allknowingroger/Yunconglong-13B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5166 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3036 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/limyClown-7B-slerp/8835d5c1-8350-4d42-a753-82b94dffda3b.json b/data/hfopenllm_v2/allknowingroger/limyClown-7B-slerp/8835d5c1-8350-4d42-a753-82b94dffda3b.json deleted file mode 100644 index b54682c35..000000000 --- a/data/hfopenllm_v2/allknowingroger/limyClown-7B-slerp/8835d5c1-8350-4d42-a753-82b94dffda3b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_limyClown-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "limyClown-7B-slerp", - "id": "allknowingroger/limyClown-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5148 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3038 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/allknowingroger/llama3-Jallabi-40B-s/dc3bbda7-5007-44c7-b1ba-af0c82d100ee.json b/data/hfopenllm_v2/allknowingroger/llama3-Jallabi-40B-s/dc3bbda7-5007-44c7-b1ba-af0c82d100ee.json deleted file mode 100644 index 3d7d9aa70..000000000 --- a/data/hfopenllm_v2/allknowingroger/llama3-Jallabi-40B-s/dc3bbda7-5007-44c7-b1ba-af0c82d100ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_llama3-Jallabi-40B-s/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3-Jallabi-40B-s", - "id": "allknowingroger/llama3-Jallabi-40B-s", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 18.769 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1921 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3252 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2374 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/llama3AnFeng-40B/0d24ee06-a6b4-4be7-b3ef-c4f53b4fc414.json b/data/hfopenllm_v2/allknowingroger/llama3AnFeng-40B/0d24ee06-a6b4-4be7-b3ef-c4f53b4fc414.json deleted file mode 
100644 index ec77ddbf3..000000000 --- a/data/hfopenllm_v2/allknowingroger/llama3AnFeng-40B/0d24ee06-a6b4-4be7-b3ef-c4f53b4fc414.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_llama3AnFeng-40B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3AnFeng-40B", - "id": "allknowingroger/llama3AnFeng-40B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 39.971 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1742 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.198 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/L3.1-8b-RP-Ink/f2415b7a-2cd7-4a05-834b-7da992e1da1a.json b/data/hfopenllm_v2/allura-org/L3.1-8b-RP-Ink/f2415b7a-2cd7-4a05-834b-7da992e1da1a.json deleted file mode 100644 index acab59baa..000000000 --- a/data/hfopenllm_v2/allura-org/L3.1-8b-RP-Ink/f2415b7a-2cd7-4a05-834b-7da992e1da1a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/allura-org_L3.1-8b-RP-Ink/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-8b-RP-Ink", - "id": "allura-org/L3.1-8b-RP-Ink", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7811 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4828 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3608 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3428 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/MN-12b-RP-Ink/01af237f-40d8-4841-a90d-13dce6db8634.json b/data/hfopenllm_v2/allura-org/MN-12b-RP-Ink/01af237f-40d8-4841-a90d-13dce6db8634.json deleted file mode 100644 index 36c98b87c..000000000 --- a/data/hfopenllm_v2/allura-org/MN-12b-RP-Ink/01af237f-40d8-4841-a90d-13dce6db8634.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_MN-12b-RP-Ink/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "MN-12b-RP-Ink", - "id": "allura-org/MN-12b-RP-Ink", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7186 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/MS-Meadowlark-22B/d69bb392-fd38-4f57-b567-24566896167b.json b/data/hfopenllm_v2/allura-org/MS-Meadowlark-22B/d69bb392-fd38-4f57-b567-24566896167b.json deleted file mode 100644 index 242f13b4f..000000000 --- a/data/hfopenllm_v2/allura-org/MS-Meadowlark-22B/d69bb392-fd38-4f57-b567-24566896167b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_MS-Meadowlark-22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MS-Meadowlark-22B", - "id": "allura-org/MS-Meadowlark-22B", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5163 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1835 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3823 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/Mistral-Small-24b-Sertraline-0304/63503943-1c1e-4dac-9c41-4933fbb44b70.json b/data/hfopenllm_v2/allura-org/Mistral-Small-24b-Sertraline-0304/63503943-1c1e-4dac-9c41-4933fbb44b70.json deleted file mode 100644 index a1e249c7d..000000000 --- a/data/hfopenllm_v2/allura-org/Mistral-Small-24b-Sertraline-0304/63503943-1c1e-4dac-9c41-4933fbb44b70.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_Mistral-Small-24b-Sertraline-0304/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-24b-Sertraline-0304", - "id": "allura-org/Mistral-Small-24b-Sertraline-0304", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5106 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/Mistral-Small-Sisyphus-24b-2503/80c5d343-41e6-45d7-8921-62586a3cd270.json b/data/hfopenllm_v2/allura-org/Mistral-Small-Sisyphus-24b-2503/80c5d343-41e6-45d7-8921-62586a3cd270.json deleted file mode 100644 index 7a4d2fa7f..000000000 --- a/data/hfopenllm_v2/allura-org/Mistral-Small-Sisyphus-24b-2503/80c5d343-41e6-45d7-8921-62586a3cd270.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_Mistral-Small-Sisyphus-24b-2503/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-Sisyphus-24b-2503", - "id": "allura-org/Mistral-Small-Sisyphus-24b-2503", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6848 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/MoE-Girl-1BA-7BT/2c27d7f6-60fd-49f3-8666-784f2a16031b.json b/data/hfopenllm_v2/allura-org/MoE-Girl-1BA-7BT/2c27d7f6-60fd-49f3-8666-784f2a16031b.json deleted file mode 100644 index 93ee2f96c..000000000 --- a/data/hfopenllm_v2/allura-org/MoE-Girl-1BA-7BT/2c27d7f6-60fd-49f3-8666-784f2a16031b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_MoE-Girl-1BA-7BT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MoE-Girl-1BA-7BT", - "id": "allura-org/MoE-Girl-1BA-7BT", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "OlmoeForCausalLM", - "params_billions": 6.919 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2705 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3139 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1218 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/TQ2.5-14B-Aletheia-v1/cbcc1e64-8455-4382-8999-654d1757bbd6.json b/data/hfopenllm_v2/allura-org/TQ2.5-14B-Aletheia-v1/cbcc1e64-8455-4382-8999-654d1757bbd6.json deleted file mode 100644 index 200aa0dee..000000000 --- a/data/hfopenllm_v2/allura-org/TQ2.5-14B-Aletheia-v1/cbcc1e64-8455-4382-8999-654d1757bbd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_TQ2.5-14B-Aletheia-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TQ2.5-14B-Aletheia-v1", - "id": "allura-org/TQ2.5-14B-Aletheia-v1", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.6585 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3399 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5241 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/TQ2.5-14B-Neon-v1/1bea4f6b-7a41-4907-baca-430c7ea179e9.json b/data/hfopenllm_v2/allura-org/TQ2.5-14B-Neon-v1/1bea4f6b-7a41-4907-baca-430c7ea179e9.json deleted file mode 100644 index 42bdffdcf..000000000 --- a/data/hfopenllm_v2/allura-org/TQ2.5-14B-Neon-v1/1bea4f6b-7a41-4907-baca-430c7ea179e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_TQ2.5-14B-Neon-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TQ2.5-14B-Neon-v1", - "id": "allura-org/TQ2.5-14B-Neon-v1", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5253 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/Teleut-7b/298ce89b-966c-4f4e-9da5-3803a395188f.json b/data/hfopenllm_v2/allura-org/Teleut-7b/298ce89b-966c-4f4e-9da5-3803a395188f.json deleted file mode 100644 index 9486047d3..000000000 --- a/data/hfopenllm_v2/allura-org/Teleut-7b/298ce89b-966c-4f4e-9da5-3803a395188f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_Teleut-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Teleut-7b", - "id": "allura-org/Teleut-7b", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2409 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.464 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/aloobun/Meta-Llama-3-7B-28Layers/ea27a4d6-8c32-4b36-873d-1046ae6240e5.json b/data/hfopenllm_v2/aloobun/Meta-Llama-3-7B-28Layers/ea27a4d6-8c32-4b36-873d-1046ae6240e5.json deleted file mode 100644 index 1550e1cda..000000000 --- a/data/hfopenllm_v2/aloobun/Meta-Llama-3-7B-28Layers/ea27a4d6-8c32-4b36-873d-1046ae6240e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aloobun_Meta-Llama-3-7B-28Layers/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-7B-28Layers", - "id": "aloobun/Meta-Llama-3-7B-28Layers", - "developer": "aloobun", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.158 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3589 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/aloobun/d-SmolLM2-360M/73d5905d-7825-43ba-8051-7e1f5639b857.json b/data/hfopenllm_v2/aloobun/d-SmolLM2-360M/73d5905d-7825-43ba-8051-7e1f5639b857.json deleted file mode 100644 index 9ae14979c..000000000 --- a/data/hfopenllm_v2/aloobun/d-SmolLM2-360M/73d5905d-7825-43ba-8051-7e1f5639b857.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aloobun_d-SmolLM2-360M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "d-SmolLM2-360M", - "id": "aloobun/d-SmolLM2-360M", - "developer": "aloobun", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2097 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/alpindale/WizardLM-2-8x22B/956b8589-a048-43be-9cfd-05658d3c57ca.json b/data/hfopenllm_v2/alpindale/WizardLM-2-8x22B/956b8589-a048-43be-9cfd-05658d3c57ca.json deleted file mode 100644 index 11db1c919..000000000 --- a/data/hfopenllm_v2/alpindale/WizardLM-2-8x22B/956b8589-a048-43be-9cfd-05658d3c57ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/alpindale_WizardLM-2-8x22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WizardLM-2-8x22B", - "id": "alpindale/WizardLM-2-8x22B", - "developer": "alpindale", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 140.621 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5272 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6377 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/alpindale/magnum-72b-v1/36f597b4-8f53-4b40-9c0e-c9284743e456.json b/data/hfopenllm_v2/alpindale/magnum-72b-v1/36f597b4-8f53-4b40-9c0e-c9284743e456.json deleted file mode 100644 index 6b8bdd31e..000000000 --- a/data/hfopenllm_v2/alpindale/magnum-72b-v1/36f597b4-8f53-4b40-9c0e-c9284743e456.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/alpindale_magnum-72b-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-72b-v1", - "id": "alpindale/magnum-72b-v1", - "developer": "alpindale", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6982 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5468 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/altomek/YiSM-34B-0rn/7b67e526-7588-4c62-9293-55e77851c4c7.json b/data/hfopenllm_v2/altomek/YiSM-34B-0rn/7b67e526-7588-4c62-9293-55e77851c4c7.json deleted file mode 100644 index f2755bff5..000000000 --- a/data/hfopenllm_v2/altomek/YiSM-34B-0rn/7b67e526-7588-4c62-9293-55e77851c4c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/altomek_YiSM-34B-0rn/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "YiSM-34B-0rn", - "id": "altomek/YiSM-34B-0rn", - "developer": "altomek", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4696 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/amazon/MegaBeam-Mistral-7B-300k/8bc96d6d-0cd7-49c4-8112-7d8fb1c45199.json b/data/hfopenllm_v2/amazon/MegaBeam-Mistral-7B-300k/8bc96d6d-0cd7-49c4-8112-7d8fb1c45199.json deleted file mode 100644 index 6587a17ee..000000000 --- 
a/data/hfopenllm_v2/amazon/MegaBeam-Mistral-7B-300k/8bc96d6d-0cd7-49c4-8112-7d8fb1c45199.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/amazon_MegaBeam-Mistral-7B-300k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MegaBeam-Mistral-7B-300k", - "id": "amazon/MegaBeam-Mistral-7B-300k", - "developer": "amazon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2549 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/amd/AMD-Llama-135m/6751a200-0bd9-498e-a991-ebe22375633d.json b/data/hfopenllm_v2/amd/AMD-Llama-135m/6751a200-0bd9-498e-a991-ebe22375633d.json deleted file mode 100644 index c580f04da..000000000 --- a/data/hfopenllm_v2/amd/AMD-Llama-135m/6751a200-0bd9-498e-a991-ebe22375633d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/amd_AMD-Llama-135m/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AMD-Llama-135m", - "id": "amd/AMD-Llama-135m", - "developer": "amd", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.134 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/amd/AMD-Llama-135m/f41442e3-5aa7-4ca4-9e61-a5e13965a3e4.json b/data/hfopenllm_v2/amd/AMD-Llama-135m/f41442e3-5aa7-4ca4-9e61-a5e13965a3e4.json deleted file mode 100644 index ac33d4f61..000000000 --- a/data/hfopenllm_v2/amd/AMD-Llama-135m/f41442e3-5aa7-4ca4-9e61-a5e13965a3e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/amd_AMD-Llama-135m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AMD-Llama-135m", - "id": "amd/AMD-Llama-135m", - "developer": "amd", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1842 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2974 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anakin87/gemma-2b-orpo/b105b62a-ce77-4387-b679-1adf2782b2f4.json b/data/hfopenllm_v2/anakin87/gemma-2b-orpo/b105b62a-ce77-4387-b679-1adf2782b2f4.json deleted file mode 100644 index 83e2b1b5b..000000000 --- a/data/hfopenllm_v2/anakin87/gemma-2b-orpo/b105b62a-ce77-4387-b679-1adf2782b2f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anakin87_gemma-2b-orpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2b-orpo", - "id": "anakin87/gemma-2b-orpo", - "developer": "anakin87", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2478 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3426 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1306 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v1-72b/72180fd7-bf34-4758-b02f-7d11859700c7.json b/data/hfopenllm_v2/anthracite-org/magnum-v1-72b/72180fd7-bf34-4758-b02f-7d11859700c7.json deleted file mode 100644 index 18b204986..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v1-72b/72180fd7-bf34-4758-b02f-7d11859700c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v1-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v1-72b", - "id": "anthracite-org/magnum-v1-72b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.7606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6982 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5486 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v2-12b/ac5aaa9c-79ab-4082-b8c5-084fba3e122a.json b/data/hfopenllm_v2/anthracite-org/magnum-v2-12b/ac5aaa9c-79ab-4082-b8c5-084fba3e122a.json deleted file mode 100644 index 26fb22a26..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v2-12b/ac5aaa9c-79ab-4082-b8c5-084fba3e122a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v2-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v2-12b", - "id": "anthracite-org/magnum-v2-12b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v2-72b/2d266d7f-8edd-40fd-adfc-597a7742167b.json b/data/hfopenllm_v2/anthracite-org/magnum-v2-72b/2d266d7f-8edd-40fd-adfc-597a7742167b.json deleted file mode 100644 index ea532bb60..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v2-72b/2d266d7f-8edd-40fd-adfc-597a7742167b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v2-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v2-72b", - "id": "anthracite-org/magnum-v2-72b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7005 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", 
- "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5456 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v2.5-12b-kto/484ccbf2-87e2-423f-9de4-a4bd54291b54.json b/data/hfopenllm_v2/anthracite-org/magnum-v2.5-12b-kto/484ccbf2-87e2-423f-9de4-a4bd54291b54.json deleted file mode 100644 index d05197a33..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v2.5-12b-kto/484ccbf2-87e2-423f-9de4-a4bd54291b54.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v2.5-12b-kto/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v2.5-12b-kto", - "id": "anthracite-org/magnum-v2.5-12b-kto", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5077 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3215 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v3-27b-kto/4de79504-f9e8-4235-9aad-d38f0799e081.json b/data/hfopenllm_v2/anthracite-org/magnum-v3-27b-kto/4de79504-f9e8-4235-9aad-d38f0799e081.json deleted file mode 100644 index 81b95e27b..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v3-27b-kto/4de79504-f9e8-4235-9aad-d38f0799e081.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v3-27b-kto/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v3-27b-kto", - "id": "anthracite-org/magnum-v3-27b-kto", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5675 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3855 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4238 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v3-34b/b4bde9d8-f50c-448c-ada4-5bc05f302c04.json b/data/hfopenllm_v2/anthracite-org/magnum-v3-34b/b4bde9d8-f50c-448c-ada4-5bc05f302c04.json deleted file mode 100644 index 84369bf89..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v3-34b/b4bde9d8-f50c-448c-ada4-5bc05f302c04.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v3-34b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v3-34b", - "id": "anthracite-org/magnum-v3-34b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5115 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6088 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v3-9b-chatml/5da3240b-b5e3-4333-ba61-925343b56043.json b/data/hfopenllm_v2/anthracite-org/magnum-v3-9b-chatml/5da3240b-b5e3-4333-ba61-925343b56043.json deleted file mode 100644 index 48d20512c..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v3-9b-chatml/5da3240b-b5e3-4333-ba61-925343b56043.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v3-9b-chatml/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v3-9b-chatml", - "id": "anthracite-org/magnum-v3-9b-chatml", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5428 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v3-9b-customgemma2/d6727b7d-cdf3-48d5-8e30-484e86ad60b6.json b/data/hfopenllm_v2/anthracite-org/magnum-v3-9b-customgemma2/d6727b7d-cdf3-48d5-8e30-484e86ad60b6.json deleted file mode 100644 index 08eeb4ea5..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v3-9b-customgemma2/d6727b7d-cdf3-48d5-8e30-484e86ad60b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v3-9b-customgemma2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v3-9b-customgemma2", - "id": "anthracite-org/magnum-v3-9b-customgemma2", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v4-12b/15b86bbf-8d3b-474b-98f0-abb3972a7271.json b/data/hfopenllm_v2/anthracite-org/magnum-v4-12b/15b86bbf-8d3b-474b-98f0-abb3972a7271.json deleted file mode 100644 index 98e18e6b7..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v4-12b/15b86bbf-8d3b-474b-98f0-abb3972a7271.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v4-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v4-12b", - "id": "anthracite-org/magnum-v4-12b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4093 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v4-22b/c0b339f6-4a46-46eb-b2d0-945176afe676.json b/data/hfopenllm_v2/anthracite-org/magnum-v4-22b/c0b339f6-4a46-46eb-b2d0-945176afe676.json deleted file mode 100644 index 8cce65095..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v4-22b/c0b339f6-4a46-46eb-b2d0-945176afe676.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v4-22b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v4-22b", - "id": "anthracite-org/magnum-v4-22b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5629 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5486 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2002 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v4-27b/79367289-6245-4bf0-99e9-42bc3ff7649c.json 
b/data/hfopenllm_v2/anthracite-org/magnum-v4-27b/79367289-6245-4bf0-99e9-42bc3ff7649c.json deleted file mode 100644 index fd3218df0..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v4-27b/79367289-6245-4bf0-99e9-42bc3ff7649c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v4-27b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v4-27b", - "id": "anthracite-org/magnum-v4-27b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5867 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v4-9b/c3ec5505-1086-446a-9739-523810e93d13.json b/data/hfopenllm_v2/anthracite-org/magnum-v4-9b/c3ec5505-1086-446a-9739-523810e93d13.json deleted file mode 100644 index d1166e7fe..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v4-9b/c3ec5505-1086-446a-9739-523810e93d13.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v4-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v4-9b", - "id": "anthracite-org/magnum-v4-9b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3503 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5336 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3953 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/apple/DCLM-7B/c6c5e462-d373-4536-afc3-b740fb7e300f.json b/data/hfopenllm_v2/apple/DCLM-7B/c6c5e462-d373-4536-afc3-b740fb7e300f.json deleted file mode 100644 index 92760e54b..000000000 --- a/data/hfopenllm_v2/apple/DCLM-7B/c6c5e462-d373-4536-afc3-b740fb7e300f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/apple_DCLM-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DCLM-7B", - "id": "apple/DCLM-7B", - "developer": "apple", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "OpenLMModel", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2173 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3921 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/appvoid/arco-2-instruct/b7537abe-8177-4206-999f-5bb7e95c72c8.json b/data/hfopenllm_v2/appvoid/arco-2-instruct/b7537abe-8177-4206-999f-5bb7e95c72c8.json deleted file mode 100644 index c53efb7dc..000000000 --- a/data/hfopenllm_v2/appvoid/arco-2-instruct/b7537abe-8177-4206-999f-5bb7e95c72c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/appvoid_arco-2-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "arco-2-instruct", - "id": "appvoid/arco-2-instruct", - "developer": "appvoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - 
}, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2164 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3133 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/appvoid/arco-2/eb2f6159-e37e-46db-9419-6a66cb7e539e.json b/data/hfopenllm_v2/appvoid/arco-2/eb2f6159-e37e-46db-9419-6a66cb7e539e.json deleted file mode 100644 index a19063325..000000000 --- a/data/hfopenllm_v2/appvoid/arco-2/eb2f6159-e37e-46db-9419-6a66cb7e539e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/appvoid_arco-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "arco-2", - "id": "appvoid/arco-2", - "developer": "appvoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1991 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3536 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Arcee-Blitz/0b2d0a06-2907-4258-be33-1591e18ac6a2.json b/data/hfopenllm_v2/arcee-ai/Arcee-Blitz/0b2d0a06-2907-4258-be33-1591e18ac6a2.json deleted file mode 100644 index 8851d8d7b..000000000 --- a/data/hfopenllm_v2/arcee-ai/Arcee-Blitz/0b2d0a06-2907-4258-be33-1591e18ac6a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Arcee-Blitz/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arcee-Blitz", - "id": "arcee-ai/Arcee-Blitz", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5543 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6154 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Arcee-Maestro-7B-Preview/0284d867-45c4-4fe4-883c-8e3ea169d66c.json b/data/hfopenllm_v2/arcee-ai/Arcee-Maestro-7B-Preview/0284d867-45c4-4fe4-883c-8e3ea169d66c.json deleted file mode 100644 index 9080738a7..000000000 --- a/data/hfopenllm_v2/arcee-ai/Arcee-Maestro-7B-Preview/0284d867-45c4-4fe4-883c-8e3ea169d66c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Arcee-Maestro-7B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arcee-Maestro-7B-Preview", - "id": "arcee-ai/Arcee-Maestro-7B-Preview", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4648 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": 
"MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3039 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Arcee-Nova/1a2da513-104e-4074-b3b7-601ab11bf6d8.json b/data/hfopenllm_v2/arcee-ai/Arcee-Nova/1a2da513-104e-4074-b3b7-601ab11bf6d8.json deleted file mode 100644 index ec647a01e..000000000 --- a/data/hfopenllm_v2/arcee-ai/Arcee-Nova/1a2da513-104e-4074-b3b7-601ab11bf6d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Arcee-Nova/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arcee-Nova", - "id": "arcee-ai/Arcee-Nova", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7907 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6942 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4381 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5452 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Arcee-Spark/189db16b-5e78-439f-9f79-6eec979c3a79.json b/data/hfopenllm_v2/arcee-ai/Arcee-Spark/189db16b-5e78-439f-9f79-6eec979c3a79.json deleted file mode 100644 index 38ed0acf4..000000000 --- a/data/hfopenllm_v2/arcee-ai/Arcee-Spark/189db16b-5e78-439f-9f79-6eec979c3a79.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Arcee-Spark/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arcee-Spark", - "id": "arcee-ai/Arcee-Spark", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3822 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Arcee-Spark/d751f1c5-5505-4c12-8d51-091538b49949.json b/data/hfopenllm_v2/arcee-ai/Arcee-Spark/d751f1c5-5505-4c12-8d51-091538b49949.json deleted file mode 100644 index f4c610574..000000000 --- a/data/hfopenllm_v2/arcee-ai/Arcee-Spark/d751f1c5-5505-4c12-8d51-091538b49949.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Arcee-Spark/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arcee-Spark", - "id": "arcee-ai/Arcee-Spark", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5718 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4008 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3813 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Llama-3.1-SuperNova-Lite/b6f9144f-57a0-4c18-9e52-ffccf2d8ca9c.json b/data/hfopenllm_v2/arcee-ai/Llama-3.1-SuperNova-Lite/b6f9144f-57a0-4c18-9e52-ffccf2d8ca9c.json deleted file mode 100644 index 894a616bf..000000000 --- a/data/hfopenllm_v2/arcee-ai/Llama-3.1-SuperNova-Lite/b6f9144f-57a0-4c18-9e52-ffccf2d8ca9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Llama-3.1-SuperNova-Lite/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-SuperNova-Lite", - "id": "arcee-ai/Llama-3.1-SuperNova-Lite", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5152 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4163 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3877 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Llama-Spark/67dc7fb2-1455-4f60-9dcb-59a8197741d7.json b/data/hfopenllm_v2/arcee-ai/Llama-Spark/67dc7fb2-1455-4f60-9dcb-59a8197741d7.json deleted file mode 100644 index ffa60c546..000000000 --- a/data/hfopenllm_v2/arcee-ai/Llama-Spark/67dc7fb2-1455-4f60-9dcb-59a8197741d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Llama-Spark/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Spark", - "id": "arcee-ai/Llama-Spark", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7911 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3721 - } - } 
- ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/SuperNova-Medius/7f4ab590-29fa-473a-b617-00135dd1d6ee.json b/data/hfopenllm_v2/arcee-ai/SuperNova-Medius/7f4ab590-29fa-473a-b617-00135dd1d6ee.json deleted file mode 100644 index 4067be61a..000000000 --- a/data/hfopenllm_v2/arcee-ai/SuperNova-Medius/7f4ab590-29fa-473a-b617-00135dd1d6ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_SuperNova-Medius/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SuperNova-Medius", - "id": "arcee-ai/SuperNova-Medius", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7184 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6377 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.469 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5035 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Virtuoso-Lite/d67db62e-e21d-43c8-8b4c-bfa353e47636.json b/data/hfopenllm_v2/arcee-ai/Virtuoso-Lite/d67db62e-e21d-43c8-8b4c-bfa353e47636.json deleted file mode 100644 index e33d0d23e..000000000 --- 
a/data/hfopenllm_v2/arcee-ai/Virtuoso-Lite/d67db62e-e21d-43c8-8b4c-bfa353e47636.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Virtuoso-Lite/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Virtuoso-Lite", - "id": "arcee-ai/Virtuoso-Lite", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6099 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Virtuoso-Small-v2/85abff46-8ae5-4a75-9522-721793224363.json b/data/hfopenllm_v2/arcee-ai/Virtuoso-Small-v2/85abff46-8ae5-4a75-9522-721793224363.json deleted file mode 100644 index 5a503f2da..000000000 --- a/data/hfopenllm_v2/arcee-ai/Virtuoso-Small-v2/85abff46-8ae5-4a75-9522-721793224363.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Virtuoso-Small-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Virtuoso-Small-v2", - "id": "arcee-ai/Virtuoso-Small-v2", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6554 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.466 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5188 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Virtuoso-Small/1736bbd8-4457-4d55-8c0b-0ae6e001ee62.json b/data/hfopenllm_v2/arcee-ai/Virtuoso-Small/1736bbd8-4457-4d55-8c0b-0ae6e001ee62.json deleted file mode 100644 index 4e3877887..000000000 --- a/data/hfopenllm_v2/arcee-ai/Virtuoso-Small/1736bbd8-4457-4d55-8c0b-0ae6e001ee62.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Virtuoso-Small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Virtuoso-Small", - "id": "arcee-ai/Virtuoso-Small", - 
"developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7935 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6518 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4094 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/raspberry-3B/4777e427-8d17-4e06-8cbf-0883c95bbfd8.json b/data/hfopenllm_v2/arcee-ai/raspberry-3B/4777e427-8d17-4e06-8cbf-0883c95bbfd8.json deleted file mode 100644 index e1c6d4dab..000000000 --- a/data/hfopenllm_v2/arcee-ai/raspberry-3B/4777e427-8d17-4e06-8cbf-0883c95bbfd8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_raspberry-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "raspberry-3B", - "id": "arcee-ai/raspberry-3B", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", 
- "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4269 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2854 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/argilla-warehouse/Llama-3.1-8B-MagPie-Ultra/4df0b890-d4c5-408e-8994-88f7383e9235.json b/data/hfopenllm_v2/argilla-warehouse/Llama-3.1-8B-MagPie-Ultra/4df0b890-d4c5-408e-8994-88f7383e9235.json deleted file mode 100644 index c9896a281..000000000 --- a/data/hfopenllm_v2/argilla-warehouse/Llama-3.1-8B-MagPie-Ultra/4df0b890-d4c5-408e-8994-88f7383e9235.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/argilla-warehouse_Llama-3.1-8B-MagPie-Ultra/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-MagPie-Ultra", - "id": "argilla-warehouse/Llama-3.1-8B-MagPie-Ultra", - "developer": "argilla-warehouse", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3543 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/argilla/notus-7b-v1/76a5a59d-f5fd-4fb0-849e-7db7772b555a.json b/data/hfopenllm_v2/argilla/notus-7b-v1/76a5a59d-f5fd-4fb0-849e-7db7772b555a.json deleted file mode 100644 index d4480efe6..000000000 --- a/data/hfopenllm_v2/argilla/notus-7b-v1/76a5a59d-f5fd-4fb0-849e-7db7772b555a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/argilla_notus-7b-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "notus-7b-v1", - "id": "argilla/notus-7b-v1", - "developer": "argilla", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/argilla/notux-8x7b-v1/6c8399d0-01ce-45cb-a20f-a49e4e760a1e.json b/data/hfopenllm_v2/argilla/notux-8x7b-v1/6c8399d0-01ce-45cb-a20f-a49e4e760a1e.json deleted file mode 100644 index 8c4516521..000000000 --- a/data/hfopenllm_v2/argilla/notux-8x7b-v1/6c8399d0-01ce-45cb-a20f-a49e4e760a1e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/argilla_notux-8x7b-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "notux-8x7b-v1", - "id": "argilla/notux-8x7b-v1", - "developer": "argilla", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4176 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arisin/orca-platypus-13B-slerp/92c2c5ee-dfa2-4db3-8401-887d02cc21dd.json b/data/hfopenllm_v2/arisin/orca-platypus-13B-slerp/92c2c5ee-dfa2-4db3-8401-887d02cc21dd.json deleted file mode 100644 index 593200335..000000000 --- a/data/hfopenllm_v2/arisin/orca-platypus-13B-slerp/92c2c5ee-dfa2-4db3-8401-887d02cc21dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arisin_orca-platypus-13B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca-platypus-13B-slerp", - "id": "arisin/orca-platypus-13B-slerp", - "developer": "arisin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2672 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4631 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arshiaafshani/Arsh-V1/b40ef568-f277-4d5c-87cd-53feaa71598b.json b/data/hfopenllm_v2/arshiaafshani/Arsh-V1/b40ef568-f277-4d5c-87cd-53feaa71598b.json deleted file mode 100644 index 9bb389c7b..000000000 --- a/data/hfopenllm_v2/arshiaafshani/Arsh-V1/b40ef568-f277-4d5c-87cd-53feaa71598b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arshiaafshani_Arsh-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arsh-V1", - "id": "arshiaafshani/Arsh-V1", - "developer": "arshiaafshani", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2621 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4899 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5257 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/asharsha30/LLAMA_Harsha_8_B_ORDP_10k/893d5149-c535-41c7-8a1a-26bb6b33e407.json b/data/hfopenllm_v2/asharsha30/LLAMA_Harsha_8_B_ORDP_10k/893d5149-c535-41c7-8a1a-26bb6b33e407.json deleted file mode 100644 index 1530afe24..000000000 --- a/data/hfopenllm_v2/asharsha30/LLAMA_Harsha_8_B_ORDP_10k/893d5149-c535-41c7-8a1a-26bb6b33e407.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/asharsha30_LLAMA_Harsha_8_B_ORDP_10k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLAMA_Harsha_8_B_ORDP_10k", - "id": "asharsha30/LLAMA_Harsha_8_B_ORDP_10k", - "developer": "asharsha30", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3464 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4669 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ashercn97/a1-v0.0.1/0b649ed5-5af4-4910-b853-2408e3b58f1f.json b/data/hfopenllm_v2/ashercn97/a1-v0.0.1/0b649ed5-5af4-4910-b853-2408e3b58f1f.json deleted file mode 100644 index d679dcf37..000000000 --- a/data/hfopenllm_v2/ashercn97/a1-v0.0.1/0b649ed5-5af4-4910-b853-2408e3b58f1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ashercn97_a1-v0.0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "a1-v0.0.1", - "id": "ashercn97/a1-v0.0.1", - "developer": "ashercn97", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5188 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412 - } - }, - 
{ - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4165 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ashercn97/a1-v002/5c8edeba-5c65-4168-b67e-02143acbcafb.json b/data/hfopenllm_v2/ashercn97/a1-v002/5c8edeba-5c65-4168-b67e-02143acbcafb.json deleted file mode 100644 index ae1d8850d..000000000 --- a/data/hfopenllm_v2/ashercn97/a1-v002/5c8edeba-5c65-4168-b67e-02143acbcafb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ashercn97_a1-v002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "a1-v002", - "id": "ashercn97/a1-v002", - "developer": "ashercn97", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5261 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2341 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4159 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.4175 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/assskelad/smollm2-360M-sft_SmallThoughts/67e657ef-d602-4f58-b898-874a22f4a009.json b/data/hfopenllm_v2/assskelad/smollm2-360M-sft_SmallThoughts/67e657ef-d602-4f58-b898-874a22f4a009.json deleted file mode 100644 index 72305cf3d..000000000 --- a/data/hfopenllm_v2/assskelad/smollm2-360M-sft_SmallThoughts/67e657ef-d602-4f58-b898-874a22f4a009.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/assskelad_smollm2-360M-sft_SmallThoughts/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-360M-sft_SmallThoughts", - "id": "assskelad/smollm2-360M-sft_SmallThoughts", - "developer": "assskelad", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2007 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1182 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/athirdpath/Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit/53d2bf07-689a-4e69-a534-b288313c8481.json b/data/hfopenllm_v2/athirdpath/Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit/53d2bf07-689a-4e69-a534-b288313c8481.json deleted file mode 100644 index f4805d33d..000000000 --- a/data/hfopenllm_v2/athirdpath/Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit/53d2bf07-689a-4e69-a534-b288313c8481.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/athirdpath_Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit", - "id": "athirdpath/Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit", - "developer": "athirdpath", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4521 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4939 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3864 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/automerger/YamshadowExperiment28-7B/34d6a184-d4d5-4609-8305-c0e2ee1c585b.json b/data/hfopenllm_v2/automerger/YamshadowExperiment28-7B/34d6a184-d4d5-4609-8305-c0e2ee1c585b.json deleted file mode 100644 index 2b426dfef..000000000 --- a/data/hfopenllm_v2/automerger/YamshadowExperiment28-7B/34d6a184-d4d5-4609-8305-c0e2ee1c585b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/automerger_YamshadowExperiment28-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "YamshadowExperiment28-7B", - "id": "automerger/YamshadowExperiment28-7B", - "developer": "automerger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4306 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/avemio/GRAG-NEMO-12B-ORPO-HESSIAN-AI/39b627ab-3e64-42f7-a88d-abe5764fcf4d.json b/data/hfopenllm_v2/avemio/GRAG-NEMO-12B-ORPO-HESSIAN-AI/39b627ab-3e64-42f7-a88d-abe5764fcf4d.json deleted file 
mode 100644 index a782f820f..000000000 --- a/data/hfopenllm_v2/avemio/GRAG-NEMO-12B-ORPO-HESSIAN-AI/39b627ab-3e64-42f7-a88d-abe5764fcf4d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/avemio_GRAG-NEMO-12B-ORPO-HESSIAN-AI/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GRAG-NEMO-12B-ORPO-HESSIAN-AI", - "id": "avemio/GRAG-NEMO-12B-ORPO-HESSIAN-AI", - "developer": "avemio", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1061 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-2/d8467b15-8a03-4cde-9fc5-5c08bdabb6c6.json b/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-2/d8467b15-8a03-4cde-9fc5-5c08bdabb6c6.json deleted file mode 100644 index edcb076d2..000000000 --- a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-2/d8467b15-8a03-4cde-9fc5-5c08bdabb6c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/awnr_Mistral-7B-v0.1-signtensors-1-over-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.1-signtensors-1-over-2", - "id": "awnr/Mistral-7B-v0.1-signtensors-1-over-2", - "developer": "awnr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2179 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4423 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4006 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-4/85bc5976-0d40-4416-bbf8-9b1dbf372343.json b/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-4/85bc5976-0d40-4416-bbf8-9b1dbf372343.json deleted file mode 100644 index cddb44dc5..000000000 --- a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-4/85bc5976-0d40-4416-bbf8-9b1dbf372343.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/awnr_Mistral-7B-v0.1-signtensors-1-over-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.1-signtensors-1-over-4", - "id": "awnr/Mistral-7B-v0.1-signtensors-1-over-4", - "developer": "awnr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2133 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-3-over-8/8c7e8e64-672e-4c7e-a808-a49f1792d3a8.json b/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-3-over-8/8c7e8e64-672e-4c7e-a808-a49f1792d3a8.json deleted file mode 100644 index d4c3cdd30..000000000 --- a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-3-over-8/8c7e8e64-672e-4c7e-a808-a49f1792d3a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/awnr_Mistral-7B-v0.1-signtensors-3-over-8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.1-signtensors-3-over-8", - "id": "awnr/Mistral-7B-v0.1-signtensors-3-over-8", - "developer": "awnr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2394 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3001 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-5-over-16/de8651eb-16d1-46ee-a1df-b8c72caaf205.json b/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-5-over-16/de8651eb-16d1-46ee-a1df-b8c72caaf205.json deleted file mode 100644 index cde7a7057..000000000 --- a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-5-over-16/de8651eb-16d1-46ee-a1df-b8c72caaf205.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/awnr_Mistral-7B-v0.1-signtensors-5-over-16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.1-signtensors-5-over-16", - "id": "awnr/Mistral-7B-v0.1-signtensors-5-over-16", - "developer": "awnr", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2118 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2958 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-7-over-16/6a744db8-814f-4e8e-b6e5-0d096267dfa5.json b/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-7-over-16/6a744db8-814f-4e8e-b6e5-0d096267dfa5.json deleted file mode 100644 index 7819c9d49..000000000 --- a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-7-over-16/6a744db8-814f-4e8e-b6e5-0d096267dfa5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/awnr_Mistral-7B-v0.1-signtensors-7-over-16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.1-signtensors-7-over-16", - "id": "awnr/Mistral-7B-v0.1-signtensors-7-over-16", - "developer": "awnr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2294 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3952 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.303 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/aws-prototyping/MegaBeam-Mistral-7B-512k/028b7c37-770e-4356-a7c6-0cc74650d5fd.json b/data/hfopenllm_v2/aws-prototyping/MegaBeam-Mistral-7B-512k/028b7c37-770e-4356-a7c6-0cc74650d5fd.json deleted file mode 100644 index 8bff123e1..000000000 --- a/data/hfopenllm_v2/aws-prototyping/MegaBeam-Mistral-7B-512k/028b7c37-770e-4356-a7c6-0cc74650d5fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aws-prototyping_MegaBeam-Mistral-7B-512k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MegaBeam-Mistral-7B-512k", - "id": "aws-prototyping/MegaBeam-Mistral-7B-512k", - "developer": "aws-prototyping", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5973 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3662 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/axolotl-ai-co/romulus-mistral-nemo-12b-simpo/3b399c64-922a-48ba-9a25-862102749647.json b/data/hfopenllm_v2/axolotl-ai-co/romulus-mistral-nemo-12b-simpo/3b399c64-922a-48ba-9a25-862102749647.json deleted file mode 100644 index fa7e408a7..000000000 --- a/data/hfopenllm_v2/axolotl-ai-co/romulus-mistral-nemo-12b-simpo/3b399c64-922a-48ba-9a25-862102749647.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/axolotl-ai-co_romulus-mistral-nemo-12b-simpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "romulus-mistral-nemo-12b-simpo", - "id": "axolotl-ai-co/romulus-mistral-nemo-12b-simpo", - "developer": "axolotl-ai-co", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.6079 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3469 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/baconnier/Napoleon_24B_V0.0/d5e46a11-3e81-457d-9d26-9fd17f96f076.json b/data/hfopenllm_v2/baconnier/Napoleon_24B_V0.0/d5e46a11-3e81-457d-9d26-9fd17f96f076.json deleted file mode 100644 index e7ee5d6e9..000000000 --- a/data/hfopenllm_v2/baconnier/Napoleon_24B_V0.0/d5e46a11-3e81-457d-9d26-9fd17f96f076.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/baconnier_Napoleon_24B_V0.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Napoleon_24B_V0.0", - "id": "baconnier/Napoleon_24B_V0.0", - "developer": "baconnier", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1801 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6367 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.442 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.504 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/baconnier/Napoleon_24B_V0.2/b3abfbc1-911a-43b7-a338-efb25f746f9d.json b/data/hfopenllm_v2/baconnier/Napoleon_24B_V0.2/b3abfbc1-911a-43b7-a338-efb25f746f9d.json deleted file mode 100644 index d0b53963a..000000000 --- a/data/hfopenllm_v2/baconnier/Napoleon_24B_V0.2/b3abfbc1-911a-43b7-a338-efb25f746f9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/baconnier_Napoleon_24B_V0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Napoleon_24B_V0.2", - "id": "baconnier/Napoleon_24B_V0.2", - "developer": "baconnier", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5911 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/baebee/7B-Cetacea/6b471ee0-9444-45ff-92cf-da624aa59bf6.json b/data/hfopenllm_v2/baebee/7B-Cetacea/6b471ee0-9444-45ff-92cf-da624aa59bf6.json deleted file mode 100644 index a0e865f81..000000000 --- a/data/hfopenllm_v2/baebee/7B-Cetacea/6b471ee0-9444-45ff-92cf-da624aa59bf6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/baebee_7B-Cetacea/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "7B-Cetacea", - "id": "baebee/7B-Cetacea", - "developer": "baebee", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5279 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2955 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/baebee/mergekit-model_stock-nzjnheg/b56bd924-0a63-4ca2-8f2f-97b581e47a36.json b/data/hfopenllm_v2/baebee/mergekit-model_stock-nzjnheg/b56bd924-0a63-4ca2-8f2f-97b581e47a36.json deleted file mode 100644 index bee951289..000000000 --- a/data/hfopenllm_v2/baebee/mergekit-model_stock-nzjnheg/b56bd924-0a63-4ca2-8f2f-97b581e47a36.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/baebee_mergekit-model_stock-nzjnheg/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-model_stock-nzjnheg", - "id": "baebee/mergekit-model_stock-nzjnheg", - "developer": "baebee", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4844 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1677 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/baebee/mergekit-ties-fnjenli/bfe9098d-7207-4f8c-9a3f-549a29303b5f.json b/data/hfopenllm_v2/baebee/mergekit-ties-fnjenli/bfe9098d-7207-4f8c-9a3f-549a29303b5f.json deleted file mode 100644 index 73adf9632..000000000 --- a/data/hfopenllm_v2/baebee/mergekit-ties-fnjenli/bfe9098d-7207-4f8c-9a3f-549a29303b5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/baebee_mergekit-ties-fnjenli/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-ties-fnjenli", - "id": "baebee/mergekit-ties-fnjenli", - "developer": "baebee", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1988 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3024 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.1v/7856172d-ec3e-4e71-befe-54952478e330.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.1v/7856172d-ec3e-4e71-befe-54952478e330.json deleted file mode 100644 index 8956821d3..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.1v/7856172d-ec3e-4e71-befe-54952478e330.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_0.1v/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_0.1v", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.1v", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5436 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4132 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.2v/a68aada5-61bd-4a4c-a8e1-b9a2ace349df.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.2v/a68aada5-61bd-4a4c-a8e1-b9a2ace349df.json deleted file mode 100644 index 4b040ddba..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.2v/a68aada5-61bd-4a4c-a8e1-b9a2ace349df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_0.2v/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_0.2v", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.2v", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5434 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.3v/9d19c44f-4912-4c95-ab3f-2dddb055d932.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.3v/9d19c44f-4912-4c95-ab3f-2dddb055d932.json deleted file mode 100644 index f38472040..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.3v/9d19c44f-4912-4c95-ab3f-2dddb055d932.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_0.3v/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_0.3v", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.3v", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.4v/6cef3550-27d7-4073-b4bb-0f19a2c5f553.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.4v/6cef3550-27d7-4073-b4bb-0f19a2c5f553.json deleted file mode 100644 index ac3a04ead..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.4v/6cef3550-27d7-4073-b4bb-0f19a2c5f553.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_0.4v/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_0.4v", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.4v", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6508 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4176 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.5v/08ab8f6a-9aaf-4ab4-ada3-eb4a75f46995.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.5v/08ab8f6a-9aaf-4ab4-ada3-eb4a75f46995.json deleted file mode 100644 index 95a75b4b9..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.5v/08ab8f6a-9aaf-4ab4-ada3-eb4a75f46995.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_0.5v/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_0.5v", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.5v", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3746 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4132 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.6v/622f9379-6a30-43ba-a7a8-fbd08c484fa5.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.6v/622f9379-6a30-43ba-a7a8-fbd08c484fa5.json deleted file 
mode 100644 index a048d396f..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.6v/622f9379-6a30-43ba-a7a8-fbd08c484fa5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_0.6v/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_0.6v", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.6v", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1254 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3662 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_IV_V/24f728e6-de5e-44cc-8b6d-51e0065c1475.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_IV_V/24f728e6-de5e-44cc-8b6d-51e0065c1475.json deleted file mode 100644 index 5db8c43d5..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_IV_V/24f728e6-de5e-44cc-8b6d-51e0065c1475.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_III_IV_V/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_III_IV_V", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_III_IV_V", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_ex_V/c3b2bf18-d355-40fc-a862-376c1b988305.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_ex_V/c3b2bf18-d355-40fc-a862-376c1b988305.json deleted file mode 100644 index 33f7812de..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_ex_V/c3b2bf18-d355-40fc-a862-376c1b988305.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_III_ex_V/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_III_ex_V", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_III_ex_V", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_Neo/79474be5-2587-4087-a2cc-1337e3b696dd.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_Neo/79474be5-2587-4087-a2cc-1337e3b696dd.json deleted file mode 100644 index ab7a30fec..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_Neo/79474be5-2587-4087-a2cc-1337e3b696dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_Neo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"MISCHIEVOUS-12B-Mix_Neo", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_Neo", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5078 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3685 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B/22ff2700-70c0-459e-96a2-0ce1710947bc.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B/22ff2700-70c0-459e-96a2-0ce1710947bc.json deleted file mode 100644 index 2f6482099..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B/22ff2700-70c0-459e-96a2-0ce1710947bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B", - "id": "bamec66557/MISCHIEVOUS-12B", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - 
}, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5405 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/Mistral-Nemo-VICIOUS_MESH-12B-2407/7d3a47a3-83d3-4f51-ab72-6a2fa5b5ef80.json b/data/hfopenllm_v2/bamec66557/Mistral-Nemo-VICIOUS_MESH-12B-2407/7d3a47a3-83d3-4f51-ab72-6a2fa5b5ef80.json deleted file mode 100644 index d39a7d3db..000000000 --- a/data/hfopenllm_v2/bamec66557/Mistral-Nemo-VICIOUS_MESH-12B-2407/7d3a47a3-83d3-4f51-ab72-6a2fa5b5ef80.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_Mistral-Nemo-VICIOUS_MESH-12B-2407/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-VICIOUS_MESH-12B-2407", - "id": "bamec66557/Mistral-Nemo-VICIOUS_MESH-12B-2407", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6706 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5156 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3677 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/NameLess-12B-prob/69dc0f8e-16d7-4907-9741-484eafa62b8c.json b/data/hfopenllm_v2/bamec66557/NameLess-12B-prob/69dc0f8e-16d7-4907-9741-484eafa62b8c.json deleted file mode 100644 index e2704ecca..000000000 --- a/data/hfopenllm_v2/bamec66557/NameLess-12B-prob/69dc0f8e-16d7-4907-9741-484eafa62b8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_NameLess-12B-prob/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NameLess-12B-prob", - "id": "bamec66557/NameLess-12B-prob", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6602 - } - 
}, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5158 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4336 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.1v/e516abc1-9c3c-4921-a385-e2533d45fed3.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.1v/e516abc1-9c3c-4921-a385-e2533d45fed3.json deleted file mode 100644 index 6ba84e23e..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.1v/e516abc1-9c3c-4921-a385-e2533d45fed3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-0.1v/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-0.1v", - "id": "bamec66557/VICIOUS_MESH-12B-0.1v", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3657 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.X.ver/8baa5832-cc07-4a31-a815-0e8151426ea6.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.X.ver/8baa5832-cc07-4a31-a815-0e8151426ea6.json deleted file mode 100644 index cd846c308..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.X.ver/8baa5832-cc07-4a31-a815-0e8151426ea6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-0.X.ver/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-0.X.ver", - "id": "bamec66557/VICIOUS_MESH-12B-0.X.ver", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-ALPHA/509fbca4-f405-4c27-85a9-1eea59025070.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-ALPHA/509fbca4-f405-4c27-85a9-1eea59025070.json deleted file mode 100644 index 98b8dfc2e..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-ALPHA/509fbca4-f405-4c27-85a9-1eea59025070.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-ALPHA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-ALPHA", - "id": "bamec66557/VICIOUS_MESH-12B-ALPHA", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6365 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-BETA/6f45ed56-6bec-4439-9adb-e79fcd74667c.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-BETA/6f45ed56-6bec-4439-9adb-e79fcd74667c.json deleted file mode 100644 index 5e63ad14b..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-BETA/6f45ed56-6bec-4439-9adb-e79fcd74667c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-BETA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-BETA", - "id": "bamec66557/VICIOUS_MESH-12B-BETA", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5156 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DELTA/512ff924-c1d3-4d75-a468-2bcdcda25cf6.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DELTA/512ff924-c1d3-4d75-a468-2bcdcda25cf6.json deleted file mode 100644 index d55e617b1..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DELTA/512ff924-c1d3-4d75-a468-2bcdcda25cf6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-DELTA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-DELTA", - "id": "bamec66557/VICIOUS_MESH-12B-DELTA", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6469 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5055 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4057 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DIGAMMA/86b561ae-c4d3-4293-a884-bcab26df026d.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DIGAMMA/86b561ae-c4d3-4293-a884-bcab26df026d.json deleted file mode 100644 index 33dd088e9..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DIGAMMA/86b561ae-c4d3-4293-a884-bcab26df026d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-DIGAMMA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-DIGAMMA", - "id": "bamec66557/VICIOUS_MESH-12B-DIGAMMA", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6429 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5061 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4097 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3659 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-EPSILON/516d1972-9731-4234-a4b3-b96423ebba5c.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-EPSILON/516d1972-9731-4234-a4b3-b96423ebba5c.json deleted file mode 100644 index e7811c1a1..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-EPSILON/516d1972-9731-4234-a4b3-b96423ebba5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-EPSILON/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-EPSILON", - "id": "bamec66557/VICIOUS_MESH-12B-EPSILON", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6305 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5038 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-GAMMA/274f6e02-c81f-4f2e-9747-e5de5cee1933.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-GAMMA/274f6e02-c81f-4f2e-9747-e5de5cee1933.json deleted file mode 100644 index 1e3744cbe..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-GAMMA/274f6e02-c81f-4f2e-9747-e5de5cee1933.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-GAMMA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-GAMMA", - "id": "bamec66557/VICIOUS_MESH-12B-GAMMA", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6362 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-NEMO/61638b55-296b-40fd-a39f-cc2276d9f94a.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-NEMO/61638b55-296b-40fd-a39f-cc2276d9f94a.json deleted file mode 100644 index 07670d514..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-NEMO/61638b55-296b-40fd-a39f-cc2276d9f94a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-NEMO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-NEMO", - "id": "bamec66557/VICIOUS_MESH-12B-NEMO", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5442 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-OMEGA/11c1b6fe-4815-415b-a4a8-d14073df6ee1.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-OMEGA/11c1b6fe-4815-415b-a4a8-d14073df6ee1.json deleted file mode 100644 index 6ef877a1b..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-OMEGA/11c1b6fe-4815-415b-a4a8-d14073df6ee1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-OMEGA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-OMEGA", - "id": "bamec66557/VICIOUS_MESH-12B-OMEGA", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5166 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3677 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-UNION/88e2cb24-288e-4f37-8753-f0daa825051c.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-UNION/88e2cb24-288e-4f37-8753-f0daa825051c.json deleted file mode 100644 index 
149e4a3b1..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-UNION/88e2cb24-288e-4f37-8753-f0daa825051c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-UNION/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-UNION", - "id": "bamec66557/VICIOUS_MESH-12B-UNION", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6429 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4257 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B/8a1a6c44-17fd-402e-a22e-e795a1f612e3.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B/8a1a6c44-17fd-402e-a22e-e795a1f612e3.json deleted file mode 100644 index d0b02080e..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B/8a1a6c44-17fd-402e-a22e-e795a1f612e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/bamec66557_VICIOUS_MESH-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B", - "id": "bamec66557/VICIOUS_MESH-12B", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5436 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B_Razor/1121af0b-61fe-424a-bc66-3164bcb1d833.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B_Razor/1121af0b-61fe-424a-bc66-3164bcb1d833.json deleted file mode 100644 index c043f1c67..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B_Razor/1121af0b-61fe-424a-bc66-3164bcb1d833.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B_Razor/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B_Razor", - "id": "bamec66557/VICIOUS_MESH-12B_Razor", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3736 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5447 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/mergekit-model_stock-zdaysvi/35300d67-7ee1-4874-b351-87f46267cec9.json b/data/hfopenllm_v2/bamec66557/mergekit-model_stock-zdaysvi/35300d67-7ee1-4874-b351-87f46267cec9.json deleted file mode 100644 index ddc69cf5a..000000000 --- a/data/hfopenllm_v2/bamec66557/mergekit-model_stock-zdaysvi/35300d67-7ee1-4874-b351-87f46267cec9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_mergekit-model_stock-zdaysvi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-model_stock-zdaysvi", - "id": "bamec66557/mergekit-model_stock-zdaysvi", - "developer": 
"bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6426 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5063 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/mergekit-ties-sinbkow/6180b7b3-4b21-42aa-a62d-084a91568b43.json b/data/hfopenllm_v2/bamec66557/mergekit-ties-sinbkow/6180b7b3-4b21-42aa-a62d-084a91568b43.json deleted file mode 100644 index f59ec5045..000000000 --- a/data/hfopenllm_v2/bamec66557/mergekit-ties-sinbkow/6180b7b3-4b21-42aa-a62d-084a91568b43.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_mergekit-ties-sinbkow/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-ties-sinbkow", - "id": "bamec66557/mergekit-ties-sinbkow", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6432 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/belztjti/dffghgjh/7414d344-0e67-424a-9e16-00de0487ce02.json b/data/hfopenllm_v2/belztjti/dffghgjh/7414d344-0e67-424a-9e16-00de0487ce02.json deleted file mode 100644 index 007ff11a3..000000000 --- a/data/hfopenllm_v2/belztjti/dffghgjh/7414d344-0e67-424a-9e16-00de0487ce02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/belztjti_dffghgjh/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dffghgjh", - "id": "belztjti/dffghgjh", - "developer": "belztjti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GlmForCausalLM", - "params_billions": 9.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5784 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/belztjti/dtfgv/f5fcd407-080c-4cb7-a299-7a7f919c734d.json b/data/hfopenllm_v2/belztjti/dtfgv/f5fcd407-080c-4cb7-a299-7a7f919c734d.json deleted file mode 100644 index 05d2c4ee6..000000000 --- a/data/hfopenllm_v2/belztjti/dtfgv/f5fcd407-080c-4cb7-a299-7a7f919c734d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/belztjti_dtfgv/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dtfgv", - "id": "belztjti/dtfgv", - "developer": "belztjti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 9.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3345 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1504 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/benhaotang/phi4-qwq-sky-t1/efe03731-6021-4dcf-b7fe-24cbf2d60fac.json b/data/hfopenllm_v2/benhaotang/phi4-qwq-sky-t1/efe03731-6021-4dcf-b7fe-24cbf2d60fac.json deleted file mode 100644 index 4a2fac32d..000000000 --- a/data/hfopenllm_v2/benhaotang/phi4-qwq-sky-t1/efe03731-6021-4dcf-b7fe-24cbf2d60fac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/benhaotang_phi4-qwq-sky-t1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi4-qwq-sky-t1", - "id": "benhaotang/phi4-qwq-sky-t1", - "developer": "benhaotang", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.046 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6711 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5244 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/beomi/gemma-mling-7b/6ffed624-cc22-4b62-a447-3c02b0e43ded.json b/data/hfopenllm_v2/beomi/gemma-mling-7b/6ffed624-cc22-4b62-a447-3c02b0e43ded.json deleted file mode 100644 index dda19f01b..000000000 --- a/data/hfopenllm_v2/beomi/gemma-mling-7b/6ffed624-cc22-4b62-a447-3c02b0e43ded.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/beomi_gemma-mling-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-mling-7b", - "id": "beomi/gemma-mling-7b", - "developer": "beomi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2029 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3759 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2633 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/beowolx/CodeNinja-1.0-OpenChat-7B/ed867fa8-be8a-49b0-8c94-38085808b58b.json b/data/hfopenllm_v2/beowolx/CodeNinja-1.0-OpenChat-7B/ed867fa8-be8a-49b0-8c94-38085808b58b.json deleted file mode 100644 index 4c755301d..000000000 --- a/data/hfopenllm_v2/beowolx/CodeNinja-1.0-OpenChat-7B/ed867fa8-be8a-49b0-8c94-38085808b58b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/beowolx_CodeNinja-1.0-OpenChat-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CodeNinja-1.0-OpenChat-7B", - "id": "beowolx/CodeNinja-1.0-OpenChat-7B", - "developer": "beowolx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5447 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3015 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/berkeley-nest/Starling-LM-7B-alpha/c8b9a56b-0933-4085-8d5f-a1d8294699db.json b/data/hfopenllm_v2/berkeley-nest/Starling-LM-7B-alpha/c8b9a56b-0933-4085-8d5f-a1d8294699db.json deleted file mode 100644 index 59ff6b3cf..000000000 --- a/data/hfopenllm_v2/berkeley-nest/Starling-LM-7B-alpha/c8b9a56b-0933-4085-8d5f-a1d8294699db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/berkeley-nest_Starling-LM-7B-alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Starling-LM-7B-alpha", - "id": "berkeley-nest/Starling-LM-7B-alpha", - "developer": "berkeley-nest", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bfuzzy1/Gunny/9b178661-ed9a-427d-b93c-b905b8089ad8.json b/data/hfopenllm_v2/bfuzzy1/Gunny/9b178661-ed9a-427d-b93c-b905b8089ad8.json deleted file mode 100644 index c2995af50..000000000 --- a/data/hfopenllm_v2/bfuzzy1/Gunny/9b178661-ed9a-427d-b93c-b905b8089ad8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bfuzzy1_Gunny/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gunny", - "id": "bfuzzy1/Gunny", - "developer": "bfuzzy1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7129 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4546 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3583 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3039 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bfuzzy1/acheron-c/69588e07-7559-49c2-9423-19fd143e42f7.json b/data/hfopenllm_v2/bfuzzy1/acheron-c/69588e07-7559-49c2-9423-19fd143e42f7.json deleted file mode 100644 index ca1f71360..000000000 --- a/data/hfopenllm_v2/bfuzzy1/acheron-c/69588e07-7559-49c2-9423-19fd143e42f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bfuzzy1_acheron-c/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "acheron-c", - "id": "bfuzzy1/acheron-c", - "developer": "bfuzzy1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1929 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1172 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/bfuzzy1/acheron-d/317589da-d673-4f90-93e9-59983f2ef54b.json b/data/hfopenllm_v2/bfuzzy1/acheron-d/317589da-d673-4f90-93e9-59983f2ef54b.json deleted file mode 100644 index e74d10763..000000000 --- a/data/hfopenllm_v2/bfuzzy1/acheron-d/317589da-d673-4f90-93e9-59983f2ef54b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bfuzzy1_acheron-d/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "acheron-d", - "id": "bfuzzy1/acheron-d", - "developer": "bfuzzy1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bfuzzy1/acheron-m/efab322e-ea15-4fe7-9bfc-15246003e59c.json b/data/hfopenllm_v2/bfuzzy1/acheron-m/efab322e-ea15-4fe7-9bfc-15246003e59c.json deleted file mode 100644 index 21c3ead2f..000000000 --- a/data/hfopenllm_v2/bfuzzy1/acheron-m/efab322e-ea15-4fe7-9bfc-15246003e59c.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bfuzzy1_acheron-m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "acheron-m", - "id": "bfuzzy1/acheron-m", - "developer": "bfuzzy1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1758 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bfuzzy1/acheron-m1a-llama/b1eac68e-b292-414b-9594-c921f8e10818.json b/data/hfopenllm_v2/bfuzzy1/acheron-m1a-llama/b1eac68e-b292-414b-9594-c921f8e10818.json deleted file mode 100644 index a0d0cda28..000000000 --- a/data/hfopenllm_v2/bfuzzy1/acheron-m1a-llama/b1eac68e-b292-414b-9594-c921f8e10818.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bfuzzy1_acheron-m1a-llama/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "acheron-m1a-llama", - "id": "bfuzzy1/acheron-m1a-llama", - "developer": "bfuzzy1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2956 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1146 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bfuzzy1/acheron/b7d08c65-8219-4067-9504-99e438a86038.json b/data/hfopenllm_v2/bfuzzy1/acheron/b7d08c65-8219-4067-9504-99e438a86038.json deleted file mode 100644 index 599c0add4..000000000 --- a/data/hfopenllm_v2/bfuzzy1/acheron/b7d08c65-8219-4067-9504-99e438a86038.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bfuzzy1_acheron/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "acheron", - "id": "bfuzzy1/acheron", - "developer": "bfuzzy1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1983 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3108 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3511 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1096 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bfuzzy1/llambses-1/e9c5b479-0dce-4de3-84d6-90c7515337f1.json b/data/hfopenllm_v2/bfuzzy1/llambses-1/e9c5b479-0dce-4de3-84d6-90c7515337f1.json deleted file mode 100644 index f8f06d954..000000000 --- a/data/hfopenllm_v2/bfuzzy1/llambses-1/e9c5b479-0dce-4de3-84d6-90c7515337f1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bfuzzy1_llambses-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llambses-1", - "id": "bfuzzy1/llambses-1", - "developer": "bfuzzy1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3554 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bhuvneshsaini/merged_model/3c766465-29db-4b3d-b42f-a3222b38a096.json b/data/hfopenllm_v2/bhuvneshsaini/merged_model/3c766465-29db-4b3d-b42f-a3222b38a096.json deleted file mode 100644 index 477afd11a..000000000 --- a/data/hfopenllm_v2/bhuvneshsaini/merged_model/3c766465-29db-4b3d-b42f-a3222b38a096.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bhuvneshsaini_merged_model/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merged_model", - "id": "bhuvneshsaini/merged_model", - "developer": "bhuvneshsaini", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.715 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1445 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bigcode/starcoder2-15b/e6c85677-61ed-475b-85a5-48b91ec76bcf.json b/data/hfopenllm_v2/bigcode/starcoder2-15b/e6c85677-61ed-475b-85a5-48b91ec76bcf.json deleted file mode 100644 index 9a0162bcf..000000000 --- a/data/hfopenllm_v2/bigcode/starcoder2-15b/e6c85677-61ed-475b-85a5-48b91ec76bcf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigcode_starcoder2-15b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "starcoder2-15b", - "id": "bigcode/starcoder2-15b", - "developer": "bigcode", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Starcoder2ForCausalLM", - "params_billions": 15.958 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.278 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4448 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bigcode/starcoder2-3b/7b68fa5e-dbbf-4542-8767-6874aabf8f40.json b/data/hfopenllm_v2/bigcode/starcoder2-3b/7b68fa5e-dbbf-4542-8767-6874aabf8f40.json deleted file mode 100644 index 4f9f33d8f..000000000 --- a/data/hfopenllm_v2/bigcode/starcoder2-3b/7b68fa5e-dbbf-4542-8767-6874aabf8f40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigcode_starcoder2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "starcoder2-3b", - "id": "bigcode/starcoder2-3b", - "developer": "bigcode", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Starcoder2ForCausalLM", - "params_billions": 3.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2037 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1636 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bigcode/starcoder2-7b/c103b7f4-a432-42d6-86ef-cb369e0c16ff.json b/data/hfopenllm_v2/bigcode/starcoder2-7b/c103b7f4-a432-42d6-86ef-cb369e0c16ff.json deleted file mode 100644 index ba7f43161..000000000 --- a/data/hfopenllm_v2/bigcode/starcoder2-7b/c103b7f4-a432-42d6-86ef-cb369e0c16ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigcode_starcoder2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "starcoder2-7b", - "id": "bigcode/starcoder2-7b", - "developer": "bigcode", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Starcoder2ForCausalLM", - "params_billions": 7.174 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2209 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1642 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bigscience/bloom-1b1/643dda41-37d0-4c1e-b856-58b774612886.json b/data/hfopenllm_v2/bigscience/bloom-1b1/643dda41-37d0-4c1e-b856-58b774612886.json deleted file mode 100644 index 02b4a4909..000000000 --- a/data/hfopenllm_v2/bigscience/bloom-1b1/643dda41-37d0-4c1e-b856-58b774612886.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigscience_bloom-1b1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bloom-1b1", - "id": "bigscience/bloom-1b1", - "developer": "bigscience", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "BloomForCausalLM", - "params_billions": 1.065 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1373 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bigscience/bloom-1b7/ba2f284b-d7c6-4748-a8dc-4f80caa30c6c.json b/data/hfopenllm_v2/bigscience/bloom-1b7/ba2f284b-d7c6-4748-a8dc-4f80caa30c6c.json deleted file mode 100644 index e53021d83..000000000 --- a/data/hfopenllm_v2/bigscience/bloom-1b7/ba2f284b-d7c6-4748-a8dc-4f80caa30c6c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigscience_bloom-1b7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bloom-1b7", - "id": "bigscience/bloom-1b7", - "developer": "bigscience", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "BloomForCausalLM", - "params_billions": 1.722 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1044 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1086 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bigscience/bloom-3b/16e30aa0-736a-4ef8-8ba6-78285b84546f.json b/data/hfopenllm_v2/bigscience/bloom-3b/16e30aa0-736a-4ef8-8ba6-78285b84546f.json deleted file mode 100644 index 0929bf77c..000000000 --- a/data/hfopenllm_v2/bigscience/bloom-3b/16e30aa0-736a-4ef8-8ba6-78285b84546f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigscience_bloom-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bloom-3b", - "id": "bigscience/bloom-3b", - "developer": "bigscience", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "BloomForCausalLM", - "params_billions": 3.003 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1271 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/bigscience/bloom-560m/73eb729d-adfd-4dee-9bde-04a31f5528f6.json b/data/hfopenllm_v2/bigscience/bloom-560m/73eb729d-adfd-4dee-9bde-04a31f5528f6.json deleted file mode 100644 index 2294ec54f..000000000 --- a/data/hfopenllm_v2/bigscience/bloom-560m/73eb729d-adfd-4dee-9bde-04a31f5528f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigscience_bloom-560m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bloom-560m", - "id": "bigscience/bloom-560m", - "developer": "bigscience", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "BloomForCausalLM", - "params_billions": 0.559 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bigscience/bloom-7b1/0daad2ae-92d0-4522-a067-20332f72c96f.json b/data/hfopenllm_v2/bigscience/bloom-7b1/0daad2ae-92d0-4522-a067-20332f72c96f.json deleted file mode 100644 index 2cfad20ec..000000000 --- 
a/data/hfopenllm_v2/bigscience/bloom-7b1/0daad2ae-92d0-4522-a067-20332f72c96f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigscience_bloom-7b1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bloom-7b1", - "id": "bigscience/bloom-7b1", - "developer": "bigscience", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "BloomForCausalLM", - "params_billions": 7.069 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3114 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bluuwhale/L3-SthenoMaid-8B-V1/a3e3849f-a289-4132-b4a8-f67d67ad46a1.json b/data/hfopenllm_v2/bluuwhale/L3-SthenoMaid-8B-V1/a3e3849f-a289-4132-b4a8-f67d67ad46a1.json deleted file mode 100644 index bc1d74cae..000000000 --- a/data/hfopenllm_v2/bluuwhale/L3-SthenoMaid-8B-V1/a3e3849f-a289-4132-b4a8-f67d67ad46a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bluuwhale_L3-SthenoMaid-8B-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", 
- "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-SthenoMaid-8B-V1", - "id": "bluuwhale/L3-SthenoMaid-8B-V1", - "developer": "bluuwhale", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7345 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bond005/meno-tiny-0.1/59a9ed26-a67a-4e76-8858-520400c90766.json b/data/hfopenllm_v2/bond005/meno-tiny-0.1/59a9ed26-a67a-4e76-8858-520400c90766.json deleted file mode 100644 index f9ab2bb8c..000000000 --- a/data/hfopenllm_v2/bond005/meno-tiny-0.1/59a9ed26-a67a-4e76-8858-520400c90766.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bond005_meno-tiny-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meno-tiny-0.1", - "id": "bond005/meno-tiny-0.1", - "developer": 
"bond005", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4263 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2786 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bosonai/Higgs-Llama-3-70B/6c5c61b4-8037-4b28-8616-1aefa7963eb8.json b/data/hfopenllm_v2/bosonai/Higgs-Llama-3-70B/6c5c61b4-8037-4b28-8616-1aefa7963eb8.json deleted file mode 100644 index 7f13e2ca9..000000000 --- a/data/hfopenllm_v2/bosonai/Higgs-Llama-3-70B/6c5c61b4-8037-4b28-8616-1aefa7963eb8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bosonai_Higgs-Llama-3-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Higgs-Llama-3-70B", - "id": "bosonai/Higgs-Llama-3-70B", - "developer": "bosonai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6258 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2523 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4471 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Blunt/e9f9b836-fbdf-4996-9b35-2c8145a7f01b.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Blunt/e9f9b836-fbdf-4996-9b35-2c8145a7f01b.json deleted file mode 100644 index 4c23d4552..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Blunt/e9f9b836-fbdf-4996-9b35-2c8145a7f01b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-1.5B-Blunt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-1.5B-Blunt", - "id": "braindao/DeepSeek-R1-Distill-Qwen-1.5B-Blunt", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2611 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2774 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1382 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Reflective/5b3dae43-5d5c-4d19-bd47-5c0f68ecbb81.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Reflective/5b3dae43-5d5c-4d19-bd47-5c0f68ecbb81.json deleted file mode 100644 index 7fd796153..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Reflective/5b3dae43-5d5c-4d19-bd47-5c0f68ecbb81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-1.5B-Reflective/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-1.5B-Reflective", - "id": "braindao/DeepSeek-R1-Distill-Qwen-1.5B-Reflective", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-ABUB-ST/d5b31b1f-ace0-457f-bf8a-9041398b8344.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-ABUB-ST/d5b31b1f-ace0-457f-bf8a-9041398b8344.json deleted file mode 100644 index 71e6c0585..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-ABUB-ST/d5b31b1f-ace0-457f-bf8a-9041398b8344.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B-ABUB-ST/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-ABUB-ST", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-ABUB-ST", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4927 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4221 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective/b34702cf-ffb8-4e75-9c9b-f5c52623d4c8.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective/b34702cf-ffb8-4e75-9c9b-f5c52623d4c8.json deleted file mode 100644 index ce916802a..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective/b34702cf-ffb8-4e75-9c9b-f5c52623d4c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3371 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2372 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4248 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1504 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt/c701f1fd-166d-416b-8f78-edf17f2fecd4.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt/c701f1fd-166d-416b-8f78-edf17f2fecd4.json deleted file mode 100644 index d0206075b..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt/c701f1fd-166d-416b-8f78-edf17f2fecd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5221 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1484 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective/4217b403-e924-4f67-9b0e-ad1d4ed293a1.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective/4217b403-e924-4f67-9b0e-ad1d4ed293a1.json deleted file mode 100644 index 7a63e7232..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective/4217b403-e924-4f67-9b0e-ad1d4ed293a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5139 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3013 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1473 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1289 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored/03816e41-5fb8-4815-ab9c-4108ab19a3bc.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored/03816e41-5fb8-4815-ab9c-4108ab19a3bc.json deleted file mode 100644 index 93ba8529b..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored/03816e41-5fb8-4815-ab9c-4108ab19a3bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.317 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1431 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt/a763b10e-350a-4342-ade3-b782437ca3e2.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt/a763b10e-350a-4342-ade3-b782437ca3e2.json deleted file mode 100644 index 511ffb178..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt/a763b10e-350a-4342-ade3-b782437ca3e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5612 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3283 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 
5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1447 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Reflective/9e806fd2-edbf-40e2-a008-834cee537bb6.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Reflective/9e806fd2-edbf-40e2-a008-834cee537bb6.json deleted file mode 100644 index c56a5b418..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Reflective/9e806fd2-edbf-40e2-a008-834cee537bb6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B-Reflective/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-Reflective", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Reflective", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B/fbcf861c-62db-4079-bba6-becd4e231216.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B/fbcf861c-62db-4079-bba6-becd4e231216.json deleted file mode 100644 index cd51e1d40..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B/fbcf861c-62db-4079-bba6-becd4e231216.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3033 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - 
} - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Blunt/22b591c0-3386-4bd5-860c-20c0c6001986.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Blunt/22b591c0-3386-4bd5-860c-20c0c6001986.json deleted file mode 100644 index e9582887e..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Blunt/22b591c0-3386-4bd5-860c-20c0c6001986.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-7B-Blunt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-7B-Blunt", - "id": "braindao/DeepSeek-R1-Distill-Qwen-7B-Blunt", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2902 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored/dfb9a9c4-114e-4188-9940-4d6df7e4815f.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored/dfb9a9c4-114e-4188-9940-4d6df7e4815f.json deleted file mode 100644 index b9fd0837f..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored/dfb9a9c4-114e-4188-9940-4d6df7e4815f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored", - "id": "braindao/DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3655 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2958 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1737 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Reflective/38fd5f4d-0f3c-4dc2-b250-a9ee7090aac2.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Reflective/38fd5f4d-0f3c-4dc2-b250-a9ee7090aac2.json deleted file mode 100644 index dea2a0ad9..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Reflective/38fd5f4d-0f3c-4dc2-b250-a9ee7090aac2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-7B-Reflective/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-7B-Reflective", - "id": "braindao/DeepSeek-R1-Distill-Qwen-7B-Reflective", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2907 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", 
- "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1155 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B/e53cbc94-fc9f-4d53-ae28-26bc8c2caef8.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B/e53cbc94-fc9f-4d53-ae28-26bc8c2caef8.json deleted file mode 100644 index 256a50564..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B/e53cbc94-fc9f-4d53-ae28-26bc8c2caef8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-7B", - "id": "braindao/DeepSeek-R1-Distill-Qwen-7B", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2887 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1141 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/Qwen2.5-14B-Instruct/2165e69a-c50c-419a-932e-909f53b73b71.json b/data/hfopenllm_v2/braindao/Qwen2.5-14B-Instruct/2165e69a-c50c-419a-932e-909f53b73b71.json deleted file mode 100644 index 5a58485ba..000000000 --- a/data/hfopenllm_v2/braindao/Qwen2.5-14B-Instruct/2165e69a-c50c-419a-932e-909f53b73b71.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_Qwen2.5-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Instruct", - "id": "braindao/Qwen2.5-14B-Instruct", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8143 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6404 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4889 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/Qwen2.5-14B/46430a07-15c8-4727-9102-2f471d4f1d3c.json b/data/hfopenllm_v2/braindao/Qwen2.5-14B/46430a07-15c8-4727-9102-2f471d4f1d3c.json deleted file mode 100644 index fd68ca929..000000000 --- a/data/hfopenllm_v2/braindao/Qwen2.5-14B/46430a07-15c8-4727-9102-2f471d4f1d3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_Qwen2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B", - "id": "braindao/Qwen2.5-14B", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5409 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5853 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4884 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/braindao/iq-code-evmind-0.5b/3c7f540a-c850-4e20-ad93-60e021d17133.json b/data/hfopenllm_v2/braindao/iq-code-evmind-0.5b/3c7f540a-c850-4e20-ad93-60e021d17133.json deleted file mode 100644 index 7ebae94fb..000000000 --- a/data/hfopenllm_v2/braindao/iq-code-evmind-0.5b/3c7f540a-c850-4e20-ad93-60e021d17133.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_iq-code-evmind-0.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "iq-code-evmind-0.5b", - "id": "braindao/iq-code-evmind-0.5b", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3216 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3304 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1189 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/brgx53/3Bgeneral-ECE-PRYMMAL-Martial/c3ab4f38-6f7b-4589-ae4f-21ace05b8c44.json b/data/hfopenllm_v2/brgx53/3Bgeneral-ECE-PRYMMAL-Martial/c3ab4f38-6f7b-4589-ae4f-21ace05b8c44.json deleted file mode 100644 index 
e96867233..000000000 --- a/data/hfopenllm_v2/brgx53/3Bgeneral-ECE-PRYMMAL-Martial/c3ab4f38-6f7b-4589-ae4f-21ace05b8c44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/brgx53_3Bgeneral-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "3Bgeneral-ECE-PRYMMAL-Martial", - "id": "brgx53/3Bgeneral-ECE-PRYMMAL-Martial", - "developer": "brgx53", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5458 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3934 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/brgx53/3Bgeneralv2-ECE-PRYMMAL-Martial/2708c0d6-03e7-4a17-b6b9-e16f3ddcf5bb.json b/data/hfopenllm_v2/brgx53/3Bgeneralv2-ECE-PRYMMAL-Martial/2708c0d6-03e7-4a17-b6b9-e16f3ddcf5bb.json deleted file mode 100644 index bebdea7ac..000000000 --- a/data/hfopenllm_v2/brgx53/3Bgeneralv2-ECE-PRYMMAL-Martial/2708c0d6-03e7-4a17-b6b9-e16f3ddcf5bb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/brgx53_3Bgeneralv2-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "3Bgeneralv2-ECE-PRYMMAL-Martial", - "id": "brgx53/3Bgeneralv2-ECE-PRYMMAL-Martial", - "developer": "brgx53", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5677 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/brgx53/3Blareneg-ECE-PRYMMAL-Martial/6427a5ef-8508-430d-970d-054fc485e754.json b/data/hfopenllm_v2/brgx53/3Blareneg-ECE-PRYMMAL-Martial/6427a5ef-8508-430d-970d-054fc485e754.json deleted file mode 100644 index 27aa2a2d2..000000000 --- a/data/hfopenllm_v2/brgx53/3Blareneg-ECE-PRYMMAL-Martial/6427a5ef-8508-430d-970d-054fc485e754.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/brgx53_3Blareneg-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "3Blareneg-ECE-PRYMMAL-Martial", - "id": "brgx53/3Blareneg-ECE-PRYMMAL-Martial", - "developer": "brgx53", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2876 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5358 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/brgx53/3Blarenegv2-ECE-PRYMMAL-Martial/08984ad9-1e9b-4916-b214-af26dadfcc0b.json b/data/hfopenllm_v2/brgx53/3Blarenegv2-ECE-PRYMMAL-Martial/08984ad9-1e9b-4916-b214-af26dadfcc0b.json deleted file mode 100644 index 3114b7908..000000000 --- a/data/hfopenllm_v2/brgx53/3Blarenegv2-ECE-PRYMMAL-Martial/08984ad9-1e9b-4916-b214-af26dadfcc0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/brgx53_3Blarenegv2-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "3Blarenegv2-ECE-PRYMMAL-Martial", - "id": 
"brgx53/3Blarenegv2-ECE-PRYMMAL-Martial", - "developer": "brgx53", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5662 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/brgx53/Barracuda-PRYMMAL-ECE-TW3/1dbb5d03-fdfa-4059-9d50-d037ada6b1ac.json b/data/hfopenllm_v2/brgx53/Barracuda-PRYMMAL-ECE-TW3/1dbb5d03-fdfa-4059-9d50-d037ada6b1ac.json deleted file mode 100644 index ee39d8a64..000000000 --- a/data/hfopenllm_v2/brgx53/Barracuda-PRYMMAL-ECE-TW3/1dbb5d03-fdfa-4059-9d50-d037ada6b1ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/brgx53_Barracuda-PRYMMAL-ECE-TW3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barracuda-PRYMMAL-ECE-TW3", - "id": "brgx53/Barracuda-PRYMMAL-ECE-TW3", - "developer": "brgx53", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, 
- "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.164 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3002 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3609 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/brgx53/LaConfiance-PRYMMAL-ECE-TW3/6bf42faa-c3e9-4069-bf93-ffd626062f0f.json b/data/hfopenllm_v2/brgx53/LaConfiance-PRYMMAL-ECE-TW3/6bf42faa-c3e9-4069-bf93-ffd626062f0f.json deleted file mode 100644 index f4004b73f..000000000 --- a/data/hfopenllm_v2/brgx53/LaConfiance-PRYMMAL-ECE-TW3/6bf42faa-c3e9-4069-bf93-ffd626062f0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/brgx53_LaConfiance-PRYMMAL-ECE-TW3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LaConfiance-PRYMMAL-ECE-TW3", - "id": "brgx53/LaConfiance-PRYMMAL-ECE-TW3", - "developer": "brgx53", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1579 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1146 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Best-Mix-Llama-3.1-8B/9feccbdc-18eb-4077-b50b-986db0047fc8.json b/data/hfopenllm_v2/bunnycore/Best-Mix-Llama-3.1-8B/9feccbdc-18eb-4077-b50b-986db0047fc8.json deleted file mode 100644 index b5b81fbcc..000000000 --- a/data/hfopenllm_v2/bunnycore/Best-Mix-Llama-3.1-8B/9feccbdc-18eb-4077-b50b-986db0047fc8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Best-Mix-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Best-Mix-Llama-3.1-8B", - "id": "bunnycore/Best-Mix-Llama-3.1-8B", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2067 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3432 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2929 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1565 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Blabbertron-1.0/a074c33f-782a-409c-987b-7dd62c65ccc7.json b/data/hfopenllm_v2/bunnycore/Blabbertron-1.0/a074c33f-782a-409c-987b-7dd62c65ccc7.json deleted file mode 100644 index 313ecbf7a..000000000 --- a/data/hfopenllm_v2/bunnycore/Blabbertron-1.0/a074c33f-782a-409c-987b-7dd62c65ccc7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Blabbertron-1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Blabbertron-1.0", - "id": "bunnycore/Blabbertron-1.0", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7433 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.5497 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4337 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Blabbertron-1.1/2f2c0dea-dcd4-4e54-9f40-9fda4b91bd40.json b/data/hfopenllm_v2/bunnycore/Blabbertron-1.1/2f2c0dea-dcd4-4e54-9f40-9fda4b91bd40.json deleted file mode 100644 index 47f457e44..000000000 --- a/data/hfopenllm_v2/bunnycore/Blabbertron-1.1/2f2c0dea-dcd4-4e54-9f40-9fda4b91bd40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Blabbertron-1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Blabbertron-1.1", - "id": "bunnycore/Blabbertron-1.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7265 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5534 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4804 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4431 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/CyberCore-Qwen-2.1-7B/84481fee-3727-427b-912a-30e2744df28a.json b/data/hfopenllm_v2/bunnycore/CyberCore-Qwen-2.1-7B/84481fee-3727-427b-912a-30e2744df28a.json deleted file mode 100644 index a37c47c92..000000000 --- a/data/hfopenllm_v2/bunnycore/CyberCore-Qwen-2.1-7B/84481fee-3727-427b-912a-30e2744df28a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_CyberCore-Qwen-2.1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CyberCore-Qwen-2.1-7B", - "id": "bunnycore/CyberCore-Qwen-2.1-7B", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5766 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5572 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3588 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4445 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/DeepQwen-3B-LCoT-SCE/aaa801dc-1a47-4009-9ad4-7129a8d4e651.json b/data/hfopenllm_v2/bunnycore/DeepQwen-3B-LCoT-SCE/aaa801dc-1a47-4009-9ad4-7129a8d4e651.json deleted file mode 100644 index ee38b50d8..000000000 --- a/data/hfopenllm_v2/bunnycore/DeepQwen-3B-LCoT-SCE/aaa801dc-1a47-4009-9ad4-7129a8d4e651.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_DeepQwen-3B-LCoT-SCE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepQwen-3B-LCoT-SCE", - "id": "bunnycore/DeepQwen-3B-LCoT-SCE", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.396 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.247 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/DeepSeek-R1-Distill-Qwen-7B-RRP-Ex/3ac92cbf-c85b-4e00-9ef9-4322f961591a.json b/data/hfopenllm_v2/bunnycore/DeepSeek-R1-Distill-Qwen-7B-RRP-Ex/3ac92cbf-c85b-4e00-9ef9-4322f961591a.json deleted file mode 100644 index b208debb8..000000000 --- a/data/hfopenllm_v2/bunnycore/DeepSeek-R1-Distill-Qwen-7B-RRP-Ex/3ac92cbf-c85b-4e00-9ef9-4322f961591a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_DeepSeek-R1-Distill-Qwen-7B-RRP-Ex/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-7B-RRP-Ex", - "id": "bunnycore/DeepSeek-R1-Distill-Qwen-7B-RRP-Ex", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3494 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1654 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { 
- "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v1/162b511b-4684-4595-9261-a33f3a4117f9.json b/data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v1/162b511b-4684-4595-9261-a33f3a4117f9.json deleted file mode 100644 index e66755432..000000000 --- a/data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v1/162b511b-4684-4595-9261-a33f3a4117f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_DeepThinker-7B-Sce-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepThinker-7B-Sce-v1", - "id": "bunnycore/DeepThinker-7B-Sce-v1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1218 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3018 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4194 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v2/20d5d59a-028d-4e34-9414-d9edaf2e59b8.json b/data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v2/20d5d59a-028d-4e34-9414-d9edaf2e59b8.json deleted file mode 100644 index 36edeced3..000000000 --- a/data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v2/20d5d59a-028d-4e34-9414-d9edaf2e59b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_DeepThinker-7B-Sce-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepThinker-7B-Sce-v2", - "id": "bunnycore/DeepThinker-7B-Sce-v2", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1146 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/FuseCyberMix-Qwen-2.5-7B-Instruct/a21b53fb-783b-440b-9f3d-d8ada3bd18ea.json b/data/hfopenllm_v2/bunnycore/FuseCyberMix-Qwen-2.5-7B-Instruct/a21b53fb-783b-440b-9f3d-d8ada3bd18ea.json deleted file mode 100644 index ae646dc09..000000000 --- a/data/hfopenllm_v2/bunnycore/FuseCyberMix-Qwen-2.5-7B-Instruct/a21b53fb-783b-440b-9f3d-d8ada3bd18ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_FuseCyberMix-Qwen-2.5-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseCyberMix-Qwen-2.5-7B-Instruct", - "id": "bunnycore/FuseCyberMix-Qwen-2.5-7B-Instruct", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7019 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5518 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4841 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.4337 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/FuseQwQen-7B/0d2ab1e8-a2d7-45cf-b123-67bcab2d9dff.json b/data/hfopenllm_v2/bunnycore/FuseQwQen-7B/0d2ab1e8-a2d7-45cf-b123-67bcab2d9dff.json deleted file mode 100644 index 066c75efc..000000000 --- a/data/hfopenllm_v2/bunnycore/FuseQwQen-7B/0d2ab1e8-a2d7-45cf-b123-67bcab2d9dff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_FuseQwQen-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseQwQen-7B", - "id": "bunnycore/FuseQwQen-7B", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5504 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.1/6b4a37c8-c7e6-4156-9d6d-8cba51b74d82.json b/data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.1/6b4a37c8-c7e6-4156-9d6d-8cba51b74d82.json deleted 
file mode 100644 index 13a373a0b..000000000 --- a/data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.1/6b4a37c8-c7e6-4156-9d6d-8cba51b74d82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_FwF-Qwen-7B-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FwF-Qwen-7B-0.1", - "id": "bunnycore/FwF-Qwen-7B-0.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3005 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5019 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3952 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4061 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.2/78582fec-2f69-4b37-8497-12ceb097b44b.json b/data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.2/78582fec-2f69-4b37-8497-12ceb097b44b.json deleted file mode 100644 index c94f58add..000000000 --- a/data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.2/78582fec-2f69-4b37-8497-12ceb097b44b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/bunnycore_FwF-Qwen-7B-0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FwF-Qwen-7B-0.2", - "id": "bunnycore/FwF-Qwen-7B-0.2", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5596 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4218 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Gemma-2-2B-Smart/949bf65e-c2ae-4701-82f0-39d0c62a0e87.json b/data/hfopenllm_v2/bunnycore/Gemma-2-2B-Smart/949bf65e-c2ae-4701-82f0-39d0c62a0e87.json deleted file mode 100644 index 7d739968f..000000000 --- a/data/hfopenllm_v2/bunnycore/Gemma-2-2B-Smart/949bf65e-c2ae-4701-82f0-39d0c62a0e87.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Gemma-2-2B-Smart/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-2B-Smart", - "id": "bunnycore/Gemma-2-2B-Smart", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3974 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2426 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Gemma2-9B-TitanFusion/8812151c-4301-4131-a414-d64d025e476e.json b/data/hfopenllm_v2/bunnycore/Gemma2-9B-TitanFusion/8812151c-4301-4131-a414-d64d025e476e.json deleted file mode 100644 index 7da88954d..000000000 --- a/data/hfopenllm_v2/bunnycore/Gemma2-9B-TitanFusion/8812151c-4301-4131-a414-d64d025e476e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Gemma2-9B-TitanFusion/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2-9B-TitanFusion", - "id": "bunnycore/Gemma2-9B-TitanFusion", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1618 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5712 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/HyperLlama-3.1-8B/2db1542f-a8da-4fb8-91a5-6dd1a942b55e.json b/data/hfopenllm_v2/bunnycore/HyperLlama-3.1-8B/2db1542f-a8da-4fb8-91a5-6dd1a942b55e.json deleted file mode 100644 index 0347a413c..000000000 --- a/data/hfopenllm_v2/bunnycore/HyperLlama-3.1-8B/2db1542f-a8da-4fb8-91a5-6dd1a942b55e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_HyperLlama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HyperLlama-3.1-8B", - "id": "bunnycore/HyperLlama-3.1-8B", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5103 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3783 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-Mix/9feeffb2-3763-4e43-933e-89100b76f7fa.json b/data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-Mix/9feeffb2-3763-4e43-933e-89100b76f7fa.json deleted file mode 100644 index 0258259b7..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-Mix/9feeffb2-3763-4e43-933e-89100b76f7fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.1-8B-TitanFusion-Mix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-TitanFusion-Mix", - "id": "bunnycore/Llama-3.1-8B-TitanFusion-Mix", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5756 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3695 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-v3/721102b5-ed5e-4631-8600-a6adfff0c784.json b/data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-v3/721102b5-ed5e-4631-8600-a6adfff0c784.json deleted file mode 100644 index 9e1223273..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-v3/721102b5-ed5e-4631-8600-a6adfff0c784.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.1-8B-TitanFusion-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-TitanFusion-v3", - "id": "bunnycore/Llama-3.1-8B-TitanFusion-v3", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.142 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3806 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-All-Mix/18c185f7-5ca4-46ff-81c2-6c538f096409.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-All-Mix/18c185f7-5ca4-46ff-81c2-6c538f096409.json deleted file mode 100644 index e63ada024..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-All-Mix/18c185f7-5ca4-46ff-81c2-6c538f096409.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-All-Mix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-All-Mix", - "id": "bunnycore/Llama-3.2-3B-All-Mix", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7226 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4508 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1503 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Bespoke-Thought/7ab5911c-e229-43e5-a798-095287d0a597.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Bespoke-Thought/7ab5911c-e229-43e5-a798-095287d0a597.json deleted file mode 100644 index 8d51fa17b..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Bespoke-Thought/7ab5911c-e229-43e5-a798-095287d0a597.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-Bespoke-Thought/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Bespoke-Thought", - "id": "bunnycore/Llama-3.2-3B-Bespoke-Thought", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4522 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Booval/f800c4e5-e918-45bb-8a12-3ca2a64c6b23.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Booval/f800c4e5-e918-45bb-8a12-3ca2a64c6b23.json deleted file mode 100644 index 596f8d2f9..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Booval/f800c4e5-e918-45bb-8a12-3ca2a64c6b23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-Booval/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Booval", - "id": "bunnycore/Llama-3.2-3B-Booval", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6669 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4514 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3058 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/5fcf41bc-30dc-46a7-9cf2-4ce2c7a5850c.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/5fcf41bc-30dc-46a7-9cf2-4ce2c7a5850c.json deleted file mode 100644 index cce4b4015..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/5fcf41bc-30dc-46a7-9cf2-4ce2c7a5850c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-Deep-Test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Deep-Test", - "id": "bunnycore/Llama-3.2-3B-Deep-Test", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/d4b20ef4-734e-40a7-818e-f77e170d7437.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/d4b20ef4-734e-40a7-818e-f77e170d7437.json deleted file mode 100644 index 89f5cb978..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/d4b20ef4-734e-40a7-818e-f77e170d7437.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-Deep-Test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Deep-Test", - "id": "bunnycore/Llama-3.2-3B-Deep-Test", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.803 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1049 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Della/e0996c96-c9e5-4d39-8e6d-1455ef1f9544.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Della/e0996c96-c9e5-4d39-8e6d-1455ef1f9544.json deleted file mode 100644 index 10dca8528..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Della/e0996c96-c9e5-4d39-8e6d-1455ef1f9544.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-Della/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Della", - "id": "bunnycore/Llama-3.2-3B-Della", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3902 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Long-Think/3ad2b31e-ce2a-4cb4-9b85-79cdebd5d364.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Long-Think/3ad2b31e-ce2a-4cb4-9b85-79cdebd5d364.json deleted file mode 100644 index a4a40c69d..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Long-Think/3ad2b31e-ce2a-4cb4-9b85-79cdebd5d364.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-Long-Think/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Long-Think", - "id": "bunnycore/Llama-3.2-3B-Long-Think", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3048 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Mix-Skill/9aff874c-1953-4b97-9bff-9e6120b0bfa7.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Mix-Skill/9aff874c-1953-4b97-9bff-9e6120b0bfa7.json deleted file mode 100644 index 80946bb54..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Mix-Skill/9aff874c-1953-4b97-9bff-9e6120b0bfa7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-Mix-Skill/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Mix-Skill", - "id": "bunnycore/Llama-3.2-3B-Mix-Skill", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4582 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1473 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlus/45ae7f45-8c36-46c6-989d-bc672cdf8eff.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlus/45ae7f45-8c36-46c6-989d-bc672cdf8eff.json deleted file mode 100644 index dfe4a54af..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlus/45ae7f45-8c36-46c6-989d-bc672cdf8eff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-ProdigyPlus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-ProdigyPlus", - "id": "bunnycore/Llama-3.2-3B-ProdigyPlus", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlusPlus/7d36e44e-a329-4b96-a891-365ad900f718.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlusPlus/7d36e44e-a329-4b96-a891-365ad900f718.json deleted file mode 
100644 index 29f6d9a02..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlusPlus/7d36e44e-a329-4b96-a891-365ad900f718.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-ProdigyPlusPlus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-ProdigyPlusPlus", - "id": "bunnycore/Llama-3.2-3B-ProdigyPlusPlus", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1645 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.369 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.15 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RP-DeepThink/a8c26325-1eec-43a6-a8ad-3bcb2e378924.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RP-DeepThink/a8c26325-1eec-43a6-a8ad-3bcb2e378924.json deleted file mode 100644 index 467586c96..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RP-DeepThink/a8c26325-1eec-43a6-a8ad-3bcb2e378924.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-RP-DeepThink/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-RP-DeepThink", - "id": "bunnycore/Llama-3.2-3B-RP-DeepThink", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7144 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4563 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3242 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RRStock/bde1a879-6852-42ce-9217-f427af85a46a.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RRStock/bde1a879-6852-42ce-9217-f427af85a46a.json deleted file mode 100644 index aa68c7658..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RRStock/bde1a879-6852-42ce-9217-f427af85a46a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-RRStock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-RRStock", - "id": "bunnycore/Llama-3.2-3B-RRStock", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6657 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4568 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1699 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3236 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ToxicKod/dd7a0377-f4d6-4390-b9f2-bf50b05ec0f7.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ToxicKod/dd7a0377-f4d6-4390-b9f2-bf50b05ec0f7.json deleted file mode 100644 index 69e6b8e9a..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ToxicKod/dd7a0377-f4d6-4390-b9f2-bf50b05ec0f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-ToxicKod/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-ToxicKod", - "id": "bunnycore/Llama-3.2-3B-ToxicKod", - "developer": "bunnycore", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6319 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1699 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3b-RP-Toxic-Fuse/12cbf241-d6d4-4d25-ad3d-13a42d7adc74.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3b-RP-Toxic-Fuse/12cbf241-d6d4-4d25-ad3d-13a42d7adc74.json deleted file mode 100644 index 3cdd73e74..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3b-RP-Toxic-Fuse/12cbf241-d6d4-4d25-ad3d-13a42d7adc74.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3b-RP-Toxic-Fuse/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3b-RP-Toxic-Fuse", - "id": "bunnycore/Llama-3.2-3b-RP-Toxic-Fuse", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6834 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2402 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3954 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3106 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Maestro-S1k-7B-Sce/1f66fd7c-40ee-4249-8963-5c7bb93a3eaf.json b/data/hfopenllm_v2/bunnycore/Maestro-S1k-7B-Sce/1f66fd7c-40ee-4249-8963-5c7bb93a3eaf.json deleted file mode 100644 index 56e2b2c00..000000000 --- a/data/hfopenllm_v2/bunnycore/Maestro-S1k-7B-Sce/1f66fd7c-40ee-4249-8963-5c7bb93a3eaf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Maestro-S1k-7B-Sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Maestro-S1k-7B-Sce", - "id": "bunnycore/Maestro-S1k-7B-Sce", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2523 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-3.5-mini-TitanFusion-0.1/7076406b-7e0a-49c7-8150-2e6a243aa23b.json b/data/hfopenllm_v2/bunnycore/Phi-3.5-mini-TitanFusion-0.1/7076406b-7e0a-49c7-8150-2e6a243aa23b.json deleted file mode 100644 index 700136bdc..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-3.5-mini-TitanFusion-0.1/7076406b-7e0a-49c7-8150-2e6a243aa23b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-3.5-mini-TitanFusion-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3.5-mini-TitanFusion-0.1", - "id": "bunnycore/Phi-3.5-mini-TitanFusion-0.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5228 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", 
- "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4453 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v2/96c3fd80-a601-4629-a1ab-bf7f366a909a.json b/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v2/96c3fd80-a601-4629-a1ab-bf7f366a909a.json deleted file mode 100644 index 82650a142..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v2/96c3fd80-a601-4629-a1ab-bf7f366a909a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Model-Stock-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Model-Stock-v2", - "id": "bunnycore/Phi-4-Model-Stock-v2", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.6825 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4662 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v3/1302c9a5-d35c-400c-b9f3-d990243e5d59.json b/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v3/1302c9a5-d35c-400c-b9f3-d990243e5d59.json deleted file mode 100644 index 2532056a9..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v3/1302c9a5-d35c-400c-b9f3-d990243e5d59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Model-Stock-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Model-Stock-v3", - "id": "bunnycore/Phi-4-Model-Stock-v3", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5912 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6726 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v4/c7f48bbf-6583-4ddd-ae4d-671c43218dae.json b/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v4/c7f48bbf-6583-4ddd-ae4d-671c43218dae.json deleted file mode 100644 index 6189289e4..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v4/c7f48bbf-6583-4ddd-ae4d-671c43218dae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Model-Stock-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Model-Stock-v4", - "id": "bunnycore/Phi-4-Model-Stock-v4", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6924 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3691 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5394 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock/5f07e092-2eb0-44c2-b2ce-5f1b31a9ea99.json b/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock/5f07e092-2eb0-44c2-b2ce-5f1b31a9ea99.json deleted file mode 100644 index fd4a9988a..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock/5f07e092-2eb0-44c2-b2ce-5f1b31a9ea99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Model-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Model-Stock", - "id": "bunnycore/Phi-4-Model-Stock", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-RP-v0/15701682-97ce-46cf-8010-a6bdeaf8c7aa.json b/data/hfopenllm_v2/bunnycore/Phi-4-RP-v0/15701682-97ce-46cf-8010-a6bdeaf8c7aa.json deleted file mode 100644 index 695b852da..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-RP-v0/15701682-97ce-46cf-8010-a6bdeaf8c7aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-RP-v0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-RP-v0", - "id": "bunnycore/Phi-4-RP-v0", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6827 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6856 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-RR-Shoup/c6eecf0b-fa16-484a-8eeb-d196203b3c3e.json b/data/hfopenllm_v2/bunnycore/Phi-4-RR-Shoup/c6eecf0b-fa16-484a-8eeb-d196203b3c3e.json deleted file mode 100644 index a4fe6d434..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-RR-Shoup/c6eecf0b-fa16-484a-8eeb-d196203b3c3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-RR-Shoup/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-RR-Shoup", - "id": "bunnycore/Phi-4-RR-Shoup", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6587 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6947 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5429 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-RStock-v0.1/4337b1c1-cc00-4a15-8148-e8d0739561b9.json b/data/hfopenllm_v2/bunnycore/Phi-4-RStock-v0.1/4337b1c1-cc00-4a15-8148-e8d0739561b9.json deleted file mode 100644 index 476936c1d..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-RStock-v0.1/4337b1c1-cc00-4a15-8148-e8d0739561b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-RStock-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-RStock-v0.1", - "id": "bunnycore/Phi-4-RStock-v0.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7019 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6928 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4584 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-ReasoningRP/1151ee14-8fe9-4f97-808d-8103b353c2ec.json b/data/hfopenllm_v2/bunnycore/Phi-4-ReasoningRP/1151ee14-8fe9-4f97-808d-8103b353c2ec.json deleted file mode 100644 index c71d4c31e..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-ReasoningRP/1151ee14-8fe9-4f97-808d-8103b353c2ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-ReasoningRP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-ReasoningRP", - "id": "bunnycore/Phi-4-ReasoningRP", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6736 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6922 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4491 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Sce-exp-v0.1/a2c18179-aca3-422c-b9f5-8345109cea13.json 
b/data/hfopenllm_v2/bunnycore/Phi-4-Sce-exp-v0.1/a2c18179-aca3-422c-b9f5-8345109cea13.json deleted file mode 100644 index ca8ff9b68..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Sce-exp-v0.1/a2c18179-aca3-422c-b9f5-8345109cea13.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Sce-exp-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Sce-exp-v0.1", - "id": "bunnycore/Phi-4-Sce-exp-v0.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6595 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6943 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5423 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Stock-Ex/07495d34-1505-45a9-bb48-887af0da8a0c.json b/data/hfopenllm_v2/bunnycore/Phi-4-Stock-Ex/07495d34-1505-45a9-bb48-887af0da8a0c.json deleted file mode 100644 index d7f7a2004..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Stock-Ex/07495d34-1505-45a9-bb48-887af0da8a0c.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Stock-Ex/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Stock-Ex", - "id": "bunnycore/Phi-4-Stock-Ex", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6864 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5375 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Stock-RP/567baf6d-99f9-46a5-8c40-c6899986f1ff.json b/data/hfopenllm_v2/bunnycore/Phi-4-Stock-RP/567baf6d-99f9-46a5-8c40-c6899986f1ff.json deleted file mode 100644 index bfb812571..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Stock-RP/567baf6d-99f9-46a5-8c40-c6899986f1ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Stock-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Stock-RP", - "id": "bunnycore/Phi-4-Stock-RP", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6399 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4715 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Trim-Exp1/a337df3a-28ff-46c9-adae-4bc029937101.json b/data/hfopenllm_v2/bunnycore/Phi-4-Trim-Exp1/a337df3a-28ff-46c9-adae-4bc029937101.json deleted file mode 100644 index 803b9c23e..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Trim-Exp1/a337df3a-28ff-46c9-adae-4bc029937101.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Trim-Exp1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Trim-Exp1", - "id": "bunnycore/Phi-4-Trim-Exp1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.503 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1219 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4177 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-Seek-4-Sce-V1/b201a849-44e9-4598-918b-ffa27c894ee9.json b/data/hfopenllm_v2/bunnycore/Phi-Seek-4-Sce-V1/b201a849-44e9-4598-918b-ffa27c894ee9.json deleted file mode 100644 index b2a61d834..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-Seek-4-Sce-V1/b201a849-44e9-4598-918b-ffa27c894ee9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-Seek-4-Sce-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-Seek-4-Sce-V1", - "id": "bunnycore/Phi-Seek-4-Sce-V1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2935 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6459 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3982 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qandora-2.5-7B-Creative/dd87ebf3-3088-43b1-851c-a97d12a68ea8.json b/data/hfopenllm_v2/bunnycore/Qandora-2.5-7B-Creative/dd87ebf3-3088-43b1-851c-a97d12a68ea8.json deleted file mode 100644 index 3f60d9d68..000000000 --- a/data/hfopenllm_v2/bunnycore/Qandora-2.5-7B-Creative/dd87ebf3-3088-43b1-851c-a97d12a68ea8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qandora-2.5-7B-Creative/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qandora-2.5-7B-Creative", - "id": "bunnycore/Qandora-2.5-7B-Creative", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.6803 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5542 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3059 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4212 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.448 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/QandoraExp-7B-Persona/1b3ef805-8b0c-44bf-b048-773a0dd94d0d.json b/data/hfopenllm_v2/bunnycore/QandoraExp-7B-Persona/1b3ef805-8b0c-44bf-b048-773a0dd94d0d.json deleted file mode 100644 index 3676e6214..000000000 --- a/data/hfopenllm_v2/bunnycore/QandoraExp-7B-Persona/1b3ef805-8b0c-44bf-b048-773a0dd94d0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_QandoraExp-7B-Persona/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QandoraExp-7B-Persona", - "id": "bunnycore/QandoraExp-7B-Persona", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6247 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5558 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/QandoraExp-7B-v2/220cb478-58c0-4028-b51a-ec5fe1050746.json b/data/hfopenllm_v2/bunnycore/QandoraExp-7B-v2/220cb478-58c0-4028-b51a-ec5fe1050746.json deleted file mode 100644 index 69de747fd..000000000 --- a/data/hfopenllm_v2/bunnycore/QandoraExp-7B-v2/220cb478-58c0-4028-b51a-ec5fe1050746.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_QandoraExp-7B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QandoraExp-7B-v2", - "id": "bunnycore/QandoraExp-7B-v2", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5607 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5445 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4713 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/QandoraExp-7B/17cb8ab1-e7ba-4daf-95d4-2cdbd2777434.json b/data/hfopenllm_v2/bunnycore/QandoraExp-7B/17cb8ab1-e7ba-4daf-95d4-2cdbd2777434.json deleted file mode 100644 index 7fc71a8b5..000000000 --- a/data/hfopenllm_v2/bunnycore/QandoraExp-7B/17cb8ab1-e7ba-4daf-95d4-2cdbd2777434.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_QandoraExp-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QandoraExp-7B", - "id": "bunnycore/QandoraExp-7B", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4743 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4312 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.441 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT-R1/2b55023b-b8bc-42a2-aca8-dcaf39890232.json b/data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT-R1/2b55023b-b8bc-42a2-aca8-dcaf39890232.json deleted file mode 100644 index cfea94b78..000000000 --- a/data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT-R1/2b55023b-b8bc-42a2-aca8-dcaf39890232.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_QwQen-3B-LCoT-R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQen-3B-LCoT-R1", - "id": "bunnycore/QwQen-3B-LCoT-R1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5342 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4799 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT/31736569-5992-4b1d-9d66-27a6c1620506.json b/data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT/31736569-5992-4b1d-9d66-27a6c1620506.json deleted file mode 100644 index c7d4b48fe..000000000 --- a/data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT/31736569-5992-4b1d-9d66-27a6c1620506.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_QwQen-3B-LCoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQen-3B-LCoT", - "id": "bunnycore/QwQen-3B-LCoT", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6025 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4899 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Sky-T1/630b37b5-351c-403c-ac76-ccb68ffc5d53.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Sky-T1/630b37b5-351c-403c-ac76-ccb68ffc5d53.json deleted file mode 100644 index e8e23a052..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Sky-T1/630b37b5-351c-403c-ac76-ccb68ffc5d53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen-2.5-7B-Deep-Sky-T1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7B-Deep-Sky-T1", - "id": "bunnycore/Qwen-2.5-7B-Deep-Sky-T1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2104 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v1/69cdef01-30dc-4f75-97fa-9daeebcec72f.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v1/69cdef01-30dc-4f75-97fa-9daeebcec72f.json deleted file mode 100644 index 949efbe5f..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v1/69cdef01-30dc-4f75-97fa-9daeebcec72f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen-2.5-7B-Deep-Stock-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7B-Deep-Stock-v1", - "id": "bunnycore/Qwen-2.5-7B-Deep-Stock-v1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5361 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2644 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4109 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4066 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v4/9aa1acb0-c791-4dea-aa1e-c912cea69466.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v4/9aa1acb0-c791-4dea-aa1e-c912cea69466.json deleted file mode 100644 index 39d26d113..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v4/9aa1acb0-c791-4dea-aa1e-c912cea69466.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen-2.5-7B-Deep-Stock-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7B-Deep-Stock-v4", - "id": "bunnycore/Qwen-2.5-7B-Deep-Stock-v4", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7753 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5453 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4894 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4127 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4342 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v5/0c1d66f3-8fd7-47f2-8538-a1aa8985aebf.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v5/0c1d66f3-8fd7-47f2-8538-a1aa8985aebf.json deleted file mode 100644 index 7ef80c9ec..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v5/0c1d66f3-8fd7-47f2-8538-a1aa8985aebf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen-2.5-7B-Deep-Stock-v5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7B-Deep-Stock-v5", - "id": "bunnycore/Qwen-2.5-7B-Deep-Stock-v5", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1473 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2832 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Exp-Sce/2872dcd9-421b-4346-812c-b27bb32c6e86.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Exp-Sce/2872dcd9-421b-4346-812c-b27bb32c6e86.json deleted file mode 100644 
index e05a367ea..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Exp-Sce/2872dcd9-421b-4346-812c-b27bb32c6e86.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen-2.5-7B-Exp-Sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7B-Exp-Sce", - "id": "bunnycore/Qwen-2.5-7B-Exp-Sce", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-R1-Stock/2f3e2fc0-f1e0-43cb-8a8c-6aadcc538646.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-R1-Stock/2f3e2fc0-f1e0-43cb-8a8c-6aadcc538646.json deleted file mode 100644 index 126635626..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-R1-Stock/2f3e2fc0-f1e0-43cb-8a8c-6aadcc538646.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/bunnycore_Qwen-2.5-7B-R1-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7B-R1-Stock", - "id": "bunnycore/Qwen-2.5-7B-R1-Stock", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7573 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5393 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4294 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Stock-Deep-Bespoke/d0a76497-84b0-45b9-b748-04ffe9bc13a3.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Stock-Deep-Bespoke/d0a76497-84b0-45b9-b748-04ffe9bc13a3.json deleted file mode 100644 index 4940eaf52..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Stock-Deep-Bespoke/d0a76497-84b0-45b9-b748-04ffe9bc13a3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen-2.5-7B-Stock-Deep-Bespoke/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7B-Stock-Deep-Bespoke", - "id": "bunnycore/Qwen-2.5-7B-Stock-Deep-Bespoke", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5206 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7b-S1k/185b6560-6790-417f-aeba-f7405fee808a.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7b-S1k/185b6560-6790-417f-aeba-f7405fee808a.json deleted file mode 100644 index 4b2686474..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7b-S1k/185b6560-6790-417f-aeba-f7405fee808a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen-2.5-7b-S1k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7b-S1k", - "id": "bunnycore/Qwen-2.5-7b-S1k", - "developer": "bunnycore", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5563 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-1.5B-Model-Stock/30a8074e-df03-4866-9b8d-a5a7eece3c71.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-1.5B-Model-Stock/30a8074e-df03-4866-9b8d-a5a7eece3c71.json deleted file mode 100644 index 3a805a02c..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-1.5B-Model-Stock/30a8074e-df03-4866-9b8d-a5a7eece3c71.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-1.5B-Model-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1.5B-Model-Stock", - "id": "bunnycore/Qwen2.5-1.5B-Model-Stock", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.776 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1829 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2874 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.11 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v2/ac8874ae-d6d6-45d3-aabc-06a3852f68d0.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v2/ac8874ae-d6d6-45d3-aabc-06a3852f68d0.json deleted file mode 100644 index edae05519..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v2/ac8874ae-d6d6-45d3-aabc-06a3852f68d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-Model-Stock-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-Model-Stock-v2", - "id": "bunnycore/Qwen2.5-3B-Model-Stock-v2", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.396 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4677 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3915 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.1/bc98b048-18d4-438e-80c4-0cd851798da5.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.1/bc98b048-18d4-438e-80c4-0cd851798da5.json deleted file mode 100644 index eae5a80e4..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.1/bc98b048-18d4-438e-80c4-0cd851798da5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-Model-Stock-v3.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-Model-Stock-v3.1", - "id": "bunnycore/Qwen2.5-3B-Model-Stock-v3.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.396 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": 
"BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4737 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3897 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.2/c88c011f-0a24-4e78-a104-035d25af2430.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.2/c88c011f-0a24-4e78-a104-035d25af2430.json deleted file mode 100644 index 6dedfe6ac..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.2/c88c011f-0a24-4e78-a104-035d25af2430.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-Model-Stock-v3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-Model-Stock-v3.2", - "id": "bunnycore/Qwen2.5-3B-Model-Stock-v3.2", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.396 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6353 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3294 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v4.1/f9e3c31c-02c0-4f5e-ad4f-3be0801a0f41.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v4.1/f9e3c31c-02c0-4f5e-ad4f-3be0801a0f41.json deleted file mode 100644 index e50f85cd0..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v4.1/f9e3c31c-02c0-4f5e-ad4f-3be0801a0f41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-Model-Stock-v4.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-Model-Stock-v4.1", - "id": "bunnycore/Qwen2.5-3B-Model-Stock-v4.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.396 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3941 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock/5484405a-2ec8-4515-af75-76a5dd348d3d.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock/5484405a-2ec8-4515-af75-76a5dd348d3d.json deleted file mode 100644 index 1e65375e1..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock/5484405a-2ec8-4515-af75-76a5dd348d3d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-Model-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-Model-Stock", - "id": "bunnycore/Qwen2.5-3B-Model-Stock", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.396 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4712 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3942 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Mix/7dc117b9-c2a2-44c1-8471-f3bc8a116e3e.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Mix/7dc117b9-c2a2-44c1-8471-f3bc8a116e3e.json deleted file mode 100644 index 3bc4e4b91..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Mix/7dc117b9-c2a2-44c1-8471-f3bc8a116e3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-RP-Mix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-RP-Mix", - "id": "bunnycore/Qwen2.5-3B-RP-Mix", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4894 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker-V2/e2d314dd-b5b3-49b5-8e64-1e3464f4b963.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker-V2/e2d314dd-b5b3-49b5-8e64-1e3464f4b963.json deleted file mode 100644 index 8be20f53d..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker-V2/e2d314dd-b5b3-49b5-8e64-1e3464f4b963.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-RP-Thinker-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-RP-Thinker-V2", - "id": "bunnycore/Qwen2.5-3B-RP-Thinker-V2", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4678 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3271 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker/7ecb453b-1ba7-44ec-abfd-1f8be4c817fd.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker/7ecb453b-1ba7-44ec-abfd-1f8be4c817fd.json deleted file mode 100644 index aa4332eb8..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker/7ecb453b-1ba7-44ec-abfd-1f8be4c817fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-RP-Thinker/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-RP-Thinker", - "id": "bunnycore/Qwen2.5-3B-RP-Thinker", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5894 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-CyberRombos/d0a70e95-fc72-41c6-ac42-09b8f379b566.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-CyberRombos/d0a70e95-fc72-41c6-ac42-09b8f379b566.json deleted file mode 100644 index d0560f8b6..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-CyberRombos/d0a70e95-fc72-41c6-ac42-09b8f379b566.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-CyberRombos/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-CyberRombos", - "id": "bunnycore/Qwen2.5-7B-CyberRombos", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Fuse-Exp/e2ef8ea6-b464-445e-81df-ef0779c1d0d4.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Fuse-Exp/e2ef8ea6-b464-445e-81df-ef0779c1d0d4.json deleted file mode 100644 index 3450ede55..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Fuse-Exp/e2ef8ea6-b464-445e-81df-ef0779c1d0d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-Fuse-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Fuse-Exp", - "id": "bunnycore/Qwen2.5-7B-Fuse-Exp", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5469 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4573 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.3309 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Fusion/f3d7cca2-141c-4b84-abc4-396ad2d59e3c.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Fusion/f3d7cca2-141c-4b84-abc4-396ad2d59e3c.json deleted file mode 100644 index 96f112f7d..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Fusion/f3d7cca2-141c-4b84-abc4-396ad2d59e3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-Instruct-Fusion/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-Fusion", - "id": "bunnycore/Qwen2.5-7B-Instruct-Fusion", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4297 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4467 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Merge-Stock-v0.1/e3f48d7a-c8a3-4e75-99d6-7f2946696b12.json 
b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Merge-Stock-v0.1/e3f48d7a-c8a3-4e75-99d6-7f2946696b12.json deleted file mode 100644 index 192a551c4..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Merge-Stock-v0.1/e3f48d7a-c8a3-4e75-99d6-7f2946696b12.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-Instruct-Merge-Stock-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-Merge-Stock-v0.1", - "id": "bunnycore/Qwen2.5-7B-Instruct-Merge-Stock-v0.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5529 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4894 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-Sce-V0.3/3feb9449-49a2-427f-a317-c21e6d1ca66c.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-Sce-V0.3/3feb9449-49a2-427f-a317-c21e6d1ca66c.json deleted file mode 100644 index ce27f83f1..000000000 
--- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-Sce-V0.3/3feb9449-49a2-427f-a317-c21e6d1ca66c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-MixStock-Sce-V0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-MixStock-Sce-V0.3", - "id": "bunnycore/Qwen2.5-7B-MixStock-Sce-V0.3", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3479 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3714 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1779 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-V0.1/6359e37e-0405-436b-903c-8f0e740dd6c7.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-V0.1/6359e37e-0405-436b-903c-8f0e740dd6c7.json deleted file mode 100644 index cda1c1002..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-V0.1/6359e37e-0405-436b-903c-8f0e740dd6c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/bunnycore_Qwen2.5-7B-MixStock-V0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-MixStock-V0.1", - "id": "bunnycore/Qwen2.5-7B-MixStock-V0.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5479 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Stock/f5daed76-f6e5-4a7d-84d7-80537a046b83.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Stock/f5daed76-f6e5-4a7d-84d7-80537a046b83.json deleted file mode 100644 index 7a2c17b90..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Stock/f5daed76-f6e5-4a7d-84d7-80537a046b83.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-R1-Bespoke-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-R1-Bespoke-Stock", - "id": "bunnycore/Qwen2.5-7B-R1-Bespoke-Stock", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3726 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4822 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2047 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3472 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Task/03af2b1d-989f-4afc-ab13-8793093b9c50.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Task/03af2b1d-989f-4afc-ab13-8793093b9c50.json deleted file mode 100644 index c62431a02..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Task/03af2b1d-989f-4afc-ab13-8793093b9c50.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-R1-Bespoke-Task/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-R1-Bespoke-Task", - "id": 
"bunnycore/Qwen2.5-7B-R1-Bespoke-Task", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3569 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2688 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M-Thinker/5db7ec54-7feb-4c11-b2e0-042226ba1f94.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M-Thinker/5db7ec54-7feb-4c11-b2e0-042226ba1f94.json deleted file mode 100644 index 8ddd86827..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M-Thinker/5db7ec54-7feb-4c11-b2e0-042226ba1f94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-RRP-1M-Thinker/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-RRP-1M-Thinker", - "id": "bunnycore/Qwen2.5-7B-RRP-1M-Thinker", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - 
"params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2308 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1769 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M/f1f5615d-8a78-43c9-b5c6-edc180252381.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M/f1f5615d-8a78-43c9-b5c6-edc180252381.json deleted file mode 100644 index 80b3d9332..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M/f1f5615d-8a78-43c9-b5c6-edc180252381.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-RRP-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-RRP-1M", - "id": "bunnycore/Qwen2.5-7B-RRP-1M", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4483 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-ID/9c89bf8f-4b8a-4c01-8685-fafc687c673e.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-ID/9c89bf8f-4b8a-4c01-8685-fafc687c673e.json deleted file mode 100644 index ebca5e44d..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-ID/9c89bf8f-4b8a-4c01-8685-fafc687c673e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-RRP-ID/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-RRP-ID", - "id": "bunnycore/Qwen2.5-7B-RRP-ID", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Sky-R1-Mini/58b69c0f-826d-414f-915e-dd0b78d9298c.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Sky-R1-Mini/58b69c0f-826d-414f-915e-dd0b78d9298c.json deleted file mode 100644 index 592f70f7e..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Sky-R1-Mini/58b69c0f-826d-414f-915e-dd0b78d9298c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-Sky-R1-Mini/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Sky-R1-Mini", - "id": "bunnycore/Qwen2.5-7B-Sky-R1-Mini", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2305 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.3503 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1253 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/QwenMosaic-7B/101ea548-2ffe-4f47-b3b5-5fbe9a3854b4.json b/data/hfopenllm_v2/bunnycore/QwenMosaic-7B/101ea548-2ffe-4f47-b3b5-5fbe9a3854b4.json deleted file mode 100644 index c1f5056f6..000000000 --- a/data/hfopenllm_v2/bunnycore/QwenMosaic-7B/101ea548-2ffe-4f47-b3b5-5fbe9a3854b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_QwenMosaic-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenMosaic-7B", - "id": "bunnycore/QwenMosaic-7B", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5819 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5564 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4164 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Smol-Llama-3.2-3B/259c4798-ff03-4f58-8fb4-59150710212b.json b/data/hfopenllm_v2/bunnycore/Smol-Llama-3.2-3B/259c4798-ff03-4f58-8fb4-59150710212b.json deleted file mode 100644 index 3ffc953ab..000000000 --- a/data/hfopenllm_v2/bunnycore/Smol-Llama-3.2-3B/259c4798-ff03-4f58-8fb4-59150710212b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Smol-Llama-3.2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Smol-Llama-3.2-3B", - "id": "bunnycore/Smol-Llama-3.2-3B", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1382 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3228 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/SmolLM2-1.7-Persona/f731caa1-f777-494a-8490-da0c815f0708.json b/data/hfopenllm_v2/bunnycore/SmolLM2-1.7-Persona/f731caa1-f777-494a-8490-da0c815f0708.json deleted file mode 100644 index 1cde2b7fa..000000000 --- a/data/hfopenllm_v2/bunnycore/SmolLM2-1.7-Persona/f731caa1-f777-494a-8490-da0c815f0708.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_SmolLM2-1.7-Persona/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-1.7-Persona", - "id": "bunnycore/SmolLM2-1.7-Persona", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3623 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1974 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/SmolLM2-1.7B-roleplay-lora/d4d25d38-b21a-490e-9ca9-556504ec00ea.json b/data/hfopenllm_v2/bunnycore/SmolLM2-1.7B-roleplay-lora/d4d25d38-b21a-490e-9ca9-556504ec00ea.json deleted file mode 100644 index fc1b7c8b5..000000000 --- a/data/hfopenllm_v2/bunnycore/SmolLM2-1.7B-roleplay-lora/d4d25d38-b21a-490e-9ca9-556504ec00ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_SmolLM2-1.7B-roleplay-lora/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-1.7B-roleplay-lora", - "id": "bunnycore/SmolLM2-1.7B-roleplay-lora", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 3.423 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1966 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Tulu-3.1-8B-SuperNova/75bb85a3-40bb-4630-95a0-50e40b008412.json b/data/hfopenllm_v2/bunnycore/Tulu-3.1-8B-SuperNova/75bb85a3-40bb-4630-95a0-50e40b008412.json deleted file mode 100644 index c128cbf2c..000000000 --- a/data/hfopenllm_v2/bunnycore/Tulu-3.1-8B-SuperNova/75bb85a3-40bb-4630-95a0-50e40b008412.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Tulu-3.1-8B-SuperNova/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tulu-3.1-8B-SuperNova", - "id": "bunnycore/Tulu-3.1-8B-SuperNova", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8194 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2462 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/byroneverson/Mistral-Small-Instruct-2409-abliterated/bb44f3ef-eefa-48ef-a257-2eb345c89a00.json b/data/hfopenllm_v2/byroneverson/Mistral-Small-Instruct-2409-abliterated/bb44f3ef-eefa-48ef-a257-2eb345c89a00.json deleted file mode 100644 index 096e9bd79..000000000 --- a/data/hfopenllm_v2/byroneverson/Mistral-Small-Instruct-2409-abliterated/bb44f3ef-eefa-48ef-a257-2eb345c89a00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/byroneverson_Mistral-Small-Instruct-2409-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-Instruct-2409-abliterated", - "id": "byroneverson/Mistral-Small-Instruct-2409-abliterated", - "developer": "byroneverson", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6971 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5238 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-16K-abliterated/2dcf1771-3dbe-43ad-974c-54e2e2860bcc.json b/data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-16K-abliterated/2dcf1771-3dbe-43ad-974c-54e2e2860bcc.json deleted file mode 100644 index 7084716cd..000000000 --- a/data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-16K-abliterated/2dcf1771-3dbe-43ad-974c-54e2e2860bcc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/byroneverson_Yi-1.5-9B-Chat-16K-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-9B-Chat-16K-abliterated", - "id": "byroneverson/Yi-1.5-9B-Chat-16K-abliterated", - "developer": "byroneverson", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4734 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3823 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-abliterated/caa0c8df-5488-4bf9-a5b8-0fff831e6732.json b/data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-abliterated/caa0c8df-5488-4bf9-a5b8-0fff831e6732.json deleted file mode 100644 index 0098a2f51..000000000 --- a/data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-abliterated/caa0c8df-5488-4bf9-a5b8-0fff831e6732.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/byroneverson_Yi-1.5-9B-Chat-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-9B-Chat-abliterated", - "id": "byroneverson/Yi-1.5-9B-Chat-abliterated", - "developer": "byroneverson", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5723 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5401 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1662 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4389 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3715 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/c10x/Q-Pluse/c6f8e581-e849-4e28-b3a6-1838ee522770.json b/data/hfopenllm_v2/c10x/Q-Pluse/c6f8e581-e849-4e28-b3a6-1838ee522770.json deleted file mode 100644 index 10eb44f61..000000000 --- a/data/hfopenllm_v2/c10x/Q-Pluse/c6f8e581-e849-4e28-b3a6-1838ee522770.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/c10x_Q-Pluse/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q-Pluse", - "id": "c10x/Q-Pluse", - "developer": "c10x", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2875 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1135 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/c10x/longthinker/f0c361a1-a3ac-4415-ab5d-069bdf27e7a3.json b/data/hfopenllm_v2/c10x/longthinker/f0c361a1-a3ac-4415-ab5d-069bdf27e7a3.json deleted file mode 100644 index c9bd5fd8a..000000000 --- a/data/hfopenllm_v2/c10x/longthinker/f0c361a1-a3ac-4415-ab5d-069bdf27e7a3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/c10x_longthinker/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "longthinker", - "id": "c10x/longthinker", - "developer": "c10x", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3609 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4927 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2319 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/carsenk/flippa-v6/44129be7-f73d-4580-8375-e8ef324e73a8.json b/data/hfopenllm_v2/carsenk/flippa-v6/44129be7-f73d-4580-8375-e8ef324e73a8.json deleted file mode 100644 index 5fe1e9b7d..000000000 --- a/data/hfopenllm_v2/carsenk/flippa-v6/44129be7-f73d-4580-8375-e8ef324e73a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/carsenk_flippa-v6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "flippa-v6", - "id": "carsenk/flippa-v6", - "developer": "carsenk", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 16.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3439 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1405 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4089 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3668 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/carsenk/phi3.5_mini_exp_825_uncensored/2925ecde-a9a5-4369-b391-d23a8605d35c.json b/data/hfopenllm_v2/carsenk/phi3.5_mini_exp_825_uncensored/2925ecde-a9a5-4369-b391-d23a8605d35c.json deleted file mode 100644 index e36d77eec..000000000 --- a/data/hfopenllm_v2/carsenk/phi3.5_mini_exp_825_uncensored/2925ecde-a9a5-4369-b391-d23a8605d35c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/carsenk_phi3.5_mini_exp_825_uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi3.5_mini_exp_825_uncensored", - "id": "carsenk/phi3.5_mini_exp_825_uncensored", - "developer": "carsenk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1364 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2965 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1175 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1-evol-1/8409e464-fd16-4b41-b533-2f6cae4fe894.json b/data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1-evol-1/8409e464-fd16-4b41-b533-2f6cae4fe894.json deleted file mode 100644 index 0abe1f2fd..000000000 --- a/data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1-evol-1/8409e464-fd16-4b41-b533-2f6cae4fe894.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cat-searcher_gemma-2-9b-it-sppo-iter-1-evol-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it-sppo-iter-1-evol-1", - "id": "cat-searcher/gemma-2-9b-it-sppo-iter-1-evol-1", - "developer": "cat-searcher", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2942 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5939 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1/86f6c6eb-8b08-4e6c-a1bc-0d941a00f10b.json b/data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1/86f6c6eb-8b08-4e6c-a1bc-0d941a00f10b.json deleted file mode 100644 index a688e85bd..000000000 --- a/data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1/86f6c6eb-8b08-4e6c-a1bc-0d941a00f10b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cat-searcher_gemma-2-9b-it-sppo-iter-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it-sppo-iter-1", - "id": "cat-searcher/gemma-2-9b-it-sppo-iter-1", - "developer": "cat-searcher", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3015 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5972 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3854 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cckm/tinymistral_950m/aa2e6df7-a0b0-42f7-8057-e2763fc34834.json b/data/hfopenllm_v2/cckm/tinymistral_950m/aa2e6df7-a0b0-42f7-8057-e2763fc34834.json deleted file mode 100644 index c5f1e1fe2..000000000 --- a/data/hfopenllm_v2/cckm/tinymistral_950m/aa2e6df7-a0b0-42f7-8057-e2763fc34834.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cckm_tinymistral_950m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tinymistral_950m", - "id": "cckm/tinymistral_950m", - "developer": "cckm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 0.955 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1096 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cgato/TheSalt-L3-8b-v0.3.2/2bf9a06e-f3bf-4b55-804b-e553a722e0de.json b/data/hfopenllm_v2/cgato/TheSalt-L3-8b-v0.3.2/2bf9a06e-f3bf-4b55-804b-e553a722e0de.json deleted file mode 100644 index 56d6c2c5f..000000000 --- a/data/hfopenllm_v2/cgato/TheSalt-L3-8b-v0.3.2/2bf9a06e-f3bf-4b55-804b-e553a722e0de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cgato_TheSalt-L3-8b-v0.3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TheSalt-L3-8b-v0.3.2", - "id": "cgato/TheSalt-L3-8b-v0.3.2", - "developer": "cgato", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2705 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/chargoddard/prometheus-2-llama-3-8b/b380a675-39ea-4950-ad0a-d9771f09ddde.json b/data/hfopenllm_v2/chargoddard/prometheus-2-llama-3-8b/b380a675-39ea-4950-ad0a-d9771f09ddde.json deleted file mode 100644 index fbec5b2fc..000000000 --- a/data/hfopenllm_v2/chargoddard/prometheus-2-llama-3-8b/b380a675-39ea-4950-ad0a-d9771f09ddde.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/chargoddard_prometheus-2-llama-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "prometheus-2-llama-3-8b", - "id": "chargoddard/prometheus-2-llama-3-8b", - "developer": "chargoddard", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5289 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4931 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/chujiezheng/Llama-3-Instruct-8B-SimPO-ExPO/482358eb-7d3b-4de0-b5d9-451308f104e2.json b/data/hfopenllm_v2/chujiezheng/Llama-3-Instruct-8B-SimPO-ExPO/482358eb-7d3b-4de0-b5d9-451308f104e2.json deleted file mode 100644 index 122ad0855..000000000 --- a/data/hfopenllm_v2/chujiezheng/Llama-3-Instruct-8B-SimPO-ExPO/482358eb-7d3b-4de0-b5d9-451308f104e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/chujiezheng_Llama-3-Instruct-8B-SimPO-ExPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SimPO-ExPO", - "id": "chujiezheng/Llama-3-Instruct-8B-SimPO-ExPO", - "developer": "chujiezheng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6434 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4765 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/chujiezheng/Mistral7B-PairRM-SPPO-ExPO/ef04a83d-7b89-43ec-ba33-30e1006422dc.json b/data/hfopenllm_v2/chujiezheng/Mistral7B-PairRM-SPPO-ExPO/ef04a83d-7b89-43ec-ba33-30e1006422dc.json deleted file mode 100644 index c2d924a8a..000000000 --- a/data/hfopenllm_v2/chujiezheng/Mistral7B-PairRM-SPPO-ExPO/ef04a83d-7b89-43ec-ba33-30e1006422dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/chujiezheng_Mistral7B-PairRM-SPPO-ExPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral7B-PairRM-SPPO-ExPO", - "id": "chujiezheng/Mistral7B-PairRM-SPPO-ExPO", - "developer": "chujiezheng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4055 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2552 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cjvt/GaMS-1B/7b64cf2e-c7c6-4b48-8e51-ea2aa0914145.json b/data/hfopenllm_v2/cjvt/GaMS-1B/7b64cf2e-c7c6-4b48-8e51-ea2aa0914145.json deleted file mode 100644 index 9129e2209..000000000 --- a/data/hfopenllm_v2/cjvt/GaMS-1B/7b64cf2e-c7c6-4b48-8e51-ea2aa0914145.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cjvt_GaMS-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GaMS-1B", - "id": "cjvt/GaMS-1B", - "developer": "cjvt", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "OPTForCausalLM", - "params_billions": 1.54 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1635 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1149 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cloudyu/Llama-3-70Bx2-MOE/52c8e3f4-1063-4d9c-80d9-fdd0a72fc98e.json b/data/hfopenllm_v2/cloudyu/Llama-3-70Bx2-MOE/52c8e3f4-1063-4d9c-80d9-fdd0a72fc98e.json deleted file mode 100644 index 91fc09dc7..000000000 --- a/data/hfopenllm_v2/cloudyu/Llama-3-70Bx2-MOE/52c8e3f4-1063-4d9c-80d9-fdd0a72fc98e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cloudyu_Llama-3-70Bx2-MOE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-70Bx2-MOE", - "id": "cloudyu/Llama-3-70Bx2-MOE", - "developer": "cloudyu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 126.926 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6636 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2175 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cloudyu/Llama-3.2-3Bx4/1f4a827d-31cd-42e6-871d-7c0cad010f58.json b/data/hfopenllm_v2/cloudyu/Llama-3.2-3Bx4/1f4a827d-31cd-42e6-871d-7c0cad010f58.json deleted file mode 100644 index 2d3c69d77..000000000 --- a/data/hfopenllm_v2/cloudyu/Llama-3.2-3Bx4/1f4a827d-31cd-42e6-871d-7c0cad010f58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cloudyu_Llama-3.2-3Bx4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3Bx4", - "id": "cloudyu/Llama-3.2-3Bx4", - "developer": "cloudyu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 9.949 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5069 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2985 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cloudyu/Mixtral_11Bx2_MoE_19B/56d6d99c-fba1-42e7-aad4-631370b44da3.json b/data/hfopenllm_v2/cloudyu/Mixtral_11Bx2_MoE_19B/56d6d99c-fba1-42e7-aad4-631370b44da3.json deleted file mode 100644 index 1913318a3..000000000 --- a/data/hfopenllm_v2/cloudyu/Mixtral_11Bx2_MoE_19B/56d6d99c-fba1-42e7-aad4-631370b44da3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cloudyu_Mixtral_11Bx2_MoE_19B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral_11Bx2_MoE_19B", - "id": "cloudyu/Mixtral_11Bx2_MoE_19B", - "developer": "cloudyu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 19.188 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4297 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cloudyu/Mixtral_34Bx2_MoE_60B/006a0ac7-d6c3-42c1-b0cc-6a0bfe74f884.json 
b/data/hfopenllm_v2/cloudyu/Mixtral_34Bx2_MoE_60B/006a0ac7-d6c3-42c1-b0cc-6a0bfe74f884.json deleted file mode 100644 index 10f736b8c..000000000 --- a/data/hfopenllm_v2/cloudyu/Mixtral_34Bx2_MoE_60B/006a0ac7-d6c3-42c1-b0cc-6a0bfe74f884.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cloudyu_Mixtral_34Bx2_MoE_60B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral_34Bx2_MoE_60B", - "id": "cloudyu/Mixtral_34Bx2_MoE_60B", - "developer": "cloudyu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 60.814 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4538 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.587 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4625 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cloudyu/Mixtral_7Bx2_MoE/33a82686-6202-4a4d-ba34-bd4537105e5f.json b/data/hfopenllm_v2/cloudyu/Mixtral_7Bx2_MoE/33a82686-6202-4a4d-ba34-bd4537105e5f.json deleted file mode 100644 index 3cdeaa925..000000000 --- a/data/hfopenllm_v2/cloudyu/Mixtral_7Bx2_MoE/33a82686-6202-4a4d-ba34-bd4537105e5f.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cloudyu_Mixtral_7Bx2_MoE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral_7Bx2_MoE", - "id": "cloudyu/Mixtral_7Bx2_MoE", - "developer": "cloudyu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.448 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3044 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cloudyu/S1-Llama-3.2-3Bx4-MoE/38d45554-44bd-4b40-b7c9-c0b7ba44b862.json b/data/hfopenllm_v2/cloudyu/S1-Llama-3.2-3Bx4-MoE/38d45554-44bd-4b40-b7c9-c0b7ba44b862.json deleted file mode 100644 index 7127e4c67..000000000 --- a/data/hfopenllm_v2/cloudyu/S1-Llama-3.2-3Bx4-MoE/38d45554-44bd-4b40-b7c9-c0b7ba44b862.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cloudyu_S1-Llama-3.2-3Bx4-MoE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "S1-Llama-3.2-3Bx4-MoE", - "id": "cloudyu/S1-Llama-3.2-3Bx4-MoE", - "developer": "cloudyu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 9.555 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3044 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cloudyu/Yi-34Bx2-MoE-60B-DPO/37d7e3ab-db9c-4ad7-81d1-933c030a6250.json b/data/hfopenllm_v2/cloudyu/Yi-34Bx2-MoE-60B-DPO/37d7e3ab-db9c-4ad7-81d1-933c030a6250.json deleted file mode 100644 index 44ea55fed..000000000 --- a/data/hfopenllm_v2/cloudyu/Yi-34Bx2-MoE-60B-DPO/37d7e3ab-db9c-4ad7-81d1-933c030a6250.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cloudyu_Yi-34Bx2-MoE-60B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-34Bx2-MoE-60B-DPO", - "id": "cloudyu/Yi-34Bx2-MoE-60B-DPO", - "developer": "cloudyu", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 60.814 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5319 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5168 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4677 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-ipo/9cc49b3c-4e51-4f67-92ea-4ac8a3cbed43.json b/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-ipo/9cc49b3c-4e51-4f67-92ea-4ac8a3cbed43.json deleted file mode 100644 index 3ef6239d5..000000000 --- a/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-ipo/9cc49b3c-4e51-4f67-92ea-4ac8a3cbed43.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cluebbers_Llama-3.1-8B-paraphrase-type-generation-apty-ipo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-paraphrase-type-generation-apty-ipo", - "id": "cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-ipo", - "developer": "cluebbers", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1327 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2591 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid/b6bd8515-4c95-40ce-b2d5-af8873d261ab.json b/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid/b6bd8515-4c95-40ce-b2d5-af8873d261ab.json deleted file mode 100644 index d349e2332..000000000 --- a/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid/b6bd8515-4c95-40ce-b2d5-af8873d261ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cluebbers_Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid", - "id": "cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid", - "developer": "cluebbers", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1318 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4306 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-etpc/d102e75d-3e20-482b-a243-bae3ec44e2bb.json b/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-etpc/d102e75d-3e20-482b-a243-bae3ec44e2bb.json deleted file mode 100644 index 8314442a4..000000000 --- a/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-etpc/d102e75d-3e20-482b-a243-bae3ec44e2bb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cluebbers_Llama-3.1-8B-paraphrase-type-generation-etpc/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-paraphrase-type-generation-etpc", - "id": "cluebbers/Llama-3.1-8B-paraphrase-type-generation-etpc", - "developer": "cluebbers", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1209 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2556 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.1-8B/68920da1-af71-4ccd-88b9-554e3c72c4dc.json b/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.1-8B/68920da1-af71-4ccd-88b9-554e3c72c4dc.json deleted file mode 100644 index e9c8f9f33..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.1-8B/68920da1-af71-4ccd-88b9-554e3c72c4dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_Dolphin3.0-Llama3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dolphin3.0-Llama3.1-8B", - "id": "cognitivecomputations/Dolphin3.0-Llama3.1-8B", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4916 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3653 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2992 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.2-1B/c0eb144f-c726-4a80-bce9-384fb7a641a7.json b/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.2-1B/c0eb144f-c726-4a80-bce9-384fb7a641a7.json deleted file mode 100644 index feeca651a..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.2-1B/c0eb144f-c726-4a80-bce9-384fb7a641a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_Dolphin3.0-Llama3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dolphin3.0-Llama3.2-1B", - "id": "cognitivecomputations/Dolphin3.0-Llama3.2-1B", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5428 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3122 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2299 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Qwen2.5-0.5B/0b26f82d-36f6-4fd0-a0fd-05e4a1368a6e.json b/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Qwen2.5-0.5B/0b26f82d-36f6-4fd0-a0fd-05e4a1368a6e.json deleted file mode 100644 index cf6e079cd..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Qwen2.5-0.5B/0b26f82d-36f6-4fd0-a0fd-05e4a1368a6e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_Dolphin3.0-Qwen2.5-0.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dolphin3.0-Qwen2.5-0.5B", - "id": "cognitivecomputations/Dolphin3.0-Qwen2.5-0.5B", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3114 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-R1-Mistral-24B/8fe4360a-0924-4386-b4cd-89069f7ff55f.json b/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-R1-Mistral-24B/8fe4360a-0924-4386-b4cd-89069f7ff55f.json deleted file mode 100644 index 5e1bed350..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-R1-Mistral-24B/8fe4360a-0924-4386-b4cd-89069f7ff55f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_Dolphin3.0-R1-Mistral-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dolphin3.0-R1-Mistral-24B", - "id": "cognitivecomputations/Dolphin3.0-R1-Mistral-24B", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3119 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3952 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3005 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9-llama3-8b/eeeb082b-7112-4a08-a87a-b2c9ae37efff.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9-llama3-8b/eeeb082b-7112-4a08-a87a-b2c9ae37efff.json deleted file mode 100644 index e996e7392..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9-llama3-8b/eeeb082b-7112-4a08-a87a-b2c9ae37efff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9-llama3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9-llama3-8b", - "id": "cognitivecomputations/dolphin-2.9-llama3-8b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.385 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.495 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2771 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-llama-3-70b/b8f933e9-867f-4934-9648-371d1e632116.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-llama-3-70b/b8f933e9-867f-4934-9648-371d1e632116.json deleted file mode 100644 index 96525d9b7..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-llama-3-70b/b8f933e9-867f-4934-9648-371d1e632116.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.1-llama-3-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.1-llama-3-70b", - "id": "cognitivecomputations/dolphin-2.9.1-llama-3-70b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.376 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5205 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4976 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-34b/8d225023-4b7e-48cd-ae67-6d00b541f17d.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-34b/8d225023-4b7e-48cd-ae67-6d00b541f17d.json deleted file mode 100644 index e70bb20f7..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-34b/8d225023-4b7e-48cd-ae67-6d00b541f17d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.1-yi-1.5-34b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.1-yi-1.5-34b", - "id": "cognitivecomputations/dolphin-2.9.1-yi-1.5-34b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6076 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1866 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4598 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4519 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-9b/ee3b45e7-a5d6-4fa8-8abd-f6a77d5a6d5b.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-9b/ee3b45e7-a5d6-4fa8-8abd-f6a77d5a6d5b.json deleted file mode 100644 index d916c8a8c..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-9b/ee3b45e7-a5d6-4fa8-8abd-f6a77d5a6d5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.1-yi-1.5-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.1-yi-1.5-9b", - "id": "cognitivecomputations/dolphin-2.9.1-yi-1.5-9b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4465 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4348 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3967 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/177ef040-da5c-4a65-adac-efdc555bd110.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/177ef040-da5c-4a65-adac-efdc555bd110.json deleted file mode 100644 index 2947eef38..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/177ef040-da5c-4a65-adac-efdc555bd110.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.2-Phi-3-Medium-abliterated", - "id": "cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3613 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6123 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4112 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4494 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/e9dc8337-eb35-4eb9-bca7-30ec1cd44092.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/e9dc8337-eb35-4eb9-bca7-30ec1cd44092.json deleted file mode 100644 index 118e42ec5..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/e9dc8337-eb35-4eb9-bca7-30ec1cd44092.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.2-Phi-3-Medium-abliterated", - "id": "cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6383 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4349 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4525 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium/f4549a39-0b28-4e06-998a-774f5f02cfba.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium/f4549a39-0b28-4e06-998a-774f5f02cfba.json deleted file mode 100644 index a8dbae20b..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium/f4549a39-0b28-4e06-998a-774f5f02cfba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.2-Phi-3-Medium", - "id": "cognitivecomputations/dolphin-2.9.2-Phi-3-Medium", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": -1.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4248 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6457 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-72b/a79af78a-adab-406f-995a-adb3893e1510.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-72b/a79af78a-adab-406f-995a-adb3893e1510.json deleted file mode 100644 index 027fcc0d4..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-72b/a79af78a-adab-406f-995a-adb3893e1510.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.2-qwen2-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.2-qwen2-72b", - "id": "cognitivecomputations/dolphin-2.9.2-qwen2-72b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6344 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6296 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4521 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5471 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-7b/4e8e457a-85eb-4afb-a9fe-8f8ce6eaf4d7.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-7b/4e8e457a-85eb-4afb-a9fe-8f8ce6eaf4d7.json deleted file mode 100644 index e8d10e1e5..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-7b/4e8e457a-85eb-4afb-a9fe-8f8ce6eaf4d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.2-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.2-qwen2-7b", - "id": "cognitivecomputations/dolphin-2.9.2-qwen2-7b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4894 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4051 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-Yi-1.5-34B-32k/eeb3a10a-d584-414a-90de-e018c47615c2.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-Yi-1.5-34B-32k/eeb3a10a-d584-414a-90de-e018c47615c2.json deleted file mode 100644 index a64db73d8..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-Yi-1.5-34B-32k/eeb3a10a-d584-414a-90de-e018c47615c2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.3-Yi-1.5-34B-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.3-Yi-1.5-34B-32k", - "id": "cognitivecomputations/dolphin-2.9.3-Yi-1.5-34B-32k", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3639 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.4311 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-7B-32k/e83dadb0-5092-48b8-b408-e6bb1ac8a0ba.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-7B-32k/e83dadb0-5092-48b8-b408-e6bb1ac8a0ba.json deleted file mode 100644 index 88711de67..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-7B-32k/e83dadb0-5092-48b8-b408-e6bb1ac8a0ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.3-mistral-7B-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.3-mistral-7B-32k", - "id": "cognitivecomputations/dolphin-2.9.3-mistral-7B-32k", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4126 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4813 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4643 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2821 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-nemo-12b/cebc7767-fbc9-45a2-808b-51e1a4f0f35c.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-nemo-12b/cebc7767-fbc9-45a2-808b-51e1a4f0f35c.json deleted file mode 100644 index 470ec6033..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-nemo-12b/cebc7767-fbc9-45a2-808b-51e1a4f0f35c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.3-mistral-nemo-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.3-mistral-nemo-12b", - "id": "cognitivecomputations/dolphin-2.9.3-mistral-nemo-12b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5601 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3377 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-gemma2-2b/b64b6416-b18b-47cc-a516-c613cd670b37.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-gemma2-2b/b64b6416-b18b-47cc-a516-c613cd670b37.json deleted file mode 100644 index 859c42976..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-gemma2-2b/b64b6416-b18b-47cc-a516-c613cd670b37.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.4-gemma2-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.4-gemma2-2b", - "id": "cognitivecomputations/dolphin-2.9.4-gemma2-2b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0896 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-llama3.1-8b/64e96d56-72a9-413f-8903-45821b98f71e.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-llama3.1-8b/64e96d56-72a9-413f-8903-45821b98f71e.json deleted file mode 100644 index f6d9ab604..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-llama3.1-8b/64e96d56-72a9-413f-8903-45821b98f71e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.4-llama3.1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.4-llama3.1-8b", - "id": "cognitivecomputations/dolphin-2.9.4-llama3.1-8b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1237 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/collaiborateorg/Collaiborator-MEDLLM-Llama-3-8B-v2/a3f44cfd-d1fc-4a3c-aa5b-a0f37fc4a192.json b/data/hfopenllm_v2/collaiborateorg/Collaiborator-MEDLLM-Llama-3-8B-v2/a3f44cfd-d1fc-4a3c-aa5b-a0f37fc4a192.json deleted file mode 100644 index faa327cca..000000000 --- a/data/hfopenllm_v2/collaiborateorg/Collaiborator-MEDLLM-Llama-3-8B-v2/a3f44cfd-d1fc-4a3c-aa5b-a0f37fc4a192.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/collaiborateorg_Collaiborator-MEDLLM-Llama-3-8B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Collaiborator-MEDLLM-Llama-3-8B-v2", - "id": "collaiborateorg/Collaiborator-MEDLLM-Llama-3-8B-v2", - "developer": "collaiborateorg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4648 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3481 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cpayne1303/cp2024-instruct/79314f48-d92b-4992-b3c6-d31278c0867a.json 
b/data/hfopenllm_v2/cpayne1303/cp2024-instruct/79314f48-d92b-4992-b3c6-d31278c0867a.json deleted file mode 100644 index 0a74e80db..000000000 --- a/data/hfopenllm_v2/cpayne1303/cp2024-instruct/79314f48-d92b-4992-b3c6-d31278c0867a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cpayne1303_cp2024-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cp2024-instruct", - "id": "cpayne1303/cp2024-instruct", - "developer": "cpayne1303", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1706 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2947 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cpayne1303/cp2024/5a007612-c8e7-4f6b-baa9-a21af7e908c6.json b/data/hfopenllm_v2/cpayne1303/cp2024/5a007612-c8e7-4f6b-baa9-a21af7e908c6.json deleted file mode 100644 index 5ffe334d2..000000000 --- a/data/hfopenllm_v2/cpayne1303/cp2024/5a007612-c8e7-4f6b-baa9-a21af7e908c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/cpayne1303_cp2024/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cp2024", - "id": "cpayne1303/cp2024", - "developer": "cpayne1303", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1658 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2985 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1101 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cpayne1303/llama-43m-beta/fdefdd3e-2d83-4430-bd95-e16a1935dff1.json b/data/hfopenllm_v2/cpayne1303/llama-43m-beta/fdefdd3e-2d83-4430-bd95-e16a1935dff1.json deleted file mode 100644 index d8c1991cd..000000000 --- a/data/hfopenllm_v2/cpayne1303/llama-43m-beta/fdefdd3e-2d83-4430-bd95-e16a1935dff1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cpayne1303_llama-43m-beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-43m-beta", - "id": "cpayne1303/llama-43m-beta", - "developer": "cpayne1303", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.043 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2965 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cpayne1303/llama-43m-beta/ffdd45bf-3409-4b92-909a-25a32ba27f82.json b/data/hfopenllm_v2/cpayne1303/llama-43m-beta/ffdd45bf-3409-4b92-909a-25a32ba27f82.json deleted file mode 100644 index 979e3c2d6..000000000 --- a/data/hfopenllm_v2/cpayne1303/llama-43m-beta/ffdd45bf-3409-4b92-909a-25a32ba27f82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cpayne1303_llama-43m-beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-43m-beta", - "id": "cpayne1303/llama-43m-beta", - "developer": "cpayne1303", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 0.043 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1916 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2977 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cpayne1303/smallcp2024/a78ab8ac-2c2e-405a-95ee-0d1d27cf533b.json b/data/hfopenllm_v2/cpayne1303/smallcp2024/a78ab8ac-2c2e-405a-95ee-0d1d27cf533b.json deleted file mode 100644 index fba1477ee..000000000 --- a/data/hfopenllm_v2/cpayne1303/smallcp2024/a78ab8ac-2c2e-405a-95ee-0d1d27cf533b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cpayne1303_smallcp2024/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smallcp2024", - "id": "cpayne1303/smallcp2024", - "developer": "cpayne1303", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.002 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1582 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3027 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3425 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/crestf411/MN-Slush/d9d49bf7-f6f0-4c25-9182-d815454940e3.json b/data/hfopenllm_v2/crestf411/MN-Slush/d9d49bf7-f6f0-4c25-9182-d815454940e3.json deleted file mode 100644 index 5057d4ade..000000000 --- a/data/hfopenllm_v2/crestf411/MN-Slush/d9d49bf7-f6f0-4c25-9182-d815454940e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/crestf411_MN-Slush/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-Slush", - "id": "crestf411/MN-Slush", - "developer": "crestf411", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3933 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3508 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cstr/llama3.1-8b-spaetzle-v90/deb48e93-0378-482f-8a5d-7ec350497e0b.json b/data/hfopenllm_v2/cstr/llama3.1-8b-spaetzle-v90/deb48e93-0378-482f-8a5d-7ec350497e0b.json deleted file mode 100644 index cf3237c07..000000000 --- a/data/hfopenllm_v2/cstr/llama3.1-8b-spaetzle-v90/deb48e93-0378-482f-8a5d-7ec350497e0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cstr_llama3.1-8b-spaetzle-v90/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3.1-8b-spaetzle-v90", - "id": "cstr/llama3.1-8b-spaetzle-v90", - "developer": "cstr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7356 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5303 - } - }, - { - "evaluation_name": "MATH 
Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1495 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4134 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3731 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cyberagent/calm3-22b-chat/302a9a47-8603-42d9-85fb-64c60e7c6f44.json b/data/hfopenllm_v2/cyberagent/calm3-22b-chat/302a9a47-8603-42d9-85fb-64c60e7c6f44.json deleted file mode 100644 index f7b5122d0..000000000 --- a/data/hfopenllm_v2/cyberagent/calm3-22b-chat/302a9a47-8603-42d9-85fb-64c60e7c6f44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cyberagent_calm3-22b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calm3-22b-chat", - "id": "cyberagent/calm3-22b-chat", - "developer": "cyberagent", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 22.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5091 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4553 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/darkc0de/BuddyGlassNeverSleeps/28d52801-3998-421f-a37a-2b7b677d0eaa.json b/data/hfopenllm_v2/darkc0de/BuddyGlassNeverSleeps/28d52801-3998-421f-a37a-2b7b677d0eaa.json deleted file mode 100644 index 8fd65a779..000000000 --- a/data/hfopenllm_v2/darkc0de/BuddyGlassNeverSleeps/28d52801-3998-421f-a37a-2b7b677d0eaa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/darkc0de_BuddyGlassNeverSleeps/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BuddyGlassNeverSleeps", - "id": "darkc0de/BuddyGlassNeverSleeps", - "developer": "darkc0de", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4977 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3452 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/darkc0de/BuddyGlassUncensored2025.2/32b4e23b-9430-45a8-bfa2-eea2e89792c4.json b/data/hfopenllm_v2/darkc0de/BuddyGlassUncensored2025.2/32b4e23b-9430-45a8-bfa2-eea2e89792c4.json deleted file mode 100644 index 0e633e15d..000000000 --- a/data/hfopenllm_v2/darkc0de/BuddyGlassUncensored2025.2/32b4e23b-9430-45a8-bfa2-eea2e89792c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/darkc0de_BuddyGlassUncensored2025.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BuddyGlassUncensored2025.2", - "id": "darkc0de/BuddyGlassUncensored2025.2", - "developer": "darkc0de", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7731 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6095 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2402 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4336 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/darkc0de/BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp/0336e168-e313-44cb-a030-42e6d20e92df.json b/data/hfopenllm_v2/darkc0de/BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp/0336e168-e313-44cb-a030-42e6d20e92df.json deleted file mode 100644 index d8470fccf..000000000 --- a/data/hfopenllm_v2/darkc0de/BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp/0336e168-e313-44cb-a030-42e6d20e92df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/darkc0de_BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp", - "id": "darkc0de/BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp", - "developer": "darkc0de", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.007 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/databricks/dbrx-base/11bd8b5b-2ea4-4ec5-8fe6-654aedb40fc9.json b/data/hfopenllm_v2/databricks/dbrx-base/11bd8b5b-2ea4-4ec5-8fe6-654aedb40fc9.json deleted file mode 100644 index e2da8eba9..000000000 --- a/data/hfopenllm_v2/databricks/dbrx-base/11bd8b5b-2ea4-4ec5-8fe6-654aedb40fc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/databricks_dbrx-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dbrx-base", - "id": "databricks/dbrx-base", - "developer": "databricks", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Unknown", - "params_billions": 0.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0821 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3267 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4067 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/databricks/dbrx-instruct/6d97749c-3bfa-4c32-b581-a5e2b73303f3.json b/data/hfopenllm_v2/databricks/dbrx-instruct/6d97749c-3bfa-4c32-b581-a5e2b73303f3.json deleted file mode 100644 index 3b824e9c7..000000000 --- a/data/hfopenllm_v2/databricks/dbrx-instruct/6d97749c-3bfa-4c32-b581-a5e2b73303f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/databricks_dbrx-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dbrx-instruct", - "id": "databricks/dbrx-instruct", - "developer": "databricks", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "DbrxForCausalLM", - "params_billions": 131.597 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5429 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4269 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/databricks/dolly-v1-6b/ec58907d-b67c-467e-a3dd-b9f9c10138f0.json b/data/hfopenllm_v2/databricks/dolly-v1-6b/ec58907d-b67c-467e-a3dd-b9f9c10138f0.json deleted file mode 100644 index ad7981890..000000000 --- a/data/hfopenllm_v2/databricks/dolly-v1-6b/ec58907d-b67c-467e-a3dd-b9f9c10138f0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/databricks_dolly-v1-6b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolly-v1-6b", - "id": "databricks/dolly-v1-6b", - "developer": "databricks", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTJForCausalLM", - "params_billions": 6.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2224 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/databricks/dolly-v2-12b/a7f09a3d-025c-48fa-9358-863b9ae382b1.json 
b/data/hfopenllm_v2/databricks/dolly-v2-12b/a7f09a3d-025c-48fa-9358-863b9ae382b1.json deleted file mode 100644 index 68d7dcd5a..000000000 --- a/data/hfopenllm_v2/databricks/dolly-v2-12b/a7f09a3d-025c-48fa-9358-863b9ae382b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/databricks_dolly-v2-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolly-v2-12b", - "id": "databricks/dolly-v2-12b", - "developer": "databricks", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 12.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2355 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.332 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/databricks/dolly-v2-3b/bf2be2d5-58de-4550-b733-a5910bded48d.json b/data/hfopenllm_v2/databricks/dolly-v2-3b/bf2be2d5-58de-4550-b733-a5910bded48d.json deleted file mode 100644 index 0f29fbdd0..000000000 --- a/data/hfopenllm_v2/databricks/dolly-v2-3b/bf2be2d5-58de-4550-b733-a5910bded48d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/databricks_dolly-v2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolly-v2-3b", - "id": "databricks/dolly-v2-3b", - "developer": "databricks", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2247 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/databricks/dolly-v2-7b/52b32c1f-6189-4850-b3f4-de442eb2ccb5.json b/data/hfopenllm_v2/databricks/dolly-v2-7b/52b32c1f-6189-4850-b3f4-de442eb2ccb5.json deleted file mode 100644 index 99647f517..000000000 --- a/data/hfopenllm_v2/databricks/dolly-v2-7b/52b32c1f-6189-4850-b3f4-de442eb2ccb5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/databricks_dolly-v2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolly-v2-7b", - "id": "databricks/dolly-v2-7b", - "developer": "databricks", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.201 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3173 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3553 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1149 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/davidkim205/Rhea-72b-v0.5/87b44160-c3dd-452d-8c15-c4f758f8db7b.json b/data/hfopenllm_v2/davidkim205/Rhea-72b-v0.5/87b44160-c3dd-452d-8c15-c4f758f8db7b.json deleted file mode 100644 index 6a35282cd..000000000 --- a/data/hfopenllm_v2/davidkim205/Rhea-72b-v0.5/87b44160-c3dd-452d-8c15-c4f758f8db7b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/davidkim205_Rhea-72b-v0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rhea-72b-v0.5", - "id": "davidkim205/Rhea-72b-v0.5", - "developer": "davidkim205", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 72.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0145 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3078 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1737 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/davidkim205/nox-solar-10.7b-v4/3e6814d3-54ea-493f-a9fc-85ae9eed1b05.json b/data/hfopenllm_v2/davidkim205/nox-solar-10.7b-v4/3e6814d3-54ea-493f-a9fc-85ae9eed1b05.json deleted file mode 100644 index 0227496f4..000000000 --- a/data/hfopenllm_v2/davidkim205/nox-solar-10.7b-v4/3e6814d3-54ea-493f-a9fc-85ae9eed1b05.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/davidkim205_nox-solar-10.7b-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nox-solar-10.7b-v4", - "id": "davidkim205/nox-solar-10.7b-v4", - "developer": "davidkim205", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4814 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3333 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/35b7ff42-3825-4240-97bf-f8af7e8c23ff.json b/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/35b7ff42-3825-4240-97bf-f8af7e8c23ff.json deleted file mode 100644 index d4fcf871a..000000000 --- a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/35b7ff42-3825-4240-97bf-f8af7e8c23ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_DeepSeek-R1-Distill-Llama-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Llama-70B", - "id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4748 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/c108173e-1582-4c99-9291-46986d7ba1cf.json b/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/c108173e-1582-4c99-9291-46986d7ba1cf.json deleted file mode 100644 index 50037a506..000000000 --- a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/c108173e-1582-4c99-9291-46986d7ba1cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_DeepSeek-R1-Distill-Llama-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Llama-8B", - "id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3239 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2089 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/6feb08b0-1c67-4fe2-a001-0b3b84529687.json b/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/6feb08b0-1c67-4fe2-a001-0b3b84529687.json deleted file mode 100644 index 642a4c84c..000000000 --- a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/6feb08b0-1c67-4fe2-a001-0b3b84529687.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-1.5B", - "id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3463 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1692 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3635 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1187 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/d4ab3df2-109a-4eec-9742-dc3bb79d5a58.json b/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/d4ab3df2-109a-4eec-9742-dc3bb79d5a58.json deleted file mode 100644 index 6772800d9..000000000 --- a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/d4ab3df2-109a-4eec-9742-dc3bb79d5a58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_DeepSeek-R1-Distill-Qwen-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B", - "id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5906 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4667 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/53ec995e-bcfd-4a72-bd9a-45d14da3f219.json b/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/53ec995e-bcfd-4a72-bd9a-45d14da3f219.json deleted file mode 100644 index 69ee0e546..000000000 --- a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/53ec995e-bcfd-4a72-bd9a-45d14da3f219.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_DeepSeek-R1-Distill-Qwen-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-32B", - "id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4526 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4687 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/299a0397-89c7-4329-9599-9fc29a52db87.json b/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/299a0397-89c7-4329-9599-9fc29a52db87.json deleted file mode 100644 index 8f7f946ab..000000000 --- a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/299a0397-89c7-4329-9599-9fc29a52db87.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_DeepSeek-R1-Distill-Qwen-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-7B", - "id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4038 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3443 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1956 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2321 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/deepseek-llm-67b-chat/41adbc32-6cdf-49ba-980c-6eb6f722b40b.json b/data/hfopenllm_v2/deepseek-ai/deepseek-llm-67b-chat/41adbc32-6cdf-49ba-980c-6eb6f722b40b.json deleted file mode 100644 index e02fd22ba..000000000 --- a/data/hfopenllm_v2/deepseek-ai/deepseek-llm-67b-chat/41adbc32-6cdf-49ba-980c-6eb6f722b40b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_deepseek-llm-67b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-llm-67b-chat", - "id": "deepseek-ai/deepseek-llm-67b-chat", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 67.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5587 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5059 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3944 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-base/4236ece5-f2b2-44e7-9503-9731bff20155.json b/data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-base/4236ece5-f2b2-44e7-9503-9731bff20155.json deleted file mode 100644 index fa4327008..000000000 --- a/data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-base/4236ece5-f2b2-44e7-9503-9731bff20155.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_deepseek-llm-7b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-llm-7b-base", - "id": "deepseek-ai/deepseek-llm-7b-base", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2179 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3503 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1806 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-chat/b33d672c-4a96-4093-bc13-25c42303b918.json b/data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-chat/b33d672c-4a96-4093-bc13-25c42303b918.json deleted file mode 100644 index cd8ef9eab..000000000 --- a/data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-chat/b33d672c-4a96-4093-bc13-25c42303b918.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_deepseek-llm-7b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-llm-7b-chat", - "id": "deepseek-ai/deepseek-llm-7b-chat", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3632 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4668 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2133 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-base/2b4f42fc-8b25-481c-98f7-911c52fdd242.json b/data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-base/2b4f42fc-8b25-481c-98f7-911c52fdd242.json deleted file mode 100644 index a987a5874..000000000 --- a/data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-base/2b4f42fc-8b25-481c-98f7-911c52fdd242.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_deepseek-moe-16b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-moe-16b-base", - "id": "deepseek-ai/deepseek-moe-16b-base", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "DeepseekForCausalLM", - "params_billions": 16.376 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-chat/634b7a64-2bd3-48b8-b2f4-a93189801850.json b/data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-chat/634b7a64-2bd3-48b8-b2f4-a93189801850.json deleted file mode 100644 index 92b956141..000000000 --- a/data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-chat/634b7a64-2bd3-48b8-b2f4-a93189801850.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_deepseek-moe-16b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-moe-16b-chat", - "id": "deepseek-ai/deepseek-moe-16b-chat", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "DeepseekForCausalLM", - "params_billions": 16.376 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2248 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3808 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/dfurman/CalmeRys-78B-Orpo-v0.1/72a4bcc3-9dfc-4268-be4e-cda5837a3da2.json b/data/hfopenllm_v2/dfurman/CalmeRys-78B-Orpo-v0.1/72a4bcc3-9dfc-4268-be4e-cda5837a3da2.json deleted file mode 100644 index 152bc2222..000000000 --- a/data/hfopenllm_v2/dfurman/CalmeRys-78B-Orpo-v0.1/72a4bcc3-9dfc-4268-be4e-cda5837a3da2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dfurman_CalmeRys-78B-Orpo-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CalmeRys-78B-Orpo-v0.1", - "id": "dfurman/CalmeRys-78B-Orpo-v0.1", - "developer": "dfurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8163 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4063 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4002 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5902 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7012 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dfurman/Llama-3-70B-Orpo-v0.1/78fa85f6-baff-4d95-ad3a-a0663f51b0a0.json b/data/hfopenllm_v2/dfurman/Llama-3-70B-Orpo-v0.1/78fa85f6-baff-4d95-ad3a-a0663f51b0a0.json deleted file mode 100644 index 2fb382007..000000000 --- 
a/data/hfopenllm_v2/dfurman/Llama-3-70B-Orpo-v0.1/78fa85f6-baff-4d95-ad3a-a0663f51b0a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dfurman_Llama-3-70B-Orpo-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-70B-Orpo-v0.1", - "id": "dfurman/Llama-3-70B-Orpo-v0.1", - "developer": "dfurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2049 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1579 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4534 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/359231a5-6eb9-4f73-a6f1-d7fd7f35c7ed.json b/data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/359231a5-6eb9-4f73-a6f1-d7fd7f35c7ed.json deleted file mode 100644 index cafc237a9..000000000 --- a/data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/359231a5-6eb9-4f73-a6f1-d7fd7f35c7ed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dfurman_Llama-3-8B-Orpo-v0.1/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Orpo-v0.1", - "id": "dfurman/Llama-3-8B-Orpo-v0.1", - "developer": "dfurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2835 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2298 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/79b81e37-f75e-4b18-b145-73c42625ced5.json b/data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/79b81e37-f75e-4b18-b145-73c42625ced5.json deleted file mode 100644 index f0f0c7311..000000000 --- a/data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/79b81e37-f75e-4b18-b145-73c42625ced5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dfurman_Llama-3-8B-Orpo-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Llama-3-8B-Orpo-v0.1", - "id": "dfurman/Llama-3-8B-Orpo-v0.1", - "developer": "dfurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dfurman/Qwen2-72B-Orpo-v0.1/2d99af7a-f67c-4e74-9ba2-f1401dfdf9fb.json b/data/hfopenllm_v2/dfurman/Qwen2-72B-Orpo-v0.1/2d99af7a-f67c-4e74-9ba2-f1401dfdf9fb.json deleted file mode 100644 index 561c35bcc..000000000 --- a/data/hfopenllm_v2/dfurman/Qwen2-72B-Orpo-v0.1/2d99af7a-f67c-4e74-9ba2-f1401dfdf9fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dfurman_Qwen2-72B-Orpo-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-72B-Orpo-v0.1", - "id": "dfurman/Qwen2-72B-Orpo-v0.1", - "developer": "dfurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.699 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4056 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4784 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5455 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dicta-il/dictalm2.0-instruct/315fa815-fab0-47c9-8185-00bc597c0176.json b/data/hfopenllm_v2/dicta-il/dictalm2.0-instruct/315fa815-fab0-47c9-8185-00bc597c0176.json deleted file mode 100644 index 66466a5b8..000000000 --- a/data/hfopenllm_v2/dicta-il/dictalm2.0-instruct/315fa815-fab0-47c9-8185-00bc597c0176.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dicta-il_dictalm2.0-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dictalm2.0-instruct", - "id": "dicta-il/dictalm2.0-instruct", - "developer": "dicta-il", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.251 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2605 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dicta-il/dictalm2.0/0c1686db-b396-4ecf-86f1-e4e092491acd.json b/data/hfopenllm_v2/dicta-il/dictalm2.0/0c1686db-b396-4ecf-86f1-e4e092491acd.json deleted file mode 100644 index d0fee0d39..000000000 --- a/data/hfopenllm_v2/dicta-il/dictalm2.0/0c1686db-b396-4ecf-86f1-e4e092491acd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dicta-il_dictalm2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dictalm2.0", - "id": "dicta-il/dictalm2.0", - "developer": "dicta-il", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.251 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2413 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2605 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/distilbert/distilgpt2/57455fbc-b5a9-4a3b-9a30-7da0593fd778.json b/data/hfopenllm_v2/distilbert/distilgpt2/57455fbc-b5a9-4a3b-9a30-7da0593fd778.json deleted file mode 100644 index 4f626e3c3..000000000 --- a/data/hfopenllm_v2/distilbert/distilgpt2/57455fbc-b5a9-4a3b-9a30-7da0593fd778.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/distilbert_distilgpt2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "distilgpt2", - "id": "distilbert/distilgpt2", - "developer": "distilbert", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.088 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0611 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3038 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": 
"MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1187 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/divyanshukunwar/SASTRI_1_9B/a8f9d0e6-5a1a-4d09-ac78-47fd586384df.json b/data/hfopenllm_v2/divyanshukunwar/SASTRI_1_9B/a8f9d0e6-5a1a-4d09-ac78-47fd586384df.json deleted file mode 100644 index 419e211aa..000000000 --- a/data/hfopenllm_v2/divyanshukunwar/SASTRI_1_9B/a8f9d0e6-5a1a-4d09-ac78-47fd586384df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/divyanshukunwar_SASTRI_1_9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SASTRI_1_9B", - "id": "divyanshukunwar/SASTRI_1_9B", - "developer": "divyanshukunwar", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 5.211 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.468 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3187 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B-ties-w-base/9d0d4eee-0b87-485c-843f-e32d08aa601b.json b/data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B-ties-w-base/9d0d4eee-0b87-485c-843f-e32d08aa601b.json deleted file mode 100644 index fd16ac893..000000000 --- a/data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B-ties-w-base/9d0d4eee-0b87-485c-843f-e32d08aa601b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna-test-lab_TEST-L3.2-ReWish-3B-ties-w-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TEST-L3.2-ReWish-3B-ties-w-base", - "id": "djuna-test-lab/TEST-L3.2-ReWish-3B-ties-w-base", - "developer": "djuna-test-lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6353 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B/e47c83ff-9a16-488b-8ccf-4a2fad2b14fc.json b/data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B/e47c83ff-9a16-488b-8ccf-4a2fad2b14fc.json deleted file mode 100644 index 0a5268a87..000000000 --- a/data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B/e47c83ff-9a16-488b-8ccf-4a2fad2b14fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna-test-lab_TEST-L3.2-ReWish-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TEST-L3.2-ReWish-3B", - "id": "djuna-test-lab/TEST-L3.2-ReWish-3B", - "developer": "djuna-test-lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6368 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/G2-BigGSHT-27B-2/8c7e25df-884d-4940-8185-4c1b82fac8c5.json b/data/hfopenllm_v2/djuna/G2-BigGSHT-27B-2/8c7e25df-884d-4940-8185-4c1b82fac8c5.json deleted file mode 100644 index 294e1f1ce..000000000 --- a/data/hfopenllm_v2/djuna/G2-BigGSHT-27B-2/8c7e25df-884d-4940-8185-4c1b82fac8c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_G2-BigGSHT-27B-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "G2-BigGSHT-27B-2", - "id": "djuna/G2-BigGSHT-27B-2", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7974 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4072 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/G2-GSHT/83611d50-01d0-4642-a104-daf77f1a0fe8.json b/data/hfopenllm_v2/djuna/G2-GSHT/83611d50-01d0-4642-a104-daf77f1a0fe8.json deleted file mode 100644 index efbf1c971..000000000 --- a/data/hfopenllm_v2/djuna/G2-GSHT/83611d50-01d0-4642-a104-daf77f1a0fe8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_G2-GSHT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "G2-GSHT", - "id": "djuna/G2-GSHT", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.563 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.527 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4006 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/Gemma-2-gemmama-9b/5cbdafba-6071-4da1-8b19-3de612e9ff18.json b/data/hfopenllm_v2/djuna/Gemma-2-gemmama-9b/5cbdafba-6071-4da1-8b19-3de612e9ff18.json deleted file mode 100644 index 2777bf875..000000000 --- a/data/hfopenllm_v2/djuna/Gemma-2-gemmama-9b/5cbdafba-6071-4da1-8b19-3de612e9ff18.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_Gemma-2-gemmama-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-gemmama-9b", - "id": "djuna/Gemma-2-gemmama-9b", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7703 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3109 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/djuna/L3.1-ForStHS/1c934cba-c94a-4aad-9645-84658e0b5588.json b/data/hfopenllm_v2/djuna/L3.1-ForStHS/1c934cba-c94a-4aad-9645-84658e0b5588.json deleted file mode 100644 index be9ca7b2a..000000000 --- a/data/hfopenllm_v2/djuna/L3.1-ForStHS/1c934cba-c94a-4aad-9645-84658e0b5588.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_L3.1-ForStHS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-ForStHS", - "id": "djuna/L3.1-ForStHS", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1503 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3735 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-1.5-calc/7aad3f6b-89d9-4c9e-9339-cf4111fc37c6.json b/data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-1.5-calc/7aad3f6b-89d9-4c9e-9339-cf4111fc37c6.json deleted file mode 100644 index 5e6b32a51..000000000 --- 
a/data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-1.5-calc/7aad3f6b-89d9-4c9e-9339-cf4111fc37c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_L3.1-Promissum_Mane-8B-Della-1.5-calc/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Promissum_Mane-8B-Della-1.5-calc", - "id": "djuna/L3.1-Promissum_Mane-8B-Della-1.5-calc", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5433 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3904 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-calc/38d4a8ca-4273-4e6a-8a39-3b5ff20ec461.json b/data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-calc/38d4a8ca-4273-4e6a-8a39-3b5ff20ec461.json deleted file mode 100644 index 2ab5da35c..000000000 --- a/data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-calc/38d4a8ca-4273-4e6a-8a39-3b5ff20ec461.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/djuna_L3.1-Promissum_Mane-8B-Della-calc/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Promissum_Mane-8B-Della-calc", - "id": "djuna/L3.1-Promissum_Mane-8B-Della-calc", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5442 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5486 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1843 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3802 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/L3.1-Purosani-2-8B/3d65fbc2-bf91-479c-a687-e9ef702794fb.json b/data/hfopenllm_v2/djuna/L3.1-Purosani-2-8B/3d65fbc2-bf91-479c-a687-e9ef702794fb.json deleted file mode 100644 index 14e62740c..000000000 --- a/data/hfopenllm_v2/djuna/L3.1-Purosani-2-8B/3d65fbc2-bf91-479c-a687-e9ef702794fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_L3.1-Purosani-2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Purosani-2-8B", - "id": "djuna/L3.1-Purosani-2-8B", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4988 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/L3.1-Suze-Vume-calc/650cdbbb-e066-4581-8d61-77aa6a4c402c.json b/data/hfopenllm_v2/djuna/L3.1-Suze-Vume-calc/650cdbbb-e066-4581-8d61-77aa6a4c402c.json deleted file mode 100644 index 7bf4c3862..000000000 --- a/data/hfopenllm_v2/djuna/L3.1-Suze-Vume-calc/650cdbbb-e066-4581-8d61-77aa6a4c402c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_L3.1-Suze-Vume-calc/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Suze-Vume-calc", - "id": "djuna/L3.1-Suze-Vume-calc", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7297 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/MN-Chinofun-12B-2/05d566c5-1810-483c-8ce0-84635b9457dc.json b/data/hfopenllm_v2/djuna/MN-Chinofun-12B-2/05d566c5-1810-483c-8ce0-84635b9457dc.json deleted file mode 100644 index 5e17b60d7..000000000 --- a/data/hfopenllm_v2/djuna/MN-Chinofun-12B-2/05d566c5-1810-483c-8ce0-84635b9457dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_MN-Chinofun-12B-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-Chinofun-12B-2", - "id": "djuna/MN-Chinofun-12B-2", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6171 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5037 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/MN-Chinofun-12B-3/37e3456a-92ff-4122-a697-ffbdc1c79555.json b/data/hfopenllm_v2/djuna/MN-Chinofun-12B-3/37e3456a-92ff-4122-a697-ffbdc1c79555.json deleted file mode 100644 index d1f440712..000000000 --- a/data/hfopenllm_v2/djuna/MN-Chinofun-12B-3/37e3456a-92ff-4122-a697-ffbdc1c79555.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_MN-Chinofun-12B-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-Chinofun-12B-3", - "id": "djuna/MN-Chinofun-12B-3", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3053 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5348 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1005 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3026 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/MN-Chinofun-12B-4/70c908d4-f1bf-4553-9bf7-95eb593b4853.json b/data/hfopenllm_v2/djuna/MN-Chinofun-12B-4/70c908d4-f1bf-4553-9bf7-95eb593b4853.json deleted file mode 100644 index c46c8ff73..000000000 --- a/data/hfopenllm_v2/djuna/MN-Chinofun-12B-4/70c908d4-f1bf-4553-9bf7-95eb593b4853.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_MN-Chinofun-12B-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-Chinofun-12B-4", - "id": "djuna/MN-Chinofun-12B-4", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.5348 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4307 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/MN-Chinofun/2ccc9c20-5414-4286-abcd-ad2b20f8652d.json b/data/hfopenllm_v2/djuna/MN-Chinofun/2ccc9c20-5414-4286-abcd-ad2b20f8652d.json deleted file mode 100644 index 3bb80844d..000000000 --- a/data/hfopenllm_v2/djuna/MN-Chinofun/2ccc9c20-5414-4286-abcd-ad2b20f8652d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_MN-Chinofun/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-Chinofun", - "id": "djuna/MN-Chinofun", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4953 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/Q2.5-Partron-7B/50f4560a-e172-42b9-b552-437aff158a38.json b/data/hfopenllm_v2/djuna/Q2.5-Partron-7B/50f4560a-e172-42b9-b552-437aff158a38.json deleted file mode 100644 index 0893e6c11..000000000 --- a/data/hfopenllm_v2/djuna/Q2.5-Partron-7B/50f4560a-e172-42b9-b552-437aff158a38.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_Q2.5-Partron-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-Partron-7B", - "id": "djuna/Q2.5-Partron-7B", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5418 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4826 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4165 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4283 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/Q2.5-Veltha-14B-0.5/c6a3abac-8a34-4725-915b-c27c3d0bc484.json b/data/hfopenllm_v2/djuna/Q2.5-Veltha-14B-0.5/c6a3abac-8a34-4725-915b-c27c3d0bc484.json deleted file mode 100644 index 8d30205b4..000000000 --- a/data/hfopenllm_v2/djuna/Q2.5-Veltha-14B-0.5/c6a3abac-8a34-4725-915b-c27c3d0bc484.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_Q2.5-Veltha-14B-0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-Veltha-14B-0.5", - "id": "djuna/Q2.5-Veltha-14B-0.5", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/Q2.5-Veltha-14B/a8ed68ea-6463-4ff9-9dcd-034080272dec.json b/data/hfopenllm_v2/djuna/Q2.5-Veltha-14B/a8ed68ea-6463-4ff9-9dcd-034080272dec.json deleted file mode 100644 index b3a521555..000000000 --- a/data/hfopenllm_v2/djuna/Q2.5-Veltha-14B/a8ed68ea-6463-4ff9-9dcd-034080272dec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_Q2.5-Veltha-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-Veltha-14B", - "id": "djuna/Q2.5-Veltha-14B", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8292 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4789 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4194 - } - }, - 
{ - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5298 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-Llama-3-8B-Instruct/5799ce8b-c00d-49f6-96dc-f7dd057a268c.json b/data/hfopenllm_v2/dnhkng/RYS-Llama-3-8B-Instruct/5799ce8b-c00d-49f6-96dc-f7dd057a268c.json deleted file mode 100644 index 9ecc384c0..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-Llama-3-8B-Instruct/5799ce8b-c00d-49f6-96dc-f7dd057a268c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-Llama-3-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-Llama-3-8B-Instruct", - "id": "dnhkng/RYS-Llama-3-8B-Instruct", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6958 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4809 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-Llama-3-Huge-Instruct/0d261023-3e35-4160-98ca-241bbaee927e.json b/data/hfopenllm_v2/dnhkng/RYS-Llama-3-Huge-Instruct/0d261023-3e35-4160-98ca-241bbaee927e.json deleted file mode 100644 index f9db9541c..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-Llama-3-Huge-Instruct/0d261023-3e35-4160-98ca-241bbaee927e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-Llama-3-Huge-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-Llama-3-Huge-Instruct", - "id": "dnhkng/RYS-Llama-3-Huge-Instruct", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 99.646 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7686 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2289 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/dnhkng/RYS-Llama-3-Large-Instruct/f0454d3b-18b4-488a-94dd-fb24729996c7.json b/data/hfopenllm_v2/dnhkng/RYS-Llama-3-Large-Instruct/f0454d3b-18b4-488a-94dd-fb24729996c7.json deleted file mode 100644 index b5b981469..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-Llama-3-Large-Instruct/f0454d3b-18b4-488a-94dd-fb24729996c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-Llama-3-Large-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-Llama-3-Large-Instruct", - "id": "dnhkng/RYS-Llama-3-Large-Instruct", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 73.976 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8051 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2304 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-Llama-3.1-8B-Instruct/6bafa7a7-3a2a-4141-9564-a762d1cdb1d0.json b/data/hfopenllm_v2/dnhkng/RYS-Llama-3.1-8B-Instruct/6bafa7a7-3a2a-4141-9564-a762d1cdb1d0.json deleted file mode 100644 index 
8a33c3dbf..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-Llama-3.1-8B-Instruct/6bafa7a7-3a2a-4141-9564-a762d1cdb1d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-Llama-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-Llama-3.1-8B-Instruct", - "id": "dnhkng/RYS-Llama-3.1-8B-Instruct", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 8.685 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3681 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3639 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-Llama3.1-Large/37f20f86-40ba-4f63-b29d-efff6cb0e09b.json b/data/hfopenllm_v2/dnhkng/RYS-Llama3.1-Large/37f20f86-40ba-4f63-b29d-efff6cb0e09b.json deleted file mode 100644 index bd2270331..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-Llama3.1-Large/37f20f86-40ba-4f63-b29d-efff6cb0e09b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-Llama3.1-Large/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-Llama3.1-Large", - "id": "dnhkng/RYS-Llama3.1-Large", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 81.677 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8492 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6899 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3505 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5249 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-Medium/bf0e7ce4-09e9-4879-993a-eb50b2a421d7.json b/data/hfopenllm_v2/dnhkng/RYS-Medium/bf0e7ce4-09e9-4879-993a-eb50b2a421d7.json deleted file mode 100644 index f208cf7e5..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-Medium/bf0e7ce4-09e9-4879-993a-eb50b2a421d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-Medium/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-Medium", - "id": "dnhkng/RYS-Medium", - 
"developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 18.731 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4406 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6285 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-Phi-3-medium-4k-instruct/bcbc29f7-ea03-4dbe-a83e-d4940b2c6bea.json b/data/hfopenllm_v2/dnhkng/RYS-Phi-3-medium-4k-instruct/bcbc29f7-ea03-4dbe-a83e-d4940b2c6bea.json deleted file mode 100644 index e893c8a74..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-Phi-3-medium-4k-instruct/bcbc29f7-ea03-4dbe-a83e-d4940b2c6bea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-Phi-3-medium-4k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-Phi-3-medium-4k-instruct", - "id": "dnhkng/RYS-Phi-3-medium-4k-instruct", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 17.709 - } - }, - "evaluation_results": [ - 
{ - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4391 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6226 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4846 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-XLarge-base/cbea8d66-0370-4998-8e3a-06fef0a60f0c.json b/data/hfopenllm_v2/dnhkng/RYS-XLarge-base/cbea8d66-0370-4998-8e3a-06fef0a60f0c.json deleted file mode 100644 index 7bb7b0a97..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-XLarge-base/cbea8d66-0370-4998-8e3a-06fef0a60f0c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-XLarge-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-XLarge-base", - "id": "dnhkng/RYS-XLarge-base", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.972 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4903 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-XLarge/ca48b670-b82e-46cc-beb9-2fd0f11d3585.json b/data/hfopenllm_v2/dnhkng/RYS-XLarge/ca48b670-b82e-46cc-beb9-2fd0f11d3585.json deleted file mode 100644 index 8542415fb..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-XLarge/ca48b670-b82e-46cc-beb9-2fd0f11d3585.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-XLarge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-XLarge", - "id": "dnhkng/RYS-XLarge", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7996 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.705 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4252 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5428 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-XLarge2/d37f99f7-f9c3-48b6-84d3-7da5d77f5030.json b/data/hfopenllm_v2/dnhkng/RYS-XLarge2/d37f99f7-f9c3-48b6-84d3-7da5d77f5030.json deleted file mode 100644 index c6ee9080f..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-XLarge2/d37f99f7-f9c3-48b6-84d3-7da5d77f5030.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-XLarge2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-XLarge2", - "id": "dnhkng/RYS-XLarge2", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6574 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4508 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dreamgen/WizardLM-2-7B/503c8a24-4ced-4dca-b9df-5733ce89c2ca.json b/data/hfopenllm_v2/dreamgen/WizardLM-2-7B/503c8a24-4ced-4dca-b9df-5733ce89c2ca.json deleted file mode 100644 index ea7995a74..000000000 --- a/data/hfopenllm_v2/dreamgen/WizardLM-2-7B/503c8a24-4ced-4dca-b9df-5733ce89c2ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dreamgen_WizardLM-2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WizardLM-2-7B", - "id": "dreamgen/WizardLM-2-7B", - "developer": "dreamgen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4583 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3941 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v1/5c5283a0-819f-4112-bb90-5277423d9c00.json b/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v1/5c5283a0-819f-4112-bb90-5277423d9c00.json deleted file mode 100644 index c68eb92a9..000000000 --- a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v1/5c5283a0-819f-4112-bb90-5277423d9c00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dustinwloring1988_Reflexis-8b-chat-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflexis-8b-chat-v1", - "id": "dustinwloring1988/Reflexis-8b-chat-v1", - "developer": "dustinwloring1988", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4664 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v2/b636bc82-1625-49b1-beec-cadaf4e1b1a9.json b/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v2/b636bc82-1625-49b1-beec-cadaf4e1b1a9.json deleted file mode 100644 index 9bbf76e4e..000000000 --- a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v2/b636bc82-1625-49b1-beec-cadaf4e1b1a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dustinwloring1988_Reflexis-8b-chat-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflexis-8b-chat-v2", - "id": "dustinwloring1988/Reflexis-8b-chat-v2", - "developer": "dustinwloring1988", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3912 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4724 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3526 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v3/00f481c1-0ef0-40bd-bd95-81dc9443a62c.json b/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v3/00f481c1-0ef0-40bd-bd95-81dc9443a62c.json deleted file mode 100644 index 4ff73b25c..000000000 --- a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v3/00f481c1-0ef0-40bd-bd95-81dc9443a62c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dustinwloring1988_Reflexis-8b-chat-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflexis-8b-chat-v3", - "id": "dustinwloring1988/Reflexis-8b-chat-v3", - "developer": "dustinwloring1988", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3512 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3548 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v4/7ea22fef-2d79-49ae-bf72-9153a4e239c5.json b/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v4/7ea22fef-2d79-49ae-bf72-9153a4e239c5.json deleted file mode 100644 index 6e5776566..000000000 --- a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v4/7ea22fef-2d79-49ae-bf72-9153a4e239c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dustinwloring1988_Reflexis-8b-chat-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflexis-8b-chat-v4", - "id": "dustinwloring1988/Reflexis-8b-chat-v4", - "developer": "dustinwloring1988", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4698 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2341 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v5/64f441df-1781-4d01-b73b-2156413ad403.json b/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v5/64f441df-1781-4d01-b73b-2156413ad403.json deleted file mode 100644 index 6d8fa5edd..000000000 --- a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v5/64f441df-1781-4d01-b73b-2156413ad403.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dustinwloring1988_Reflexis-8b-chat-v5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflexis-8b-chat-v5", - "id": "dustinwloring1988/Reflexis-8b-chat-v5", - "developer": "dustinwloring1988", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4238 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4782 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3217 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v6/4e3676eb-8607-416e-986a-7098bc192820.json b/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v6/4e3676eb-8607-416e-986a-7098bc192820.json deleted file mode 100644 index be5afc0ca..000000000 --- a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v6/4e3676eb-8607-416e-986a-7098bc192820.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dustinwloring1988_Reflexis-8b-chat-v6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflexis-8b-chat-v6", - "id": "dustinwloring1988/Reflexis-8b-chat-v6", - "developer": "dustinwloring1988", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4939 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3479 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v7/2101369c-5042-48f3-a8f2-f9f56e7b6ae7.json b/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v7/2101369c-5042-48f3-a8f2-f9f56e7b6ae7.json deleted file mode 100644 index 601437128..000000000 --- a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v7/2101369c-5042-48f3-a8f2-f9f56e7b6ae7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dustinwloring1988_Reflexis-8b-chat-v7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflexis-8b-chat-v7", - "id": "dustinwloring1988/Reflexis-8b-chat-v7", - "developer": "dustinwloring1988", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3643 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/duyhv1411/Llama-3.2-1B-en-vi/c4b86264-3725-4742-91f0-3e01f8d965a4.json b/data/hfopenllm_v2/duyhv1411/Llama-3.2-1B-en-vi/c4b86264-3725-4742-91f0-3e01f8d965a4.json deleted file mode 
100644 index f91722bb5..000000000 --- a/data/hfopenllm_v2/duyhv1411/Llama-3.2-1B-en-vi/c4b86264-3725-4742-91f0-3e01f8d965a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/duyhv1411_Llama-3.2-1B-en-vi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-en-vi", - "id": "duyhv1411/Llama-3.2-1B-en-vi", - "developer": "duyhv1411", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4788 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3291 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3197 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1341 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/duyhv1411/Llama-3.2-3B-en-vi/0308147c-dabb-46bb-8add-d332fcd5a800.json b/data/hfopenllm_v2/duyhv1411/Llama-3.2-3B-en-vi/0308147c-dabb-46bb-8add-d332fcd5a800.json deleted file mode 100644 index 7c38df979..000000000 --- a/data/hfopenllm_v2/duyhv1411/Llama-3.2-3B-en-vi/0308147c-dabb-46bb-8add-d332fcd5a800.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/duyhv1411_Llama-3.2-3B-en-vi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-en-vi", - "id": "duyhv1411/Llama-3.2-3B-en-vi", - "developer": "duyhv1411", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4852 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1359 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-inst/a9977a0d-e199-488a-a26e-6269806fdb2b.json b/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-inst/a9977a0d-e199-488a-a26e-6269806fdb2b.json deleted file mode 100644 index ebf7ac8d4..000000000 --- a/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-inst/a9977a0d-e199-488a-a26e-6269806fdb2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dwikitheduck_gemma-2-2b-id-inst/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-id-inst", - "id": "dwikitheduck/gemma-2-2b-id-inst", - "developer": "dwikitheduck", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-instruct/56b89ec8-90c5-4e1e-a458-1bb8b5b92be8.json b/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-instruct/56b89ec8-90c5-4e1e-a458-1bb8b5b92be8.json deleted file mode 100644 index 83e3a462d..000000000 --- a/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-instruct/56b89ec8-90c5-4e1e-a458-1bb8b5b92be8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dwikitheduck_gemma-2-2b-id-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-id-instruct", - "id": "dwikitheduck/gemma-2-2b-id-instruct", - "developer": "dwikitheduck", - "inference_platform": "unknown", 
- "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id/4185c376-91c6-435d-ae3b-47cd85151049.json b/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id/4185c376-91c6-435d-ae3b-47cd85151049.json deleted file mode 100644 index f7fe9743d..000000000 --- a/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id/4185c376-91c6-435d-ae3b-47cd85151049.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dwikitheduck_gemma-2-2b-id/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-id", - "id": "dwikitheduck/gemma-2-2b-id", - "developer": "dwikitheduck", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dwikitheduck/gen-inst-1/26e45f5d-1e3d-425f-ba4d-b444dcda7f74.json b/data/hfopenllm_v2/dwikitheduck/gen-inst-1/26e45f5d-1e3d-425f-ba4d-b444dcda7f74.json deleted file mode 100644 index 0250810c2..000000000 --- a/data/hfopenllm_v2/dwikitheduck/gen-inst-1/26e45f5d-1e3d-425f-ba4d-b444dcda7f74.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dwikitheduck_gen-inst-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gen-inst-1", - "id": "dwikitheduck/gen-inst-1", - "developer": "dwikitheduck", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775 - } - }, - { - "evaluation_name": "BBH", 
- "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4205 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5089 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dwikitheduck/gen-try1-notemp/09be48ce-61f8-4ba9-b082-b9c475fa714d.json b/data/hfopenllm_v2/dwikitheduck/gen-try1-notemp/09be48ce-61f8-4ba9-b082-b9c475fa714d.json deleted file mode 100644 index ce2a46bd9..000000000 --- a/data/hfopenllm_v2/dwikitheduck/gen-try1-notemp/09be48ce-61f8-4ba9-b082-b9c475fa714d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dwikitheduck_gen-try1-notemp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gen-try1-notemp", - "id": "dwikitheduck/gen-try1-notemp", - "developer": "dwikitheduck", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2627 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6263 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.521 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dwikitheduck/gen-try1/27417bcb-fb2f-41d2-9dfa-9865a36f38d5.json b/data/hfopenllm_v2/dwikitheduck/gen-try1/27417bcb-fb2f-41d2-9dfa-9865a36f38d5.json deleted file mode 100644 index e7929b7e4..000000000 --- a/data/hfopenllm_v2/dwikitheduck/gen-try1/27417bcb-fb2f-41d2-9dfa-9865a36f38d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dwikitheduck_gen-try1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gen-try1", - "id": "dwikitheduck/gen-try1", - "developer": "dwikitheduck", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7522 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6359 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dzakwan/dzakwan-MoE-4x7b-Beta/7b6fc3c2-a67d-450e-858c-fa87be122376.json b/data/hfopenllm_v2/dzakwan/dzakwan-MoE-4x7b-Beta/7b6fc3c2-a67d-450e-858c-fa87be122376.json deleted file mode 100644 index 1562aefaf..000000000 --- a/data/hfopenllm_v2/dzakwan/dzakwan-MoE-4x7b-Beta/7b6fc3c2-a67d-450e-858c-fa87be122376.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dzakwan_dzakwan-MoE-4x7b-Beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dzakwan-MoE-4x7b-Beta", - "id": "dzakwan/dzakwan-MoE-4x7b-Beta", - "developer": "dzakwan", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4443 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4267 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/Falcon3-8B-Franken-Basestruct/76b86418-5450-48c6-ae56-58a19016d055.json b/data/hfopenllm_v2/ehristoforu/Falcon3-8B-Franken-Basestruct/76b86418-5450-48c6-ae56-58a19016d055.json deleted file mode 100644 index 8426beecc..000000000 --- a/data/hfopenllm_v2/ehristoforu/Falcon3-8B-Franken-Basestruct/76b86418-5450-48c6-ae56-58a19016d055.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_Falcon3-8B-Franken-Basestruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-8B-Franken-Basestruct", - "id": "ehristoforu/Falcon3-8B-Franken-Basestruct", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.406 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1715 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5463 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/Falcon3-MoE-2x7B-Insruct/e06594e4-899a-4285-b130-f7b605e5a6b9.json b/data/hfopenllm_v2/ehristoforu/Falcon3-MoE-2x7B-Insruct/e06594e4-899a-4285-b130-f7b605e5a6b9.json deleted file mode 100644 index 02842cc41..000000000 --- a/data/hfopenllm_v2/ehristoforu/Falcon3-MoE-2x7B-Insruct/e06594e4-899a-4285-b130-f7b605e5a6b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_Falcon3-MoE-2x7B-Insruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-MoE-2x7B-Insruct", - "id": "ehristoforu/Falcon3-MoE-2x7B-Insruct", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 13.401 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7643 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5648 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { 
- "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.484 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/Gemma2-9B-it-psy10k-mental_health/9efdc773-a5c7-4709-88c8-96a67d84a742.json b/data/hfopenllm_v2/ehristoforu/Gemma2-9B-it-psy10k-mental_health/9efdc773-a5c7-4709-88c8-96a67d84a742.json deleted file mode 100644 index 4328adf7f..000000000 --- a/data/hfopenllm_v2/ehristoforu/Gemma2-9B-it-psy10k-mental_health/9efdc773-a5c7-4709-88c8-96a67d84a742.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_Gemma2-9B-it-psy10k-mental_health/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2-9B-it-psy10k-mental_health", - "id": "ehristoforu/Gemma2-9B-it-psy10k-mental_health", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5887 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/Gemma2-9b-it-train6/1fcc2f96-afc9-403f-b82e-8e1804506582.json b/data/hfopenllm_v2/ehristoforu/Gemma2-9b-it-train6/1fcc2f96-afc9-403f-b82e-8e1804506582.json deleted file mode 100644 index b1b1e648f..000000000 --- a/data/hfopenllm_v2/ehristoforu/Gemma2-9b-it-train6/1fcc2f96-afc9-403f-b82e-8e1804506582.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_Gemma2-9b-it-train6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2-9b-it-train6", - "id": "ehristoforu/Gemma2-9b-it-train6", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7025 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5898 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1911 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3942 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/HappyLlama1/bee1e134-9a43-441a-b977-522c510dd1ce.json b/data/hfopenllm_v2/ehristoforu/HappyLlama1/bee1e134-9a43-441a-b977-522c510dd1ce.json deleted file mode 100644 index ee8741019..000000000 --- a/data/hfopenllm_v2/ehristoforu/HappyLlama1/bee1e134-9a43-441a-b977-522c510dd1ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_HappyLlama1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HappyLlama1", - "id": "ehristoforu/HappyLlama1", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7363 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4996 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1427 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3546 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT-Dare/b70e1089-d136-4b2f-a253-f361bcf8cdcc.json b/data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT-Dare/b70e1089-d136-4b2f-a253-f361bcf8cdcc.json deleted file mode 100644 index db80ae6ad..000000000 --- a/data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT-Dare/b70e1089-d136-4b2f-a253-f361bcf8cdcc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_QwenQwen2.5-7B-IT-Dare/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenQwen2.5-7B-IT-Dare", - "id": "ehristoforu/QwenQwen2.5-7B-IT-Dare", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT/8b7e9c34-a982-4f4d-b5dc-66a12578601f.json 
b/data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT/8b7e9c34-a982-4f4d-b5dc-66a12578601f.json deleted file mode 100644 index a51f16a50..000000000 --- a/data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT/8b7e9c34-a982-4f4d-b5dc-66a12578601f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_QwenQwen2.5-7B-IT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenQwen2.5-7B-IT", - "id": "ehristoforu/QwenQwen2.5-7B-IT", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/RQwen-v0.1/0ccc36d0-f546-46d1-91d3-15a40c7bf6c1.json b/data/hfopenllm_v2/ehristoforu/RQwen-v0.1/0ccc36d0-f546-46d1-91d3-15a40c7bf6c1.json deleted file mode 100644 index 552e8c521..000000000 --- a/data/hfopenllm_v2/ehristoforu/RQwen-v0.1/0ccc36d0-f546-46d1-91d3-15a40c7bf6c1.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_RQwen-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RQwen-v0.1", - "id": "ehristoforu/RQwen-v0.1", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7625 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6446 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5202 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/RQwen-v0.2/066abe97-2c6c-4f3b-9e5e-e144f130258a.json b/data/hfopenllm_v2/ehristoforu/RQwen-v0.2/066abe97-2c6c-4f3b-9e5e-e144f130258a.json deleted file mode 100644 index 9ac5c419c..000000000 --- a/data/hfopenllm_v2/ehristoforu/RQwen-v0.2/066abe97-2c6c-4f3b-9e5e-e144f130258a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_RQwen-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RQwen-v0.2", - "id": "ehristoforu/RQwen-v0.2", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7504 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6427 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5159 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/SoRu-0009/a3af8f77-d915-4482-a2b6-c99744aada4b.json b/data/hfopenllm_v2/ehristoforu/SoRu-0009/a3af8f77-d915-4482-a2b6-c99744aada4b.json deleted file mode 100644 index 1af53918b..000000000 --- a/data/hfopenllm_v2/ehristoforu/SoRu-0009/a3af8f77-d915-4482-a2b6-c99744aada4b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_SoRu-0009/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SoRu-0009", - "id": "ehristoforu/SoRu-0009", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - 
"params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2582 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/coolqwen-3b-it/82cc8b37-e242-441e-ac74-1662bcc0a0e2.json b/data/hfopenllm_v2/ehristoforu/coolqwen-3b-it/82cc8b37-e242-441e-ac74-1662bcc0a0e2.json deleted file mode 100644 index fcf5a776a..000000000 --- a/data/hfopenllm_v2/ehristoforu/coolqwen-3b-it/82cc8b37-e242-441e-ac74-1662bcc0a0e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_coolqwen-3b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "coolqwen-3b-it", - "id": "ehristoforu/coolqwen-3b-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4851 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3601 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/della-70b-test-v1/1527c8bc-c1ec-45f4-9663-4cffbb808f94.json b/data/hfopenllm_v2/ehristoforu/della-70b-test-v1/1527c8bc-c1ec-45f4-9663-4cffbb808f94.json deleted file mode 100644 index 367d47be6..000000000 --- a/data/hfopenllm_v2/ehristoforu/della-70b-test-v1/1527c8bc-c1ec-45f4-9663-4cffbb808f94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_della-70b-test-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "della-70b-test-v1", - "id": "ehristoforu/della-70b-test-v1", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1575 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/falcon3-ultraset/337b8ce8-d697-47f6-94ac-7a420dd7d91b.json b/data/hfopenllm_v2/ehristoforu/falcon3-ultraset/337b8ce8-d697-47f6-94ac-7a420dd7d91b.json deleted file mode 100644 index e2151cd4e..000000000 --- a/data/hfopenllm_v2/ehristoforu/falcon3-ultraset/337b8ce8-d697-47f6-94ac-7a420dd7d91b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_falcon3-ultraset/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon3-ultraset", - "id": "ehristoforu/falcon3-ultraset", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7135 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.5584 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2122 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4853 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3982 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/fd-lora-merged-16x32/3d6ed2bb-5be7-4838-abb7-49754f9c3bfe.json b/data/hfopenllm_v2/ehristoforu/fd-lora-merged-16x32/3d6ed2bb-5be7-4838-abb7-49754f9c3bfe.json deleted file mode 100644 index af627d759..000000000 --- a/data/hfopenllm_v2/ehristoforu/fd-lora-merged-16x32/3d6ed2bb-5be7-4838-abb7-49754f9c3bfe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_fd-lora-merged-16x32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fd-lora-merged-16x32", - "id": "ehristoforu/fd-lora-merged-16x32", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.776 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { 
- "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/fd-lora-merged-64x128/0a6c7056-1bce-479e-84b0-f4eeea0bd3cc.json b/data/hfopenllm_v2/ehristoforu/fd-lora-merged-64x128/0a6c7056-1bce-479e-84b0-f4eeea0bd3cc.json deleted file mode 100644 index 185158297..000000000 --- a/data/hfopenllm_v2/ehristoforu/fd-lora-merged-64x128/0a6c7056-1bce-479e-84b0-f4eeea0bd3cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_fd-lora-merged-64x128/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fd-lora-merged-64x128", - "id": "ehristoforu/fd-lora-merged-64x128", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3281 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3345 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/fp4-14b-it-v1/3e236ad8-3828-407f-9076-743b465b8d15.json b/data/hfopenllm_v2/ehristoforu/fp4-14b-it-v1/3e236ad8-3828-407f-9076-743b465b8d15.json deleted file mode 100644 index cd4c75104..000000000 --- a/data/hfopenllm_v2/ehristoforu/fp4-14b-it-v1/3e236ad8-3828-407f-9076-743b465b8d15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_fp4-14b-it-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fp4-14b-it-v1", - "id": "ehristoforu/fp4-14b-it-v1", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2535 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.574 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/fp4-14b-v1-fix/9e90dcdf-ce2a-4a7c-8b89-6af8b7c2bcfe.json b/data/hfopenllm_v2/ehristoforu/fp4-14b-v1-fix/9e90dcdf-ce2a-4a7c-8b89-6af8b7c2bcfe.json deleted file mode 100644 index ba50dee2a..000000000 --- a/data/hfopenllm_v2/ehristoforu/fp4-14b-v1-fix/9e90dcdf-ce2a-4a7c-8b89-6af8b7c2bcfe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_fp4-14b-v1-fix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fp4-14b-v1-fix", - "id": "ehristoforu/fp4-14b-v1-fix", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6742 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6817 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_false/940d88e9-085b-4065-b8c8-92ebe685deb0.json b/data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_false/940d88e9-085b-4065-b8c8-92ebe685deb0.json deleted file mode 100644 index 06d2d501b..000000000 --- a/data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_false/940d88e9-085b-4065-b8c8-92ebe685deb0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_fq2.5-7b-it-normalize_false/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fq2.5-7b-it-normalize_false", - "id": "ehristoforu/fq2.5-7b-it-normalize_false", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7399 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.552 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4612 - 
} - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_true/7fdcd616-2c72-4c44-9646-9c32344bfa0b.json b/data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_true/7fdcd616-2c72-4c44-9646-9c32344bfa0b.json deleted file mode 100644 index 0dfb08d8b..000000000 --- a/data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_true/7fdcd616-2c72-4c44-9646-9c32344bfa0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_fq2.5-7b-it-normalize_true/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fq2.5-7b-it-normalize_true", - "id": "ehristoforu/fq2.5-7b-it-normalize_true", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7399 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.552 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4612 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-duable4layers-it/9d358f55-810c-4ac1-adc7-83f95bd74c11.json b/data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-duable4layers-it/9d358f55-810c-4ac1-adc7-83f95bd74c11.json deleted file mode 100644 index 073ffd258..000000000 --- a/data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-duable4layers-it/9d358f55-810c-4ac1-adc7-83f95bd74c11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_frqwen2.5-from7b-duable4layers-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "frqwen2.5-from7b-duable4layers-it", - "id": "ehristoforu/frqwen2.5-from7b-duable4layers-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 8.545 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5264 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.4126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-it/9ba3fe31-772a-4cf7-aa13-3680b6ad51ba.json b/data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-it/9ba3fe31-772a-4cf7-aa13-3680b6ad51ba.json deleted file mode 100644 index 61c9293ae..000000000 --- a/data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-it/9ba3fe31-772a-4cf7-aa13-3680b6ad51ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_frqwen2.5-from7b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "frqwen2.5-from7b-it", - "id": "ehristoforu/frqwen2.5-from7b-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 13.206 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-instruct/651a32b1-77fb-4acf-89bf-2d45b684944d.json 
b/data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-instruct/651a32b1-77fb-4acf-89bf-2d45b684944d.json deleted file mode 100644 index 364558711..000000000 --- a/data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-instruct/651a32b1-77fb-4acf-89bf-2d45b684944d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_mllama-3.1-8b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mllama-3.1-8b-instruct", - "id": "ehristoforu/mllama-3.1-8b-instruct", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3458 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4718 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2533 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-it/192c4037-753a-4790-80d0-33c4d277102d.json b/data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-it/192c4037-753a-4790-80d0-33c4d277102d.json deleted file mode 100644 index 601171444..000000000 --- 
a/data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-it/192c4037-753a-4790-80d0-33c4d277102d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_mllama-3.1-8b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mllama-3.1-8b-it", - "id": "ehristoforu/mllama-3.1-8b-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4868 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3349 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/moremerge-upscaled/679d66bf-244e-4080-9a42-0a0c6cfdc965.json b/data/hfopenllm_v2/ehristoforu/moremerge-upscaled/679d66bf-244e-4080-9a42-0a0c6cfdc965.json deleted file mode 100644 index 62db8ac19..000000000 --- a/data/hfopenllm_v2/ehristoforu/moremerge-upscaled/679d66bf-244e-4080-9a42-0a0c6cfdc965.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_moremerge-upscaled/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "moremerge-upscaled", - "id": "ehristoforu/moremerge-upscaled", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 8.545 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2698 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1041 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/moremerge/73b0ca8a-fb16-43eb-a9af-a01219cf6196.json b/data/hfopenllm_v2/ehristoforu/moremerge/73b0ca8a-fb16-43eb-a9af-a01219cf6196.json deleted file mode 100644 index f1591edbb..000000000 --- a/data/hfopenllm_v2/ehristoforu/moremerge/73b0ca8a-fb16-43eb-a9af-a01219cf6196.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_moremerge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "moremerge", - "id": 
"ehristoforu/moremerge", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2019 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2868 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/phi-4-25b/7f00ecbc-fcc8-43ae-867b-cb160e63a80c.json b/data/hfopenllm_v2/ehristoforu/phi-4-25b/7f00ecbc-fcc8-43ae-867b-cb160e63a80c.json deleted file mode 100644 index 524b66617..000000000 --- a/data/hfopenllm_v2/ehristoforu/phi-4-25b/7f00ecbc-fcc8-43ae-867b-cb160e63a80c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_phi-4-25b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-25b", - "id": "ehristoforu/phi-4-25b", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 24.883 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6484 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5351 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/qwen2.5-test-32b-it/a8238bd4-3982-4e45-92e4-bab77e528e29.json b/data/hfopenllm_v2/ehristoforu/qwen2.5-test-32b-it/a8238bd4-3982-4e45-92e4-bab77e528e29.json deleted file mode 100644 index e044dab4c..000000000 --- a/data/hfopenllm_v2/ehristoforu/qwen2.5-test-32b-it/a8238bd4-3982-4e45-92e4-bab77e528e29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_qwen2.5-test-32b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-test-32b-it", - "id": "ehristoforu/qwen2.5-test-32b-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7081 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5765 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/qwen2.5-with-lora-think-3b-it/f87f9f08-e989-4e99-a254-a3650e7ab1b6.json b/data/hfopenllm_v2/ehristoforu/qwen2.5-with-lora-think-3b-it/f87f9f08-e989-4e99-a254-a3650e7ab1b6.json deleted file mode 100644 index c5897513d..000000000 --- a/data/hfopenllm_v2/ehristoforu/qwen2.5-with-lora-think-3b-it/f87f9f08-e989-4e99-a254-a3650e7ab1b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_qwen2.5-with-lora-think-3b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-with-lora-think-3b-it", - "id": "ehristoforu/qwen2.5-with-lora-think-3b-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5319 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4687 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2364 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3403 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/rmoe-v1/f40496a9-fb14-4b2d-8070-84f55e6417f6.json b/data/hfopenllm_v2/ehristoforu/rmoe-v1/f40496a9-fb14-4b2d-8070-84f55e6417f6.json deleted file mode 100644 index 622169295..000000000 --- a/data/hfopenllm_v2/ehristoforu/rmoe-v1/f40496a9-fb14-4b2d-8070-84f55e6417f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_rmoe-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "rmoe-v1", - "id": "ehristoforu/rmoe-v1", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 11.026 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.265 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2929 - } - }, - { - "evaluation_name": "MATH Level 
5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/rufalcon3-3b-it/cc52f59d-5669-44b0-b1af-e6fd0836e284.json b/data/hfopenllm_v2/ehristoforu/rufalcon3-3b-it/cc52f59d-5669-44b0-b1af-e6fd0836e284.json deleted file mode 100644 index 17d1c2ff2..000000000 --- a/data/hfopenllm_v2/ehristoforu/rufalcon3-3b-it/cc52f59d-5669-44b0-b1af-e6fd0836e284.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_rufalcon3-3b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "rufalcon3-3b-it", - "id": "ehristoforu/rufalcon3-3b-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.228 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5942 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3895 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2348 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/ruphi-4b/67525a37-f658-40e8-89a1-de8bf6275a00.json b/data/hfopenllm_v2/ehristoforu/ruphi-4b/67525a37-f658-40e8-89a1-de8bf6275a00.json deleted file mode 100644 index 9d1d684fd..000000000 --- a/data/hfopenllm_v2/ehristoforu/ruphi-4b/67525a37-f658-40e8-89a1-de8bf6275a00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_ruphi-4b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ruphi-4b", - "id": "ehristoforu/ruphi-4b", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1752 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2906 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3512 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/testq-32b/3cb34886-7a93-42b9-a8fa-fab5f4bd8624.json b/data/hfopenllm_v2/ehristoforu/testq-32b/3cb34886-7a93-42b9-a8fa-fab5f4bd8624.json deleted file mode 100644 index 50e07975c..000000000 --- a/data/hfopenllm_v2/ehristoforu/testq-32b/3cb34886-7a93-42b9-a8fa-fab5f4bd8624.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_testq-32b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "testq-32b", - "id": "ehristoforu/testq-32b", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 56.165 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1876 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2877 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3715 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/tmoe-v2/0dd1f9fc-cf54-47ff-8ccd-148b45f3c921.json b/data/hfopenllm_v2/ehristoforu/tmoe-v2/0dd1f9fc-cf54-47ff-8ccd-148b45f3c921.json deleted file mode 100644 index ed11e623d..000000000 --- a/data/hfopenllm_v2/ehristoforu/tmoe-v2/0dd1f9fc-cf54-47ff-8ccd-148b45f3c921.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_tmoe-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tmoe-v2", - "id": "ehristoforu/tmoe-v2", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 11.026 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1903 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2897 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4151 - } - }, - { - "evaluation_name": "MMLU-PRO", 
- "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.11 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/tmoe/7a05616e-7335-419a-914d-00fb287fe663.json b/data/hfopenllm_v2/ehristoforu/tmoe/7a05616e-7335-419a-914d-00fb287fe663.json deleted file mode 100644 index d62cd4fdb..000000000 --- a/data/hfopenllm_v2/ehristoforu/tmoe/7a05616e-7335-419a-914d-00fb287fe663.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_tmoe/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tmoe", - "id": "ehristoforu/tmoe", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 11.026 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2232 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1191 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/trd-7b-it/070a21b5-4cd3-41b7-9653-0d2d2e4f273d.json b/data/hfopenllm_v2/ehristoforu/trd-7b-it/070a21b5-4cd3-41b7-9653-0d2d2e4f273d.json deleted file mode 100644 index bf0489323..000000000 --- a/data/hfopenllm_v2/ehristoforu/trd-7b-it/070a21b5-4cd3-41b7-9653-0d2d2e4f273d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_trd-7b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "trd-7b-it", - "id": "ehristoforu/trd-7b-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2185 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/ud-14b/5afc044a-3138-443f-89cf-74f1272cc632.json b/data/hfopenllm_v2/ehristoforu/ud-14b/5afc044a-3138-443f-89cf-74f1272cc632.json deleted file mode 100644 index 5bff4a74e..000000000 --- 
a/data/hfopenllm_v2/ehristoforu/ud-14b/5afc044a-3138-443f-89cf-74f1272cc632.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_ud-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ud-14b", - "id": "ehristoforu/ud-14b", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3324 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1903 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2374 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/elinas/Chronos-Gold-12B-1.0/a6c1d914-647c-46b7-b0e1-712b8d506780.json b/data/hfopenllm_v2/elinas/Chronos-Gold-12B-1.0/a6c1d914-647c-46b7-b0e1-712b8d506780.json deleted file mode 100644 index 1bd60921e..000000000 --- a/data/hfopenllm_v2/elinas/Chronos-Gold-12B-1.0/a6c1d914-647c-46b7-b0e1-712b8d506780.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/elinas_Chronos-Gold-12B-1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chronos-Gold-12B-1.0", - "id": "elinas/Chronos-Gold-12B-1.0", - "developer": "elinas", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3166 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ell44ot/gemma-2b-def/43f35eac-0946-42f9-a128-eb8011c29588.json b/data/hfopenllm_v2/ell44ot/gemma-2b-def/43f35eac-0946-42f9-a128-eb8011c29588.json deleted file mode 100644 index 07020d3eb..000000000 --- a/data/hfopenllm_v2/ell44ot/gemma-2b-def/43f35eac-0946-42f9-a128-eb8011c29588.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ell44ot_gemma-2b-def/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2b-def", - "id": "ell44ot/gemma-2b-def", - "developer": "ell44ot", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaModel", - "params_billions": 1.546 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3159 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/euclaise/ReMask-3B/04c22be7-2cf4-4774-b479-863199c7c3a4.json b/data/hfopenllm_v2/euclaise/ReMask-3B/04c22be7-2cf4-4774-b479-863199c7c3a4.json deleted file mode 100644 index f550f9a02..000000000 --- a/data/hfopenllm_v2/euclaise/ReMask-3B/04c22be7-2cf4-4774-b479-863199c7c3a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/euclaise_ReMask-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReMask-3B", - "id": "euclaise/ReMask-3B", - "developer": "euclaise", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "StableLmForCausalLM", - "params_billions": 2.795 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2419 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3517 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/eworojoshua/vas-01/fc3d436b-ec61-4458-a3c6-1df41057ea70.json b/data/hfopenllm_v2/eworojoshua/vas-01/fc3d436b-ec61-4458-a3c6-1df41057ea70.json deleted file mode 100644 index d331211d6..000000000 --- a/data/hfopenllm_v2/eworojoshua/vas-01/fc3d436b-ec61-4458-a3c6-1df41057ea70.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/eworojoshua_vas-01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "vas-01", - "id": "eworojoshua/vas-01", - "developer": "eworojoshua", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7612 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": 
"BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5418 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4348 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ewre324/Thinker-Llama-3.2-3B-Instruct-Reasoning/e3ed157f-f306-40fb-b3a1-d3434236759e.json b/data/hfopenllm_v2/ewre324/Thinker-Llama-3.2-3B-Instruct-Reasoning/e3ed157f-f306-40fb-b3a1-d3434236759e.json deleted file mode 100644 index 99fd3f7c5..000000000 --- a/data/hfopenllm_v2/ewre324/Thinker-Llama-3.2-3B-Instruct-Reasoning/e3ed157f-f306-40fb-b3a1-d3434236759e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ewre324_Thinker-Llama-3.2-3B-Instruct-Reasoning/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Thinker-Llama-3.2-3B-Instruct-Reasoning", - "id": "ewre324/Thinker-Llama-3.2-3B-Instruct-Reasoning", - "developer": "ewre324", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4439 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3655 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ewre324/Thinker-Qwen2.5-0.5B-Instruct-Reasoning/8793b3e3-f409-499a-81f8-c250c8092841.json b/data/hfopenllm_v2/ewre324/Thinker-Qwen2.5-0.5B-Instruct-Reasoning/8793b3e3-f409-499a-81f8-c250c8092841.json deleted file mode 100644 index eba4ef4a3..000000000 --- a/data/hfopenllm_v2/ewre324/Thinker-Qwen2.5-0.5B-Instruct-Reasoning/8793b3e3-f409-499a-81f8-c250c8092841.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ewre324_Thinker-Qwen2.5-0.5B-Instruct-Reasoning/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Thinker-Qwen2.5-0.5B-Instruct-Reasoning", - "id": "ewre324/Thinker-Qwen2.5-0.5B-Instruct-Reasoning", - "developer": "ewre324", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2476 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ewre324/Thinker-SmolLM2-135M-Instruct-Reasoning/33572f63-15ba-4fbc-b1cf-56b978384d02.json b/data/hfopenllm_v2/ewre324/Thinker-SmolLM2-135M-Instruct-Reasoning/33572f63-15ba-4fbc-b1cf-56b978384d02.json deleted file mode 100644 index 3408e328d..000000000 --- a/data/hfopenllm_v2/ewre324/Thinker-SmolLM2-135M-Instruct-Reasoning/33572f63-15ba-4fbc-b1cf-56b978384d02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ewre324_Thinker-SmolLM2-135M-Instruct-Reasoning/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Thinker-SmolLM2-135M-Instruct-Reasoning", - "id": "ewre324/Thinker-SmolLM2-135M-Instruct-Reasoning", - "developer": "ewre324", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3071 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", 
- "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ewre324/ewre324-R1-SmolLM2-135M-Distill/44c636ba-8303-4d75-bcb5-46e3c07a991a.json b/data/hfopenllm_v2/ewre324/ewre324-R1-SmolLM2-135M-Distill/44c636ba-8303-4d75-bcb5-46e3c07a991a.json deleted file mode 100644 index 5cd36efa3..000000000 --- a/data/hfopenllm_v2/ewre324/ewre324-R1-SmolLM2-135M-Distill/44c636ba-8303-4d75-bcb5-46e3c07a991a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ewre324_ewre324-R1-SmolLM2-135M-Distill/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ewre324-R1-SmolLM2-135M-Distill", - "id": "ewre324/ewre324-R1-SmolLM2-135M-Distill", - "developer": "ewre324", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3042 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/experiment-llm/exp-3-q-r/0a002444-3e5a-4fc8-acc6-72210a4181a9.json b/data/hfopenllm_v2/experiment-llm/exp-3-q-r/0a002444-3e5a-4fc8-acc6-72210a4181a9.json deleted file mode 100644 index 3553a3e14..000000000 --- a/data/hfopenllm_v2/experiment-llm/exp-3-q-r/0a002444-3e5a-4fc8-acc6-72210a4181a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/experiment-llm_exp-3-q-r/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "exp-3-q-r", - "id": "experiment-llm/exp-3-q-r", - "developer": "experiment-llm", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6036 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2787 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/facebook/opt-1.3b/bbf936a5-3594-4d0a-b5af-7a01740d0c81.json b/data/hfopenllm_v2/facebook/opt-1.3b/bbf936a5-3594-4d0a-b5af-7a01740d0c81.json deleted file mode 100644 index fc192890e..000000000 --- a/data/hfopenllm_v2/facebook/opt-1.3b/bbf936a5-3594-4d0a-b5af-7a01740d0c81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/facebook_opt-1.3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "opt-1.3b", - "id": "facebook/opt-1.3b", - "developer": "facebook", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "OPTForCausalLM", - "params_billions": 1.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1107 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/facebook/opt-30b/1164abea-4cc2-46a7-a44b-f024a2ce40b4.json b/data/hfopenllm_v2/facebook/opt-30b/1164abea-4cc2-46a7-a44b-f024a2ce40b4.json deleted file mode 100644 index d05075cf6..000000000 --- a/data/hfopenllm_v2/facebook/opt-30b/1164abea-4cc2-46a7-a44b-f024a2ce40b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/facebook_opt-30b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "opt-30b", - "id": "facebook/opt-30b", - "developer": "facebook", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "OPTForCausalLM", - "params_billions": 30.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2453 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-MopeyMule/bfd88bec-fcc2-4580-a5c7-4792a0300a5b.json b/data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-MopeyMule/bfd88bec-fcc2-4580-a5c7-4792a0300a5b.json deleted file mode 100644 index c959c146e..000000000 --- a/data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-MopeyMule/bfd88bec-fcc2-4580-a5c7-4792a0300a5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/failspy_Llama-3-8B-Instruct-MopeyMule/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-MopeyMule", - "id": "failspy/Llama-3-8B-Instruct-MopeyMule", - "developer": "failspy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3839 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3513 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-abliterated/7f49e582-a01f-481f-8345-1c384fc8b567.json b/data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-abliterated/7f49e582-a01f-481f-8345-1c384fc8b567.json deleted file mode 100644 index b05984172..000000000 --- a/data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-abliterated/7f49e582-a01f-481f-8345-1c384fc8b567.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/failspy_Llama-3-8B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-abliterated", - "id": "failspy/Llama-3-8B-Instruct-abliterated", - "developer": "failspy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5909 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4116 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2742 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/failspy/Meta-Llama-3-70B-Instruct-abliterated-v3.5/10937ed1-56e2-4aad-b717-5125bc8ac72a.json b/data/hfopenllm_v2/failspy/Meta-Llama-3-70B-Instruct-abliterated-v3.5/10937ed1-56e2-4aad-b717-5125bc8ac72a.json deleted file mode 100644 index 794c2e176..000000000 --- a/data/hfopenllm_v2/failspy/Meta-Llama-3-70B-Instruct-abliterated-v3.5/10937ed1-56e2-4aad-b717-5125bc8ac72a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/failspy_Meta-Llama-3-70B-Instruct-abliterated-v3.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-70B-Instruct-abliterated-v3.5", - "id": "failspy/Meta-Llama-3-70B-Instruct-abliterated-v3.5", - "developer": "failspy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7747 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5747 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3982 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/failspy/Meta-Llama-3-8B-Instruct-abliterated-v3/f4622539-c0ac-4e9f-86d4-00e3c826d03b.json 
b/data/hfopenllm_v2/failspy/Meta-Llama-3-8B-Instruct-abliterated-v3/f4622539-c0ac-4e9f-86d4-00e3c826d03b.json deleted file mode 100644 index eb2bf7ad8..000000000 --- a/data/hfopenllm_v2/failspy/Meta-Llama-3-8B-Instruct-abliterated-v3/f4622539-c0ac-4e9f-86d4-00e3c826d03b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/failspy_Meta-Llama-3-8B-Instruct-abliterated-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-8B-Instruct-abliterated-v3", - "id": "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3", - "developer": "failspy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4925 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3622 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3654 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/failspy/Phi-3-medium-4k-instruct-abliterated-v3/6b13b2b1-68cd-4aae-8f2b-2400f40760d7.json b/data/hfopenllm_v2/failspy/Phi-3-medium-4k-instruct-abliterated-v3/6b13b2b1-68cd-4aae-8f2b-2400f40760d7.json deleted file mode 100644 index 
8fbcaddc2..000000000 --- a/data/hfopenllm_v2/failspy/Phi-3-medium-4k-instruct-abliterated-v3/6b13b2b1-68cd-4aae-8f2b-2400f40760d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/failspy_Phi-3-medium-4k-instruct-abliterated-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-medium-4k-instruct-abliterated-v3", - "id": "failspy/Phi-3-medium-4k-instruct-abliterated-v3", - "developer": "failspy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6319 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4604 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/failspy/llama-3-70B-Instruct-abliterated/5b02726c-ba3f-482b-9f10-87b8d69ffeb4.json b/data/hfopenllm_v2/failspy/llama-3-70B-Instruct-abliterated/5b02726c-ba3f-482b-9f10-87b8d69ffeb4.json deleted file mode 100644 index c9da5b3a7..000000000 --- a/data/hfopenllm_v2/failspy/llama-3-70B-Instruct-abliterated/5b02726c-ba3f-482b-9f10-87b8d69ffeb4.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/failspy_llama-3-70B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-70B-Instruct-abliterated", - "id": "failspy/llama-3-70B-Instruct-abliterated", - "developer": "failspy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8023 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2432 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4128 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/21d6f2dd-7bd6-42a9-b14e-c25777497890.json b/data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/21d6f2dd-7bd6-42a9-b14e-c25777497890.json deleted file mode 100644 index bfc6fcad1..000000000 --- a/data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/21d6f2dd-7bd6-42a9-b14e-c25777497890.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_TheBeagle-v2beta-32B-MGS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": 
"HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TheBeagle-v2beta-32B-MGS", - "id": "fblgit/TheBeagle-v2beta-32B-MGS", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5181 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7033 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4947 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5915 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/d0bc11cb-56ff-4c77-9446-e76e550e0919.json b/data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/d0bc11cb-56ff-4c77-9446-e76e550e0919.json deleted file mode 100644 index 6080ab1e6..000000000 --- a/data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/d0bc11cb-56ff-4c77-9446-e76e550e0919.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_TheBeagle-v2beta-32B-MGS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TheBeagle-v2beta-32B-MGS", - "id": 
"fblgit/TheBeagle-v2beta-32B-MGS", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4503 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7035 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.401 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5911 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/UNA-SimpleSmaug-34b-v1beta/ff78dc97-e9cf-4215-a607-3e80892af82c.json b/data/hfopenllm_v2/fblgit/UNA-SimpleSmaug-34b-v1beta/ff78dc97-e9cf-4215-a607-3e80892af82c.json deleted file mode 100644 index a5cda2338..000000000 --- a/data/hfopenllm_v2/fblgit/UNA-SimpleSmaug-34b-v1beta/ff78dc97-e9cf-4215-a607-3e80892af82c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_UNA-SimpleSmaug-34b-v1beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "UNA-SimpleSmaug-34b-v1beta", - "id": "fblgit/UNA-SimpleSmaug-34b-v1beta", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4556 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.454 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/UNA-TheBeagle-7b-v1/0ff1c6ff-5404-4d61-b6c6-f6ef7ae9ca8b.json b/data/hfopenllm_v2/fblgit/UNA-TheBeagle-7b-v1/0ff1c6ff-5404-4d61-b6c6-f6ef7ae9ca8b.json deleted file mode 100644 index 4d873390f..000000000 --- a/data/hfopenllm_v2/fblgit/UNA-TheBeagle-7b-v1/0ff1c6ff-5404-4d61-b6c6-f6ef7ae9ca8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_UNA-TheBeagle-7b-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "UNA-TheBeagle-7b-v1", - "id": "fblgit/UNA-TheBeagle-7b-v1", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3689 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5029 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4564 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3019 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/UNA-ThePitbull-21.4B-v2/48837141-2556-4658-87e0-bb88cfcd562a.json b/data/hfopenllm_v2/fblgit/UNA-ThePitbull-21.4B-v2/48837141-2556-4658-87e0-bb88cfcd562a.json deleted file mode 100644 index a60c55895..000000000 --- a/data/hfopenllm_v2/fblgit/UNA-ThePitbull-21.4B-v2/48837141-2556-4658-87e0-bb88cfcd562a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_UNA-ThePitbull-21.4B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "UNA-ThePitbull-21.4B-v2", - "id": "fblgit/UNA-ThePitbull-21.4B-v2", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.421 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-MGS/f2d6da5d-3685-43de-8ceb-5b798f88e24c.json b/data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-MGS/f2d6da5d-3685-43de-8ceb-5b798f88e24c.json deleted file mode 100644 index 8b3f04151..000000000 --- a/data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-MGS/f2d6da5d-3685-43de-8ceb-5b798f88e24c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_cybertron-v4-qw7B-MGS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cybertron-v4-qw7B-MGS", - "id": "fblgit/cybertron-v4-qw7B-MGS", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6264 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5592 - } - }, - { 
- "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3489 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-UNAMGS/9ec02ccd-329a-4d62-9f04-87de6fda5011.json b/data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-UNAMGS/9ec02ccd-329a-4d62-9f04-87de6fda5011.json deleted file mode 100644 index 4e95a568a..000000000 --- a/data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-UNAMGS/9ec02ccd-329a-4d62-9f04-87de6fda5011.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_cybertron-v4-qw7B-UNAMGS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cybertron-v4-qw7B-UNAMGS", - "id": "fblgit/cybertron-v4-qw7B-UNAMGS", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3731 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/juanako-7b-UNA/781d0332-e332-4ff7-8585-9c2d8395a147.json b/data/hfopenllm_v2/fblgit/juanako-7b-UNA/781d0332-e332-4ff7-8585-9c2d8395a147.json deleted file mode 100644 index 0a2dc5709..000000000 --- a/data/hfopenllm_v2/fblgit/juanako-7b-UNA/781d0332-e332-4ff7-8585-9c2d8395a147.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_juanako-7b-UNA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "juanako-7b-UNA", - "id": "fblgit/juanako-7b-UNA", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4837 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2771 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS-GRPO/d6dd460e-c352-4d31-8941-183c6eabd0a7.json b/data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS-GRPO/d6dd460e-c352-4d31-8941-183c6eabd0a7.json deleted file mode 100644 index 5152ad99c..000000000 --- a/data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS-GRPO/d6dd460e-c352-4d31-8941-183c6eabd0a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_miniclaus-qw1.5B-UNAMGS-GRPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miniclaus-qw1.5B-UNAMGS-GRPO", - "id": "fblgit/miniclaus-qw1.5B-UNAMGS-GRPO", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4234 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS/66bf6442-04ea-437b-88c4-e61afc6f7139.json b/data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS/66bf6442-04ea-437b-88c4-e61afc6f7139.json deleted file mode 100644 index 5d5d165d7..000000000 --- a/data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS/66bf6442-04ea-437b-88c4-e61afc6f7139.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_miniclaus-qw1.5B-UNAMGS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miniclaus-qw1.5B-UNAMGS", - "id": "fblgit/miniclaus-qw1.5B-UNAMGS", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2937 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/pancho-v1-qw25-3B-UNAMGS/0d1911f5-a2e7-4511-a8d8-098cbf9207df.json b/data/hfopenllm_v2/fblgit/pancho-v1-qw25-3B-UNAMGS/0d1911f5-a2e7-4511-a8d8-098cbf9207df.json deleted file mode 100644 index 2dcb46652..000000000 --- a/data/hfopenllm_v2/fblgit/pancho-v1-qw25-3B-UNAMGS/0d1911f5-a2e7-4511-a8d8-098cbf9207df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_pancho-v1-qw25-3B-UNAMGS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pancho-v1-qw25-3B-UNAMGS", - "id": "fblgit/pancho-v1-qw25-3B-UNAMGS", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5361 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1571 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3766 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/una-cybertron-7b-v2-bf16/abc18648-ef96-4695-94d5-fa14be277431.json b/data/hfopenllm_v2/fblgit/una-cybertron-7b-v2-bf16/abc18648-ef96-4695-94d5-fa14be277431.json deleted file mode 100644 index 56fbad07e..000000000 --- a/data/hfopenllm_v2/fblgit/una-cybertron-7b-v2-bf16/abc18648-ef96-4695-94d5-fa14be277431.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_una-cybertron-7b-v2-bf16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "una-cybertron-7b-v2-bf16", - "id": "fblgit/una-cybertron-7b-v2-bf16", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4737 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3973 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2443 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fhai50032/RolePlayLake-7B/ff1e7aaa-3f29-4192-a0e0-80fcd11ba055.json b/data/hfopenllm_v2/fhai50032/RolePlayLake-7B/ff1e7aaa-3f29-4192-a0e0-80fcd11ba055.json deleted file mode 100644 index 5c8be692e..000000000 --- a/data/hfopenllm_v2/fhai50032/RolePlayLake-7B/ff1e7aaa-3f29-4192-a0e0-80fcd11ba055.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fhai50032_RolePlayLake-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RolePlayLake-7B", - "id": "fhai50032/RolePlayLake-7B", - "developer": "fhai50032", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5057 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5252 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fhai50032/Unaligned-Thinker-PHI-4/cc8ef5bd-957f-4308-9539-00a696182056.json 
b/data/hfopenllm_v2/fhai50032/Unaligned-Thinker-PHI-4/cc8ef5bd-957f-4308-9539-00a696182056.json deleted file mode 100644 index 97e284a57..000000000 --- a/data/hfopenllm_v2/fhai50032/Unaligned-Thinker-PHI-4/cc8ef5bd-957f-4308-9539-00a696182056.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fhai50032_Unaligned-Thinker-PHI-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Unaligned-Thinker-PHI-4", - "id": "fhai50032/Unaligned-Thinker-PHI-4", - "developer": "fhai50032", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0563 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4679 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/flammenai/Llama3.1-Flammades-70B/abc7652f-b88e-40ba-847c-c99dce9f2719.json b/data/hfopenllm_v2/flammenai/Llama3.1-Flammades-70B/abc7652f-b88e-40ba-847c-c99dce9f2719.json deleted file mode 100644 index 791c615fd..000000000 --- 
a/data/hfopenllm_v2/flammenai/Llama3.1-Flammades-70B/abc7652f-b88e-40ba-847c-c99dce9f2719.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/flammenai_Llama3.1-Flammades-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-Flammades-70B", - "id": "flammenai/Llama3.1-Flammades-70B", - "developer": "flammenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7058 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.666 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2092 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/flammenai/Mahou-1.2a-llama3-8B/56e36294-e616-45a1-8dc9-2c14cf3ee8d0.json b/data/hfopenllm_v2/flammenai/Mahou-1.2a-llama3-8B/56e36294-e616-45a1-8dc9-2c14cf3ee8d0.json deleted file mode 100644 index b1967f57c..000000000 --- a/data/hfopenllm_v2/flammenai/Mahou-1.2a-llama3-8B/56e36294-e616-45a1-8dc9-2c14cf3ee8d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/flammenai_Mahou-1.2a-llama3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mahou-1.2a-llama3-8B", - "id": "flammenai/Mahou-1.2a-llama3-8B", - "developer": "flammenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5093 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/flammenai/Mahou-1.2a-mistral-7B/4b81caad-92ed-4bd5-98bd-58582854b5d8.json b/data/hfopenllm_v2/flammenai/Mahou-1.2a-mistral-7B/4b81caad-92ed-4bd5-98bd-58582854b5d8.json deleted file mode 100644 index cca5cbc09..000000000 --- a/data/hfopenllm_v2/flammenai/Mahou-1.2a-mistral-7B/4b81caad-92ed-4bd5-98bd-58582854b5d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/flammenai_Mahou-1.2a-mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mahou-1.2a-mistral-7B", - "id": "flammenai/Mahou-1.2a-mistral-7B", - "developer": "flammenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4552 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5118 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/flammenai/Mahou-1.5-llama3.1-70B/2cef0040-6d4c-4c38-be40-5477911f3063.json b/data/hfopenllm_v2/flammenai/Mahou-1.5-llama3.1-70B/2cef0040-6d4c-4c38-be40-5477911f3063.json deleted file mode 100644 index 5ef38b931..000000000 --- a/data/hfopenllm_v2/flammenai/Mahou-1.5-llama3.1-70B/2cef0040-6d4c-4c38-be40-5477911f3063.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/flammenai_Mahou-1.5-llama3.1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mahou-1.5-llama3.1-70B", - "id": "flammenai/Mahou-1.5-llama3.1-70B", - "developer": "flammenai", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7147 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6651 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.495 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4749 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/flammenai/Mahou-1.5-mistral-nemo-12B/4aeef94f-823e-4be5-b4f1-37463e052748.json b/data/hfopenllm_v2/flammenai/Mahou-1.5-mistral-nemo-12B/4aeef94f-823e-4be5-b4f1-37463e052748.json deleted file mode 100644 index 818a33929..000000000 --- a/data/hfopenllm_v2/flammenai/Mahou-1.5-mistral-nemo-12B/4aeef94f-823e-4be5-b4f1-37463e052748.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/flammenai_Mahou-1.5-mistral-nemo-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mahou-1.5-mistral-nemo-12B", - "id": "flammenai/Mahou-1.5-mistral-nemo-12B", - "developer": "flammenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { 
- "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6751 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5522 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3602 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/flammenai/flammen15-gutenberg-DPO-v1-7B/3d367147-373f-4543-be19-55a6429558a2.json b/data/hfopenllm_v2/flammenai/flammen15-gutenberg-DPO-v1-7B/3d367147-373f-4543-be19-55a6429558a2.json deleted file mode 100644 index fd18593b8..000000000 --- a/data/hfopenllm_v2/flammenai/flammen15-gutenberg-DPO-v1-7B/3d367147-373f-4543-be19-55a6429558a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/flammenai_flammen15-gutenberg-DPO-v1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flammen15-gutenberg-DPO-v1-7B", - "id": "flammenai/flammen15-gutenberg-DPO-v1-7B", - "developer": "flammenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4798 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3186 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fluently-lm/FluentlyLM-Prinum/cb93091a-6c46-438a-b111-cbf7e2fac420.json b/data/hfopenllm_v2/fluently-lm/FluentlyLM-Prinum/cb93091a-6c46-438a-b111-cbf7e2fac420.json deleted file mode 100644 index 921cf029a..000000000 --- a/data/hfopenllm_v2/fluently-lm/FluentlyLM-Prinum/cb93091a-6c46-438a-b111-cbf7e2fac420.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fluently-lm_FluentlyLM-Prinum/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FluentlyLM-Prinum", - "id": "fluently-lm/FluentlyLM-Prinum", - "developer": "fluently-lm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7144 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4471 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5808 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fluently-lm/Llama-TI-8B-Instruct/ea6048f1-8be4-4ec8-a5d5-35ff1523d74a.json b/data/hfopenllm_v2/fluently-lm/Llama-TI-8B-Instruct/ea6048f1-8be4-4ec8-a5d5-35ff1523d74a.json deleted file mode 100644 index 9c0f198a8..000000000 --- a/data/hfopenllm_v2/fluently-lm/Llama-TI-8B-Instruct/ea6048f1-8be4-4ec8-a5d5-35ff1523d74a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fluently-lm_Llama-TI-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-TI-8B-Instruct", - "id": "fluently-lm/Llama-TI-8B-Instruct", - "developer": "fluently-lm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7716 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.5252 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2304 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3813 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3726 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fluently-lm/Llama-TI-8B/f4dc1659-800f-49d2-a290-48e9d4b15581.json b/data/hfopenllm_v2/fluently-lm/Llama-TI-8B/f4dc1659-800f-49d2-a290-48e9d4b15581.json deleted file mode 100644 index bacff0c4e..000000000 --- a/data/hfopenllm_v2/fluently-lm/Llama-TI-8B/f4dc1659-800f-49d2-a290-48e9d4b15581.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fluently-lm_Llama-TI-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-TI-8B", - "id": "fluently-lm/Llama-TI-8B", - "developer": "fluently-lm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5201 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fluently-sets/FalconThink3-10B-IT/d4d8a784-5bd5-4437-8e0d-75dcb967ae33.json b/data/hfopenllm_v2/fluently-sets/FalconThink3-10B-IT/d4d8a784-5bd5-4437-8e0d-75dcb967ae33.json deleted file mode 100644 index 9c1acfca1..000000000 --- a/data/hfopenllm_v2/fluently-sets/FalconThink3-10B-IT/d4d8a784-5bd5-4437-8e0d-75dcb967ae33.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fluently-sets_FalconThink3-10B-IT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconThink3-10B-IT", - "id": "fluently-sets/FalconThink3-10B-IT", - "developer": "fluently-sets", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4435 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fluently-sets/reasoning-1-1k-demo/91017e73-f33a-49f5-ac87-f6e6a178d885.json b/data/hfopenllm_v2/fluently-sets/reasoning-1-1k-demo/91017e73-f33a-49f5-ac87-f6e6a178d885.json deleted file mode 100644 index 3cfba9c12..000000000 --- a/data/hfopenllm_v2/fluently-sets/reasoning-1-1k-demo/91017e73-f33a-49f5-ac87-f6e6a178d885.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fluently-sets_reasoning-1-1k-demo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "reasoning-1-1k-demo", - "id": "fluently-sets/reasoning-1-1k-demo", - "developer": "fluently-sets", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7525 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4282 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4061 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4774 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp/b7a75bca-6afe-448a-8e5c-53ebd577c964.json b/data/hfopenllm_v2/formulae/mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp/b7a75bca-6afe-448a-8e5c-53ebd577c964.json deleted file mode 100644 index dc662457b..000000000 --- a/data/hfopenllm_v2/formulae/mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp/b7a75bca-6afe-448a-8e5c-53ebd577c964.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp", - "id": "formulae/mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1614 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2976 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { 
- "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4219 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1174 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-elite-v1.1-7b-2-25-2025/8cdced5c-23bc-4426-a0c9-b9bf82913683.json b/data/hfopenllm_v2/formulae/mita-elite-v1.1-7b-2-25-2025/8cdced5c-23bc-4426-a0c9-b9bf82913683.json deleted file mode 100644 index 735daf408..000000000 --- a/data/hfopenllm_v2/formulae/mita-elite-v1.1-7b-2-25-2025/8cdced5c-23bc-4426-a0c9-b9bf82913683.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-elite-v1.1-7b-2-25-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-elite-v1.1-7b-2-25-2025", - "id": "formulae/mita-elite-v1.1-7b-2-25-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.125 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2867 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-elite-v1.1-gen2-7b-2-25-2025/368784c8-6fc2-4340-8277-a6a9a9800a99.json b/data/hfopenllm_v2/formulae/mita-elite-v1.1-gen2-7b-2-25-2025/368784c8-6fc2-4340-8277-a6a9a9800a99.json deleted file mode 100644 index c6044805f..000000000 --- a/data/hfopenllm_v2/formulae/mita-elite-v1.1-gen2-7b-2-25-2025/368784c8-6fc2-4340-8277-a6a9a9800a99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-elite-v1.1-gen2-7b-2-25-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-elite-v1.1-gen2-7b-2-25-2025", - "id": "formulae/mita-elite-v1.1-gen2-7b-2-25-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1411 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2924 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1101 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-elite-v1.2-7b-2-26-2025/f7ddf26b-4b4c-404b-b9d3-6ceaf78d39aa.json b/data/hfopenllm_v2/formulae/mita-elite-v1.2-7b-2-26-2025/f7ddf26b-4b4c-404b-b9d3-6ceaf78d39aa.json deleted file mode 100644 index 218786c8e..000000000 --- a/data/hfopenllm_v2/formulae/mita-elite-v1.2-7b-2-26-2025/f7ddf26b-4b4c-404b-b9d3-6ceaf78d39aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-elite-v1.2-7b-2-26-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-elite-v1.2-7b-2-26-2025", - "id": "formulae/mita-elite-v1.2-7b-2-26-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-gen3-7b-2-26-2025/f423b0d1-3536-4865-9615-f89b9d15b14c.json b/data/hfopenllm_v2/formulae/mita-gen3-7b-2-26-2025/f423b0d1-3536-4865-9615-f89b9d15b14c.json deleted file mode 100644 index 45585cf79..000000000 --- a/data/hfopenllm_v2/formulae/mita-gen3-7b-2-26-2025/f423b0d1-3536-4865-9615-f89b9d15b14c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-gen3-7b-2-26-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-gen3-7b-2-26-2025", - "id": "formulae/mita-gen3-7b-2-26-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2916 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3912 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1124 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/formulae/mita-gen3-v1.2-7b-2-26-2025/c7e8333d-1d79-4cfa-9833-fa42f9fcbb4b.json b/data/hfopenllm_v2/formulae/mita-gen3-v1.2-7b-2-26-2025/c7e8333d-1d79-4cfa-9833-fa42f9fcbb4b.json deleted file mode 100644 index edcc1062c..000000000 --- a/data/hfopenllm_v2/formulae/mita-gen3-v1.2-7b-2-26-2025/c7e8333d-1d79-4cfa-9833-fa42f9fcbb4b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-gen3-v1.2-7b-2-26-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-gen3-v1.2-7b-2-26-2025", - "id": "formulae/mita-gen3-v1.2-7b-2-26-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2044 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3058 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-math-v2.3-2-25-2025/b6149d15-3e0f-43d2-ae90-eca290a94edb.json b/data/hfopenllm_v2/formulae/mita-math-v2.3-2-25-2025/b6149d15-3e0f-43d2-ae90-eca290a94edb.json deleted file 
mode 100644 index e68dfb118..000000000 --- a/data/hfopenllm_v2/formulae/mita-math-v2.3-2-25-2025/b6149d15-3e0f-43d2-ae90-eca290a94edb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-math-v2.3-2-25-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-math-v2.3-2-25-2025", - "id": "formulae/mita-math-v2.3-2-25-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1373 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2949 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-v1-7b/e21f5d83-6b71-488d-ad55-d23268fbd611.json b/data/hfopenllm_v2/formulae/mita-v1-7b/e21f5d83-6b71-488d-ad55-d23268fbd611.json deleted file mode 100644 index fd92b7206..000000000 --- a/data/hfopenllm_v2/formulae/mita-v1-7b/e21f5d83-6b71-488d-ad55-d23268fbd611.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/formulae_mita-v1-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-v1-7b", - "id": "formulae/mita-v1-7b", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1972 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4152 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-v1.1-7b-2-24-2025/68e1a42e-4318-4b5a-a45b-2607b7c2fe05.json b/data/hfopenllm_v2/formulae/mita-v1.1-7b-2-24-2025/68e1a42e-4318-4b5a-a45b-2607b7c2fe05.json deleted file mode 100644 index 354c32cae..000000000 --- a/data/hfopenllm_v2/formulae/mita-v1.1-7b-2-24-2025/68e1a42e-4318-4b5a-a45b-2607b7c2fe05.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-v1.1-7b-2-24-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-v1.1-7b-2-24-2025", - "id": "formulae/mita-v1.1-7b-2-24-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5442 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4557 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-v1.2-7b-2-24-2025/12a03ffb-d66b-4d00-a43b-fd5be80e1b07.json b/data/hfopenllm_v2/formulae/mita-v1.2-7b-2-24-2025/12a03ffb-d66b-4d00-a43b-fd5be80e1b07.json deleted file mode 100644 index 514af2af5..000000000 --- a/data/hfopenllm_v2/formulae/mita-v1.2-7b-2-24-2025/12a03ffb-d66b-4d00-a43b-fd5be80e1b07.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-v1.2-7b-2-24-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-v1.2-7b-2-24-2025", - "id": "formulae/mita-v1.2-7b-2-24-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4919 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4879 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4344 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3359 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/frameai/Loxa-4B/adbad8dc-7d13-44cc-a5c6-e8da1de27c37.json b/data/hfopenllm_v2/frameai/Loxa-4B/adbad8dc-7d13-44cc-a5c6-e8da1de27c37.json deleted file mode 100644 index 3d74cbfc7..000000000 --- a/data/hfopenllm_v2/frameai/Loxa-4B/adbad8dc-7d13-44cc-a5c6-e8da1de27c37.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/frameai_Loxa-4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Loxa-4B", - "id": "frameai/Loxa-4B", - "developer": "frameai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.018 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4765 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3377 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/freewheelin/free-evo-qwen72b-v0.8-re/7fb595e5-abbc-43ff-8135-c4bb4a2ea593.json b/data/hfopenllm_v2/freewheelin/free-evo-qwen72b-v0.8-re/7fb595e5-abbc-43ff-8135-c4bb4a2ea593.json deleted file mode 100644 index 86d886da9..000000000 --- a/data/hfopenllm_v2/freewheelin/free-evo-qwen72b-v0.8-re/7fb595e5-abbc-43ff-8135-c4bb4a2ea593.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/freewheelin_free-evo-qwen72b-v0.8-re/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "free-evo-qwen72b-v0.8-re", - "id": "freewheelin/free-evo-qwen72b-v0.8-re", - "developer": "freewheelin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 72.288 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6127 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.487 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.1/1bb09da7-1675-4e57-b46a-9791c888ce6f.json b/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.1/1bb09da7-1675-4e57-b46a-9791c888ce6f.json deleted file mode 100644 index d246d43e9..000000000 --- a/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.1/1bb09da7-1675-4e57-b46a-9791c888ce6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/freewheelin_free-solar-evo-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "free-solar-evo-v0.1", - "id": "freewheelin/free-solar-evo-v0.1", - "developer": "freewheelin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.205 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4502 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.11/3ed7dd5a-e431-480a-91a7-5ccd915057e4.json b/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.11/3ed7dd5a-e431-480a-91a7-5ccd915057e4.json deleted file mode 100644 index 75ddbaee6..000000000 --- a/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.11/3ed7dd5a-e431-480a-91a7-5ccd915057e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/freewheelin_free-solar-evo-v0.11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "free-solar-evo-v0.11", - "id": "freewheelin/free-solar-evo-v0.11", - "developer": "freewheelin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2027 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3467 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.13/9cab35b6-d6a7-475e-b715-e4493d07cd92.json b/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.13/9cab35b6-d6a7-475e-b715-e4493d07cd92.json deleted file mode 100644 index e1d46c7d8..000000000 --- a/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.13/9cab35b6-d6a7-475e-b715-e4493d07cd92.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/freewheelin_free-solar-evo-v0.13/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "free-solar-evo-v0.13", - "id": "freewheelin/free-solar-evo-v0.13", - "developer": "freewheelin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.347 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fulim/FineLlama-3.1-8B/ef7149ae-8d50-4890-89ae-fb561a86d130.json b/data/hfopenllm_v2/fulim/FineLlama-3.1-8B/ef7149ae-8d50-4890-89ae-fb561a86d130.json deleted file mode 100644 index 9613a08ac..000000000 --- a/data/hfopenllm_v2/fulim/FineLlama-3.1-8B/ef7149ae-8d50-4890-89ae-fb561a86d130.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fulim_FineLlama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineLlama-3.1-8B", - "id": "fulim/FineLlama-3.1-8B", - "developer": "fulim", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1439 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gabrielmbmb/SmolLM-1.7B-Instruct-IFEval/3fa14e1f-82a5-4c04-9c76-2a3f6d56aa81.json b/data/hfopenllm_v2/gabrielmbmb/SmolLM-1.7B-Instruct-IFEval/3fa14e1f-82a5-4c04-9c76-2a3f6d56aa81.json deleted file mode 100644 index 690658385..000000000 --- a/data/hfopenllm_v2/gabrielmbmb/SmolLM-1.7B-Instruct-IFEval/3fa14e1f-82a5-4c04-9c76-2a3f6d56aa81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gabrielmbmb_SmolLM-1.7B-Instruct-IFEval/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM-1.7B-Instruct-IFEval", - "id": "gabrielmbmb/SmolLM-1.7B-Instruct-IFEval", - "developer": "gabrielmbmb", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2306 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA/4418c7d1-72da-4ed3-9d5c-9d8520f6641c.json b/data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA/4418c7d1-72da-4ed3-9d5c-9d8520f6641c.json deleted file mode 100644 index f193f5a9b..000000000 --- a/data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA/4418c7d1-72da-4ed3-9d5c-9d8520f6641c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gaverfraxz_Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA", - "id": "gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA", - "developer": "gaverfraxz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4009 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3985 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1654 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES/8fe13380-a045-4d63-96f8-ec977540478c.json b/data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES/8fe13380-a045-4d63-96f8-ec977540478c.json deleted file mode 100644 index 280b09aa0..000000000 --- a/data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES/8fe13380-a045-4d63-96f8-ec977540478c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gaverfraxz_Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES", - "id": "gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES", - "developer": "gaverfraxz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4551 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gbueno86/Brinebreath-Llama-3.1-70B/6da42427-c7de-4830-b368-ca7757ee1d51.json b/data/hfopenllm_v2/gbueno86/Brinebreath-Llama-3.1-70B/6da42427-c7de-4830-b368-ca7757ee1d51.json deleted file mode 100644 index 700b6cf24..000000000 --- a/data/hfopenllm_v2/gbueno86/Brinebreath-Llama-3.1-70B/6da42427-c7de-4830-b368-ca7757ee1d51.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gbueno86_Brinebreath-Llama-3.1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Brinebreath-Llama-3.1-70B", - "id": "gbueno86/Brinebreath-Llama-3.1-70B", - "developer": "gbueno86", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5533 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6881 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2976 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4541 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gbueno86/Meta-LLama-3-Cat-Smaug-LLama-70b/5faf24b3-38af-4f3f-8377-bba70d75f8df.json b/data/hfopenllm_v2/gbueno86/Meta-LLama-3-Cat-Smaug-LLama-70b/5faf24b3-38af-4f3f-8377-bba70d75f8df.json deleted file mode 100644 index 8becc522b..000000000 --- a/data/hfopenllm_v2/gbueno86/Meta-LLama-3-Cat-Smaug-LLama-70b/5faf24b3-38af-4f3f-8377-bba70d75f8df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gbueno86_Meta-LLama-3-Cat-Smaug-LLama-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-LLama-3-Cat-Smaug-LLama-70b", - "id": "gbueno86/Meta-LLama-3-Cat-Smaug-LLama-70b", - "developer": "gbueno86", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8072 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6674 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2938 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ghost-x/ghost-8b-beta-1608/9a26214c-2601-49be-b1b1-03796b704059.json b/data/hfopenllm_v2/ghost-x/ghost-8b-beta-1608/9a26214c-2601-49be-b1b1-03796b704059.json deleted file mode 100644 index e2d293c3a..000000000 --- a/data/hfopenllm_v2/ghost-x/ghost-8b-beta-1608/9a26214c-2601-49be-b1b1-03796b704059.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ghost-x_ghost-8b-beta-1608/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ghost-8b-beta-1608", - "id": "ghost-x/ghost-8b-beta-1608", - "developer": "ghost-x", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.284 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/glaiveai/Reflection-Llama-3.1-70B/fa71ed09-45d4-4a5b-bfb1-a61a359a8f0c.json b/data/hfopenllm_v2/glaiveai/Reflection-Llama-3.1-70B/fa71ed09-45d4-4a5b-bfb1-a61a359a8f0c.json deleted file mode 100644 index 3344ef2b5..000000000 --- a/data/hfopenllm_v2/glaiveai/Reflection-Llama-3.1-70B/fa71ed09-45d4-4a5b-bfb1-a61a359a8f0c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/glaiveai_Reflection-Llama-3.1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflection-Llama-3.1-70B", - "id": "glaiveai/Reflection-Llama-3.1-70B", - "developer": "glaiveai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 69.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5991 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5681 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2757 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6341 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gmonsoon/SahabatAI-Llama-11B-Test/25c5b304-46d3-4df3-9ac3-75ffa972849a.json b/data/hfopenllm_v2/gmonsoon/SahabatAI-Llama-11B-Test/25c5b304-46d3-4df3-9ac3-75ffa972849a.json deleted file mode 100644 index 
579c66afe..000000000 --- a/data/hfopenllm_v2/gmonsoon/SahabatAI-Llama-11B-Test/25c5b304-46d3-4df3-9ac3-75ffa972849a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gmonsoon_SahabatAI-Llama-11B-Test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SahabatAI-Llama-11B-Test", - "id": "gmonsoon/SahabatAI-Llama-11B-Test", - "developer": "gmonsoon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 11.52 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3376 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4001 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gmonsoon/SahabatAI-MediChatIndo-8B-v1/88ed0272-39f8-4676-970a-525aee058991.json b/data/hfopenllm_v2/gmonsoon/SahabatAI-MediChatIndo-8B-v1/88ed0272-39f8-4676-970a-525aee058991.json deleted file mode 100644 index 4c3ee6998..000000000 --- a/data/hfopenllm_v2/gmonsoon/SahabatAI-MediChatIndo-8B-v1/88ed0272-39f8-4676-970a-525aee058991.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/gmonsoon_SahabatAI-MediChatIndo-8B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SahabatAI-MediChatIndo-8B-v1", - "id": "gmonsoon/SahabatAI-MediChatIndo-8B-v1", - "developer": "gmonsoon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4163 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gmonsoon/SahabatAI-Rebase-8B-Test/d8eff5d0-061b-4b83-b96a-04f9ba47ea6c.json b/data/hfopenllm_v2/gmonsoon/SahabatAI-Rebase-8B-Test/d8eff5d0-061b-4b83-b96a-04f9ba47ea6c.json deleted file mode 100644 index 58ed22e54..000000000 --- a/data/hfopenllm_v2/gmonsoon/SahabatAI-Rebase-8B-Test/d8eff5d0-061b-4b83-b96a-04f9ba47ea6c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gmonsoon_SahabatAI-Rebase-8B-Test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SahabatAI-Rebase-8B-Test", - "id": "gmonsoon/SahabatAI-Rebase-8B-Test", - "developer": "gmonsoon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5156 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4133 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gmonsoon/StockSeaLLMs-7B-v1/dcb90e75-8709-4729-8c00-e756e6a9a49d.json b/data/hfopenllm_v2/gmonsoon/StockSeaLLMs-7B-v1/dcb90e75-8709-4729-8c00-e756e6a9a49d.json deleted file mode 100644 index 0e6375ecc..000000000 --- a/data/hfopenllm_v2/gmonsoon/StockSeaLLMs-7B-v1/dcb90e75-8709-4729-8c00-e756e6a9a49d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gmonsoon_StockSeaLLMs-7B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StockSeaLLMs-7B-v1", - "id": "gmonsoon/StockSeaLLMs-7B-v1", - "developer": "gmonsoon", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4214 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3952 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gmonsoon/gemma2-9b-sahabatai-v1-instruct-BaseTIES/81dcf3ca-f5c2-40a1-8871-b0188d5e9ceb.json b/data/hfopenllm_v2/gmonsoon/gemma2-9b-sahabatai-v1-instruct-BaseTIES/81dcf3ca-f5c2-40a1-8871-b0188d5e9ceb.json deleted file mode 100644 index 61f640923..000000000 --- a/data/hfopenllm_v2/gmonsoon/gemma2-9b-sahabatai-v1-instruct-BaseTIES/81dcf3ca-f5c2-40a1-8871-b0188d5e9ceb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gmonsoon_gemma2-9b-sahabatai-v1-instruct-BaseTIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma2-9b-sahabatai-v1-instruct-BaseTIES", - "id": "gmonsoon/gemma2-9b-sahabatai-v1-instruct-BaseTIES", - "developer": "gmonsoon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6077 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1994 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4347 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_full_2/0a0a4d32-c7a9-49c9-bba4-dae6b464a5b6.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_full_2/0a0a4d32-c7a9-49c9-bba4-dae6b464a5b6.json deleted file mode 100644 index cf37f2ef2..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_full_2/0a0a4d32-c7a9-49c9-bba4-dae6b464a5b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_full_2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_full_2", - "id": "godlikehhd/alpaca_data_full_2", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3178 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2854 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_full_3B/82a3a8ef-7e5f-48d0-a48e-41ea2c5b6452.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_full_3B/82a3a8ef-7e5f-48d0-a48e-41ea2c5b6452.json deleted file mode 100644 index 4c1cc5329..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_full_3B/82a3a8ef-7e5f-48d0-a48e-41ea2c5b6452.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_full_3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_full_3B", - "id": "godlikehhd/alpaca_data_full_3B", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3696 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4684 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600/e635e798-fa85-4430-bf1e-9d5ad7fe9f22.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600/e635e798-fa85-4430-bf1e-9d5ad7fe9f22.json deleted file mode 100644 index dae6a5835..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600/e635e798-fa85-4430-bf1e-9d5ad7fe9f22.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ifd_max_2600/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ifd_max_2600", - "id": "godlikehhd/alpaca_data_ifd_max_2600", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3509 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2916 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600_3B/7ccaa29a-4f73-4794-83a2-b925d755d91e.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600_3B/7ccaa29a-4f73-4794-83a2-b925d755d91e.json deleted file mode 100644 index 3e8447d76..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600_3B/7ccaa29a-4f73-4794-83a2-b925d755d91e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ifd_max_2600_3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ifd_max_2600_3B", - "id": "godlikehhd/alpaca_data_ifd_max_2600_3B", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2982 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_me_max_5200/ba8de8f6-c118-4bc3-ae8d-851e964684ed.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_me_max_5200/ba8de8f6-c118-4bc3-ae8d-851e964684ed.json deleted file mode 100644 index 6e4a08360..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_me_max_5200/ba8de8f6-c118-4bc3-ae8d-851e964684ed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ifd_me_max_5200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ifd_me_max_5200", - "id": "godlikehhd/alpaca_data_ifd_me_max_5200", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4153 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3483 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2982 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_min_2600/4011975a-e2a0-466a-9b34-923e1b4f8733.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_min_2600/4011975a-e2a0-466a-9b34-923e1b4f8733.json deleted file mode 100644 index bc8f77337..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_min_2600/4011975a-e2a0-466a-9b34-923e1b4f8733.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ifd_min_2600/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ifd_min_2600", - "id": "godlikehhd/alpaca_data_ifd_min_2600", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4219 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_ans_max_5200/8a172205-39c6-4dd1-86b2-11b234b37e3c.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_ans_max_5200/8a172205-39c6-4dd1-86b2-11b234b37e3c.json deleted file mode 100644 index 8ecd5019c..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_ans_max_5200/8a172205-39c6-4dd1-86b2-11b234b37e3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ins_ans_max_5200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ins_ans_max_5200", - "id": "godlikehhd/alpaca_data_ins_ans_max_5200", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3479 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3602 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2901 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_max_5200/495b2e8e-e2d8-4158-bc6e-7568604d44e9.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_max_5200/495b2e8e-e2d8-4158-bc6e-7568604d44e9.json deleted file mode 100644 index b1a0ba707..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_max_5200/495b2e8e-e2d8-4158-bc6e-7568604d44e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ins_max_5200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ins_max_5200", - "id": "godlikehhd/alpaca_data_ins_max_5200", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3614 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2916 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_2600/e6a97d0d-9dc3-43a5-a69f-8132e19f9c77.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_2600/e6a97d0d-9dc3-43a5-a69f-8132e19f9c77.json deleted file mode 100644 index 31d6adcd2..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_2600/e6a97d0d-9dc3-43a5-a69f-8132e19f9c77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ins_min_2600/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ins_min_2600", - "id": "godlikehhd/alpaca_data_ins_min_2600", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - 
}, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_5200/4aecfd45-f47b-4f02-a0ed-288cbef46a6f.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_5200/4aecfd45-f47b-4f02-a0ed-288cbef46a6f.json deleted file mode 100644 index b84f600b2..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_5200/4aecfd45-f47b-4f02-a0ed-288cbef46a6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ins_min_5200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ins_min_5200", - "id": "godlikehhd/alpaca_data_ins_min_5200", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3906 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2949 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_5200/a6f7bc45-c2b5-47d8-a062-60f20c3d7ea4.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_5200/a6f7bc45-c2b5-47d8-a062-60f20c3d7ea4.json deleted file mode 100644 index 64a2916b2..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_5200/a6f7bc45-c2b5-47d8-a062-60f20c3d7ea4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_sampled_ifd_5200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_sampled_ifd_5200", - "id": "godlikehhd/alpaca_data_sampled_ifd_5200", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2924 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1254 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3521 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2896 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_new_5200/c85c79d6-28e0-4deb-ad84-901b725aeca8.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_new_5200/c85c79d6-28e0-4deb-ad84-901b725aeca8.json deleted file mode 100644 index 081b3f249..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_new_5200/c85c79d6-28e0-4deb-ad84-901b725aeca8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_sampled_ifd_new_5200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_sampled_ifd_new_5200", - "id": "godlikehhd/alpaca_data_sampled_ifd_new_5200", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3613 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2925 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.1_2600/73271472-d06f-405b-af9d-2da7c17e1eb0.json 
b/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.1_2600/73271472-d06f-405b-af9d-2da7c17e1eb0.json deleted file mode 100644 index 3d3313dd2..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.1_2600/73271472-d06f-405b-af9d-2da7c17e1eb0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_score_max_0.1_2600/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_score_max_0.1_2600", - "id": "godlikehhd/alpaca_data_score_max_0.1_2600", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4252 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.3_2600/4e40bb43-c33d-4324-aa02-5bb7f88a5d1f.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.3_2600/4e40bb43-c33d-4324-aa02-5bb7f88a5d1f.json deleted file mode 100644 index 90908a71a..000000000 --- 
a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.3_2600/4e40bb43-c33d-4324-aa02-5bb7f88a5d1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_score_max_0.3_2600/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_score_max_0.3_2600", - "id": "godlikehhd/alpaca_data_score_max_0.3_2600", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4151 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3759 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2913 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.7_2600/9b36e4c0-0d13-4988-8145-b9254da2e76e.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.7_2600/9b36e4c0-0d13-4988-8145-b9254da2e76e.json deleted file mode 100644 index 97a5ad3b9..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.7_2600/9b36e4c0-0d13-4988-8145-b9254da2e76e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_score_max_0.7_2600/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_score_max_0.7_2600", - "id": "godlikehhd/alpaca_data_score_max_0.7_2600", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3469 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2983 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2500/6a464798-0111-4c71-b156-72a5aba1da63.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2500/6a464798-0111-4c71-b156-72a5aba1da63.json deleted file mode 100644 index 5e21ebf38..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2500/6a464798-0111-4c71-b156-72a5aba1da63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_score_max_2500/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM 
v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_score_max_2500", - "id": "godlikehhd/alpaca_data_score_max_2500", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0952 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3627 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.294 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2600_3B/78252135-f15b-427d-86de-c32cd3dbcd0f.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2600_3B/78252135-f15b-427d-86de-c32cd3dbcd0f.json deleted file mode 100644 index 6e58fff0e..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2600_3B/78252135-f15b-427d-86de-c32cd3dbcd0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_score_max_2600_3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"alpaca_data_score_max_2600_3B", - "id": "godlikehhd/alpaca_data_score_max_2600_3B", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3358 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4716 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_5200/c3b7bd57-9bc3-4d83-aad9-7d6315748c0a.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_5200/c3b7bd57-9bc3-4d83-aad9-7d6315748c0a.json deleted file mode 100644 index 25483eeb0..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_5200/c3b7bd57-9bc3-4d83-aad9-7d6315748c0a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_score_max_5200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_score_max_5200", - "id": "godlikehhd/alpaca_data_score_max_5200", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", 
- "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3878 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/ifd_2500_qwen/bce17582-e807-4b91-b0e7-0a890bf5eb24.json b/data/hfopenllm_v2/godlikehhd/ifd_2500_qwen/bce17582-e807-4b91-b0e7-0a890bf5eb24.json deleted file mode 100644 index 3d6fa2b02..000000000 --- a/data/hfopenllm_v2/godlikehhd/ifd_2500_qwen/bce17582-e807-4b91-b0e7-0a890bf5eb24.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_ifd_2500_qwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ifd_2500_qwen", - "id": "godlikehhd/ifd_2500_qwen", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3365 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2921 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/ifd_new_correct_all_sample_2500_qwen/f8371e81-f6d4-4441-bc6c-5d4a18da7d08.json b/data/hfopenllm_v2/godlikehhd/ifd_new_correct_all_sample_2500_qwen/f8371e81-f6d4-4441-bc6c-5d4a18da7d08.json deleted file mode 100644 index 9f6d40e63..000000000 --- a/data/hfopenllm_v2/godlikehhd/ifd_new_correct_all_sample_2500_qwen/f8371e81-f6d4-4441-bc6c-5d4a18da7d08.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_ifd_new_correct_all_sample_2500_qwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ifd_new_correct_all_sample_2500_qwen", - "id": "godlikehhd/ifd_new_correct_all_sample_2500_qwen", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.3376 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3562 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2889 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/ifd_new_correct_sample_2500_qwen/78407b2e-1f44-46f0-bc21-76bdc68f8d9c.json b/data/hfopenllm_v2/godlikehhd/ifd_new_correct_sample_2500_qwen/78407b2e-1f44-46f0-bc21-76bdc68f8d9c.json deleted file mode 100644 index 72f6ab1df..000000000 --- a/data/hfopenllm_v2/godlikehhd/ifd_new_correct_sample_2500_qwen/78407b2e-1f44-46f0-bc21-76bdc68f8d9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_ifd_new_correct_sample_2500_qwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ifd_new_correct_sample_2500_qwen", - "id": "godlikehhd/ifd_new_correct_sample_2500_qwen", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3397 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3627 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2932 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/ifd_new_qwen_2500/bdb9e2d2-8d09-4994-a320-2f968bcb4898.json b/data/hfopenllm_v2/godlikehhd/ifd_new_qwen_2500/bdb9e2d2-8d09-4994-a320-2f968bcb4898.json deleted file mode 100644 index dd934c455..000000000 --- a/data/hfopenllm_v2/godlikehhd/ifd_new_qwen_2500/bdb9e2d2-8d09-4994-a320-2f968bcb4898.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_ifd_new_qwen_2500/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ifd_new_qwen_2500", - "id": "godlikehhd/ifd_new_qwen_2500", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.324 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.359 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/qwen-2.5-1.5b-cherry/c57d15c8-9581-4bb5-89e4-2fea1e3c584e.json b/data/hfopenllm_v2/godlikehhd/qwen-2.5-1.5b-cherry/c57d15c8-9581-4bb5-89e4-2fea1e3c584e.json deleted file mode 100644 index 0eae1c7ae..000000000 --- a/data/hfopenllm_v2/godlikehhd/qwen-2.5-1.5b-cherry/c57d15c8-9581-4bb5-89e4-2fea1e3c584e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_qwen-2.5-1.5b-cherry/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-2.5-1.5b-cherry", - "id": "godlikehhd/qwen-2.5-1.5b-cherry", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.772 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4036 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match 
on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/qwen_2.5-1.5b-cherry_new/550d5665-7a8a-437e-b318-000690dd250f.json b/data/hfopenllm_v2/godlikehhd/qwen_2.5-1.5b-cherry_new/550d5665-7a8a-437e-b318-000690dd250f.json deleted file mode 100644 index 8676d2e1f..000000000 --- a/data/hfopenllm_v2/godlikehhd/qwen_2.5-1.5b-cherry_new/550d5665-7a8a-437e-b318-000690dd250f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_qwen_2.5-1.5b-cherry_new/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen_2.5-1.5b-cherry_new", - "id": "godlikehhd/qwen_2.5-1.5b-cherry_new", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.312 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/qwen_full_data_alpaca/a1922f33-32f5-4f99-8df6-e2080808d292.json b/data/hfopenllm_v2/godlikehhd/qwen_full_data_alpaca/a1922f33-32f5-4f99-8df6-e2080808d292.json deleted file mode 100644 index 0a3408038..000000000 --- a/data/hfopenllm_v2/godlikehhd/qwen_full_data_alpaca/a1922f33-32f5-4f99-8df6-e2080808d292.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_qwen_full_data_alpaca/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen_full_data_alpaca", - "id": "godlikehhd/qwen_full_data_alpaca", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4229 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2851 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/qwen_ins_ans_2500/6ccc376b-24a4-42cc-8ea0-823ef14336db.json b/data/hfopenllm_v2/godlikehhd/qwen_ins_ans_2500/6ccc376b-24a4-42cc-8ea0-823ef14336db.json deleted file mode 100644 index 79114530d..000000000 --- a/data/hfopenllm_v2/godlikehhd/qwen_ins_ans_2500/6ccc376b-24a4-42cc-8ea0-823ef14336db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_qwen_ins_ans_2500/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen_ins_ans_2500", - "id": "godlikehhd/qwen_ins_ans_2500", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2698 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4074 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3589 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/codegemma-1.1-2b/6547b6f3-63dd-4516-b294-62c4246c3dc7.json b/data/hfopenllm_v2/google/codegemma-1.1-2b/6547b6f3-63dd-4516-b294-62c4246c3dc7.json deleted file mode 100644 index 262dce559..000000000 --- a/data/hfopenllm_v2/google/codegemma-1.1-2b/6547b6f3-63dd-4516-b294-62c4246c3dc7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_codegemma-1.1-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "codegemma-1.1-2b", - "id": "google/codegemma-1.1-2b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2294 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1278 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/flan-t5-base/a58bf2d3-d209-41b8-a795-ba7a16e4a28f.json b/data/hfopenllm_v2/google/flan-t5-base/a58bf2d3-d209-41b8-a795-ba7a16e4a28f.json deleted file mode 100644 index 42df32f4d..000000000 --- a/data/hfopenllm_v2/google/flan-t5-base/a58bf2d3-d209-41b8-a795-ba7a16e4a28f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_flan-t5-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flan-t5-base", - "id": "google/flan-t5-base", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 0.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1891 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/flan-t5-large/b15ad3b5-7ef2-439e-9acd-a85eab520d31.json b/data/hfopenllm_v2/google/flan-t5-large/b15ad3b5-7ef2-439e-9acd-a85eab520d31.json deleted file mode 100644 index 12a83a4b2..000000000 --- a/data/hfopenllm_v2/google/flan-t5-large/b15ad3b5-7ef2-439e-9acd-a85eab520d31.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_flan-t5-large/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flan-t5-large", - "id": "google/flan-t5-large", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 0.783 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2201 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4153 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4083 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1709 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/flan-t5-small/64da2654-9fdb-4a08-ad16-cf8793a30ed8.json b/data/hfopenllm_v2/google/flan-t5-small/64da2654-9fdb-4a08-ad16-cf8793a30ed8.json deleted file mode 100644 index 736e0e185..000000000 --- 
a/data/hfopenllm_v2/google/flan-t5-small/64da2654-9fdb-4a08-ad16-cf8793a30ed8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_flan-t5-small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flan-t5-small", - "id": "google/flan-t5-small", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 0.077 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1524 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3283 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1233 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/flan-t5-xl/37080215-ee30-4e59-a407-b14695ac2a38.json b/data/hfopenllm_v2/google/flan-t5-xl/37080215-ee30-4e59-a407-b14695ac2a38.json deleted file mode 100644 index 5eaa5962e..000000000 --- a/data/hfopenllm_v2/google/flan-t5-xl/37080215-ee30-4e59-a407-b14695ac2a38.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_flan-t5-xl/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flan-t5-xl", - "id": "google/flan-t5-xl", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 2.85 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2237 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4181 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/flan-t5-xl/b83a0ce7-bf13-4a98-81f3-04e5a44105f7.json b/data/hfopenllm_v2/google/flan-t5-xl/b83a0ce7-bf13-4a98-81f3-04e5a44105f7.json deleted file mode 100644 index e4219b2cd..000000000 --- a/data/hfopenllm_v2/google/flan-t5-xl/b83a0ce7-bf13-4a98-81f3-04e5a44105f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_flan-t5-xl/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flan-t5-xl", - "id": "google/flan-t5-xl", - "developer": "google", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 2.85 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4537 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/flan-t5-xxl/bb7bea21-5bc6-460d-98ff-b3ed02d5b215.json b/data/hfopenllm_v2/google/flan-t5-xxl/bb7bea21-5bc6-460d-98ff-b3ed02d5b215.json deleted file mode 100644 index e29cc1652..000000000 --- a/data/hfopenllm_v2/google/flan-t5-xxl/bb7bea21-5bc6-460d-98ff-b3ed02d5b215.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_flan-t5-xxl/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flan-t5-xxl", - "id": "google/flan-t5-xxl", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 11.267 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.22 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4218 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/flan-ul2/da9ddecc-43cf-4055-a19e-795b1ee98826.json b/data/hfopenllm_v2/google/flan-ul2/da9ddecc-43cf-4055-a19e-795b1ee98826.json deleted file mode 100644 index efa59a78c..000000000 --- a/data/hfopenllm_v2/google/flan-ul2/da9ddecc-43cf-4055-a19e-795b1ee98826.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_flan-ul2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flan-ul2", - "id": "google/flan-ul2", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 19.46 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2493 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-1.1-2b-it/a93ccb3f-f2d9-415d-8397-0c7fb765fada.json b/data/hfopenllm_v2/google/gemma-1.1-2b-it/a93ccb3f-f2d9-415d-8397-0c7fb765fada.json deleted file mode 100644 index f13cf1295..000000000 --- a/data/hfopenllm_v2/google/gemma-1.1-2b-it/a93ccb3f-f2d9-415d-8397-0c7fb765fada.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-1.1-2b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-1.1-2b-it", - "id": "google/gemma-1.1-2b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3067 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3185 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1484 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-1.1-7b-it/d0f86765-bdb4-4367-986b-28303bbe1844.json b/data/hfopenllm_v2/google/gemma-1.1-7b-it/d0f86765-bdb4-4367-986b-28303bbe1844.json deleted file mode 100644 index 53d0c2f3f..000000000 --- a/data/hfopenllm_v2/google/gemma-1.1-7b-it/d0f86765-bdb4-4367-986b-28303bbe1844.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-1.1-7b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-1.1-7b-it", - "id": "google/gemma-1.1-7b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5039 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-27b-it/693bb191-ae83-49dc-9df1-2f68b1b5fe4a.json b/data/hfopenllm_v2/google/gemma-2-27b-it/693bb191-ae83-49dc-9df1-2f68b1b5fe4a.json deleted file mode 100644 index 3ca551306..000000000 --- a/data/hfopenllm_v2/google/gemma-2-27b-it/693bb191-ae83-49dc-9df1-2f68b1b5fe4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-27b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-27b-it", - "id": "google/gemma-2-27b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7978 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6451 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2387 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4451 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-27b/7b2c0b72-6421-4f33-8593-a4bbfd0c6d6b.json b/data/hfopenllm_v2/google/gemma-2-27b/7b2c0b72-6421-4f33-8593-a4bbfd0c6d6b.json deleted file mode 100644 index b5cad47da..000000000 --- a/data/hfopenllm_v2/google/gemma-2-27b/7b2c0b72-6421-4f33-8593-a4bbfd0c6d6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-27b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-27b", - "id": "google/gemma-2-27b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1662 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-2b-it/c4ee822f-fc8b-4523-95b6-7c3f12a334b3.json b/data/hfopenllm_v2/google/gemma-2-2b-it/c4ee822f-fc8b-4523-95b6-7c3f12a334b3.json deleted file mode 100644 index 9103c1efd..000000000 --- a/data/hfopenllm_v2/google/gemma-2-2b-it/c4ee822f-fc8b-4523-95b6-7c3f12a334b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-2b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-it", - "id": "google/gemma-2-2b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5668 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3929 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-2b-jpn-it/1810033a-185b-4c91-91d3-43b8f6c61443.json b/data/hfopenllm_v2/google/gemma-2-2b-jpn-it/1810033a-185b-4c91-91d3-43b8f6c61443.json deleted file mode 100644 index 4fde83e45..000000000 --- a/data/hfopenllm_v2/google/gemma-2-2b-jpn-it/1810033a-185b-4c91-91d3-43b8f6c61443.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-2b-jpn-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it", - "id": "google/gemma-2-2b-jpn-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5078 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2578 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-2b-jpn-it/beb721ae-a35c-4f6b-a80f-aac4835d5f8d.json b/data/hfopenllm_v2/google/gemma-2-2b-jpn-it/beb721ae-a35c-4f6b-a80f-aac4835d5f8d.json deleted file mode 100644 index 1442b3bca..000000000 --- a/data/hfopenllm_v2/google/gemma-2-2b-jpn-it/beb721ae-a35c-4f6b-a80f-aac4835d5f8d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-2b-jpn-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it", - "id": "google/gemma-2-2b-jpn-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5288 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2467 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-2b/cf20e77a-340f-4d8d-b593-9645bdfc5877.json b/data/hfopenllm_v2/google/gemma-2-2b/cf20e77a-340f-4d8d-b593-9645bdfc5877.json 
deleted file mode 100644 index c8343d88b..000000000 --- a/data/hfopenllm_v2/google/gemma-2-2b/cf20e77a-340f-4d8d-b593-9645bdfc5877.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b", - "id": "google/gemma-2-2b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2018 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3709 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4219 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2217 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-2b/eec73e49-ac2b-42ed-a115-76e45007cd5d.json b/data/hfopenllm_v2/google/gemma-2-2b/eec73e49-ac2b-42ed-a115-76e45007cd5d.json deleted file mode 100644 index 96086b370..000000000 --- a/data/hfopenllm_v2/google/gemma-2-2b/eec73e49-ac2b-42ed-a115-76e45007cd5d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", 
- "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b", - "id": "google/gemma-2-2b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1993 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.218 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-9b-it/aa06d058-87f9-4fde-ad53-139b29a71448.json b/data/hfopenllm_v2/google/gemma-2-9b-it/aa06d058-87f9-4fde-ad53-139b29a71448.json deleted file mode 100644 index 8f9ad5f6e..000000000 --- a/data/hfopenllm_v2/google/gemma-2-9b-it/aa06d058-87f9-4fde-ad53-139b29a71448.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-9b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it", - "id": "google/gemma-2-9b-it", - "developer": "google", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-9b/3f1d571a-fc42-411b-88ab-4700d5861367.json b/data/hfopenllm_v2/google/gemma-2-9b/3f1d571a-fc42-411b-88ab-4700d5861367.json deleted file mode 100644 index c496a90c9..000000000 --- a/data/hfopenllm_v2/google/gemma-2-9b/3f1d571a-fc42-411b-88ab-4700d5861367.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b", - "id": "google/gemma-2-9b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.204 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5377 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2b-it/74a56080-aeb2-4cc6-a825-bbe4d9a5900a.json b/data/hfopenllm_v2/google/gemma-2b-it/74a56080-aeb2-4cc6-a825-bbe4d9a5900a.json deleted file mode 100644 index e1752b69f..000000000 --- a/data/hfopenllm_v2/google/gemma-2b-it/74a56080-aeb2-4cc6-a825-bbe4d9a5900a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2b-it", - "id": "google/gemma-2b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.269 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": 
"BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3151 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2b/2eb433ba-5c93-4355-99dd-edcb65721603.json b/data/hfopenllm_v2/google/gemma-2b/2eb433ba-5c93-4355-99dd-edcb65721603.json deleted file mode 100644 index 3c6b79553..000000000 --- a/data/hfopenllm_v2/google/gemma-2b/2eb433ba-5c93-4355-99dd-edcb65721603.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2b", - "id": "google/gemma-2b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2038 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3366 - } - }, - { - "evaluation_name": "MATH Level 
5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1366 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-7b-it/826fc3ab-6ff8-44fa-a745-a0b80bcb2db4.json b/data/hfopenllm_v2/google/gemma-7b-it/826fc3ab-6ff8-44fa-a745-a0b80bcb2db4.json deleted file mode 100644 index 47f028978..000000000 --- a/data/hfopenllm_v2/google/gemma-7b-it/826fc3ab-6ff8-44fa-a745-a0b80bcb2db4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-7b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-7b-it", - "id": "google/gemma-7b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3868 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3646 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4274 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1695 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-7b/6da54964-e3b5-4567-8ce4-7e0f279af84f.json b/data/hfopenllm_v2/google/gemma-7b/6da54964-e3b5-4567-8ce4-7e0f279af84f.json deleted file mode 100644 index 0bb149bf8..000000000 --- a/data/hfopenllm_v2/google/gemma-7b/6da54964-e3b5-4567-8ce4-7e0f279af84f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-7b", - "id": "google/gemma-7b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4362 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2948 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/mt5-base/a7dde688-a0ae-4731-909f-0bef0c6eeba9.json b/data/hfopenllm_v2/google/mt5-base/a7dde688-a0ae-4731-909f-0bef0c6eeba9.json deleted file mode 100644 index 3221afa06..000000000 --- a/data/hfopenllm_v2/google/mt5-base/a7dde688-a0ae-4731-909f-0bef0c6eeba9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_mt5-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mt5-base", - "id": "google/mt5-base", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MT5ForConditionalGeneration", - "params_billions": 0.39 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1645 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2883 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.107 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/mt5-small/eb2a8a60-2240-4b08-9dc3-be0215aa7bfc.json b/data/hfopenllm_v2/google/mt5-small/eb2a8a60-2240-4b08-9dc3-be0215aa7bfc.json deleted file mode 100644 index f462ca442..000000000 --- a/data/hfopenllm_v2/google/mt5-small/eb2a8a60-2240-4b08-9dc3-be0215aa7bfc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_mt5-small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mt5-small", - "id": "google/mt5-small", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MT5ForConditionalGeneration", - "params_billions": 0.17 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1718 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2766 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/mt5-xl/9b05919f-d7c1-4e04-9dd8-9ae70e0005e6.json b/data/hfopenllm_v2/google/mt5-xl/9b05919f-d7c1-4e04-9dd8-9ae70e0005e6.json deleted file mode 100644 index 53e4e6095..000000000 --- a/data/hfopenllm_v2/google/mt5-xl/9b05919f-d7c1-4e04-9dd8-9ae70e0005e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_mt5-xl/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mt5-xl", - "id": "google/mt5-xl", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MT5ForConditionalGeneration", - "params_billions": 3.23 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.196 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.112 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/google/mt5-xxl/6cd98538-74b6-4ac6-a3ac-9a311cfe47f6.json b/data/hfopenllm_v2/google/mt5-xxl/6cd98538-74b6-4ac6-a3ac-9a311cfe47f6.json deleted file mode 100644 index 151ffbc18..000000000 --- a/data/hfopenllm_v2/google/mt5-xxl/6cd98538-74b6-4ac6-a3ac-9a311cfe47f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_mt5-xxl/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mt5-xxl", - "id": "google/mt5-xxl", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 11.9 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2358 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2959 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3689 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1089 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/recurrentgemma-2b-it/b0ca2dec-387f-4b27-9adb-772af1899832.json b/data/hfopenllm_v2/google/recurrentgemma-2b-it/b0ca2dec-387f-4b27-9adb-772af1899832.json deleted file mode 100644 index a93defb31..000000000 --- a/data/hfopenllm_v2/google/recurrentgemma-2b-it/b0ca2dec-387f-4b27-9adb-772af1899832.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_recurrentgemma-2b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recurrentgemma-2b-it", - "id": "google/recurrentgemma-2b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "RecurrentGemmaForCausalLM", - "params_billions": 2.683 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/recurrentgemma-2b/53c4b397-b78e-4699-a01e-3535aa072225.json b/data/hfopenllm_v2/google/recurrentgemma-2b/53c4b397-b78e-4699-a01e-3535aa072225.json deleted file mode 100644 index ea66c049b..000000000 --- a/data/hfopenllm_v2/google/recurrentgemma-2b/53c4b397-b78e-4699-a01e-3535aa072225.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_recurrentgemma-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recurrentgemma-2b", - "id": "google/recurrentgemma-2b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "RecurrentGemmaForCausalLM", - "params_billions": 2.683 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3197 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1176 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/recurrentgemma-9b-it/f5b251f0-741c-4ad5-ab04-19c5202854ea.json b/data/hfopenllm_v2/google/recurrentgemma-9b-it/f5b251f0-741c-4ad5-ab04-19c5202854ea.json deleted file mode 100644 index 001b78f3b..000000000 --- a/data/hfopenllm_v2/google/recurrentgemma-9b-it/f5b251f0-741c-4ad5-ab04-19c5202854ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_recurrentgemma-9b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recurrentgemma-9b-it", - "id": "google/recurrentgemma-9b-it", - "developer": "google", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "RecurrentGemmaForCausalLM", - "params_billions": 9.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.501 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4367 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2843 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/recurrentgemma-9b/7b2ba13a-e01d-4442-9abe-d16df1a1668a.json b/data/hfopenllm_v2/google/recurrentgemma-9b/7b2ba13a-e01d-4442-9abe-d16df1a1668a.json deleted file mode 100644 index 549f07884..000000000 --- a/data/hfopenllm_v2/google/recurrentgemma-9b/7b2ba13a-e01d-4442-9abe-d16df1a1668a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_recurrentgemma-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recurrentgemma-9b", - "id": "google/recurrentgemma-9b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "RecurrentGemmaForCausalLM", - "params_billions": 9.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3116 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3956 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3803 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2605 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/switch-base-8/bf79f87c-3f14-49e8-acba-725e709d5f11.json b/data/hfopenllm_v2/google/switch-base-8/bf79f87c-3f14-49e8-acba-725e709d5f11.json deleted file mode 100644 index 1bc6517df..000000000 --- a/data/hfopenllm_v2/google/switch-base-8/bf79f87c-3f14-49e8-acba-725e709d5f11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_switch-base-8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "switch-base-8", - "id": "google/switch-base-8", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "SwitchTransformersForConditionalGeneration", - "params_billions": 0.62 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1585 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2876 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3517 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/umt5-base/3fbac7d4-cbbb-4b77-9db4-fd7e122cc90e.json b/data/hfopenllm_v2/google/umt5-base/3fbac7d4-cbbb-4b77-9db4-fd7e122cc90e.json deleted file mode 100644 index ee9655c66..000000000 --- a/data/hfopenllm_v2/google/umt5-base/3fbac7d4-cbbb-4b77-9db4-fd7e122cc90e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_umt5-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "umt5-base", - "id": "google/umt5-base", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "UMT5ForConditionalGeneration", - "params_billions": -1.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1746 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2788 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1078 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/goulue5/merging_LLM/6efd0dbd-b8c1-4c66-bdf7-19055c16ca22.json b/data/hfopenllm_v2/goulue5/merging_LLM/6efd0dbd-b8c1-4c66-bdf7-19055c16ca22.json deleted file mode 100644 index 384e12c7d..000000000 --- a/data/hfopenllm_v2/goulue5/merging_LLM/6efd0dbd-b8c1-4c66-bdf7-19055c16ca22.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/goulue5_merging_LLM/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merging_LLM", - "id": "goulue5/merging_LLM", - "developer": "goulue5", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3233 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4216 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4333 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2958 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gradientai/Llama-3-8B-Instruct-Gradient-1048k/1388b8d4-c711-480c-8a06-a8b7bd8aa79c.json b/data/hfopenllm_v2/gradientai/Llama-3-8B-Instruct-Gradient-1048k/1388b8d4-c711-480c-8a06-a8b7bd8aa79c.json deleted file mode 100644 index 62b5a87b9..000000000 --- a/data/hfopenllm_v2/gradientai/Llama-3-8B-Instruct-Gradient-1048k/1388b8d4-c711-480c-8a06-a8b7bd8aa79c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gradientai_Llama-3-8B-Instruct-Gradient-1048k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-Gradient-1048k", - "id": "gradientai/Llama-3-8B-Instruct-Gradient-1048k", - "developer": "gradientai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { 
- "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.294 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B/03393ffd-1923-4767-ba14-d0e3e6751842.json b/data/hfopenllm_v2/grimjim/DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B/03393ffd-1923-4767-ba14-d0e3e6751842.json deleted file mode 100644 index ff2605288..000000000 --- a/data/hfopenllm_v2/grimjim/DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B/03393ffd-1923-4767-ba14-d0e3e6751842.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B", - "id": "grimjim/DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4797 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5269 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2221 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3957 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Gigantes-v1-gemma2-9b-it/b7d049dc-127d-4075-8067-22adac9a58c3.json b/data/hfopenllm_v2/grimjim/Gigantes-v1-gemma2-9b-it/b7d049dc-127d-4075-8067-22adac9a58c3.json deleted file mode 100644 index fac07281a..000000000 --- a/data/hfopenllm_v2/grimjim/Gigantes-v1-gemma2-9b-it/b7d049dc-127d-4075-8067-22adac9a58c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Gigantes-v1-gemma2-9b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gigantes-v1-gemma2-9b-it", - "id": "grimjim/Gigantes-v1-gemma2-9b-it", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5978 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Gigantes-v2-gemma2-9b-it/89d79024-f4b8-4165-bd88-47f2b0010800.json b/data/hfopenllm_v2/grimjim/Gigantes-v2-gemma2-9b-it/89d79024-f4b8-4165-bd88-47f2b0010800.json deleted file mode 100644 index b451c7250..000000000 --- a/data/hfopenllm_v2/grimjim/Gigantes-v2-gemma2-9b-it/89d79024-f4b8-4165-bd88-47f2b0010800.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Gigantes-v2-gemma2-9b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gigantes-v2-gemma2-9b-it", - "id": "grimjim/Gigantes-v2-gemma2-9b-it", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7351 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5987 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Gigantes-v3-gemma2-9b-it/d2c0fb0d-6c0c-464a-b09f-6382a57b6afb.json b/data/hfopenllm_v2/grimjim/Gigantes-v3-gemma2-9b-it/d2c0fb0d-6c0c-464a-b09f-6382a57b6afb.json deleted file mode 100644 index 918da3c13..000000000 --- a/data/hfopenllm_v2/grimjim/Gigantes-v3-gemma2-9b-it/d2c0fb0d-6c0c-464a-b09f-6382a57b6afb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Gigantes-v3-gemma2-9b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gigantes-v3-gemma2-9b-it", - "id": "grimjim/Gigantes-v3-gemma2-9b-it", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6976 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4608 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/HuatuoSkywork-o1-Llama-3.1-8B/a891b28a-2dcc-4b8e-ad20-1f23d663b44b.json b/data/hfopenllm_v2/grimjim/HuatuoSkywork-o1-Llama-3.1-8B/a891b28a-2dcc-4b8e-ad20-1f23d663b44b.json deleted file mode 100644 index 6204ca20a..000000000 --- a/data/hfopenllm_v2/grimjim/HuatuoSkywork-o1-Llama-3.1-8B/a891b28a-2dcc-4b8e-ad20-1f23d663b44b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_HuatuoSkywork-o1-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HuatuoSkywork-o1-Llama-3.1-8B", - "id": "grimjim/HuatuoSkywork-o1-Llama-3.1-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3839 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge/55e274bb-1e2c-4402-b7ae-09ff7b1f9738.json b/data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge/55e274bb-1e2c-4402-b7ae-09ff7b1f9738.json deleted file mode 100644 index bd55b41d5..000000000 --- a/data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge/55e274bb-1e2c-4402-b7ae-09ff7b1f9738.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge", - "id": "grimjim/Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4271 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge/fe7a6940-fc4c-4345-84be-609c8155be57.json b/data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge/fe7a6940-fc4c-4345-84be-609c8155be57.json deleted file mode 100644 index c0a6840ce..000000000 --- a/data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge/fe7a6940-fc4c-4345-84be-609c8155be57.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge", - "id": "grimjim/Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6806 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5022 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/grimjim/Llama-3.1-8B-Instruct-abliterated_via_adapter/77eb2b0f-e3e3-474c-bb02-dabde2998ef0.json b/data/hfopenllm_v2/grimjim/Llama-3.1-8B-Instruct-abliterated_via_adapter/77eb2b0f-e3e3-474c-bb02-dabde2998ef0.json deleted file mode 100644 index b33246650..000000000 --- a/data/hfopenllm_v2/grimjim/Llama-3.1-8B-Instruct-abliterated_via_adapter/77eb2b0f-e3e3-474c-bb02-dabde2998ef0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Llama-3.1-8B-Instruct-abliterated_via_adapter/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Instruct-abliterated_via_adapter", - "id": "grimjim/Llama-3.1-8B-Instruct-abliterated_via_adapter", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Llama-3.1-Bonsaikraft-8B-Instruct/94d744be-5d28-490a-ba9a-8440cb97dce9.json 
b/data/hfopenllm_v2/grimjim/Llama-3.1-Bonsaikraft-8B-Instruct/94d744be-5d28-490a-ba9a-8440cb97dce9.json deleted file mode 100644 index 3c3c836ca..000000000 --- a/data/hfopenllm_v2/grimjim/Llama-3.1-Bonsaikraft-8B-Instruct/94d744be-5d28-490a-ba9a-8440cb97dce9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Llama-3.1-Bonsaikraft-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Bonsaikraft-8B-Instruct", - "id": "grimjim/Llama-3.1-Bonsaikraft-8B-Instruct", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Llama-Nephilim-Metamorphosis-v2-8B/2765061e-7506-4eb6-b63f-312f6290665a.json b/data/hfopenllm_v2/grimjim/Llama-Nephilim-Metamorphosis-v2-8B/2765061e-7506-4eb6-b63f-312f6290665a.json deleted file mode 100644 index ee870fb1c..000000000 --- 
a/data/hfopenllm_v2/grimjim/Llama-Nephilim-Metamorphosis-v2-8B/2765061e-7506-4eb6-b63f-312f6290665a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Llama-Nephilim-Metamorphosis-v2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Nephilim-Metamorphosis-v2-8B", - "id": "grimjim/Llama-Nephilim-Metamorphosis-v2-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5013 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B/167c937c-66c7-45a8-bbd9-97d98531bf7d.json b/data/hfopenllm_v2/grimjim/Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B/167c937c-66c7-45a8-bbd9-97d98531bf7d.json deleted file mode 100644 index 7a0c41480..000000000 --- a/data/hfopenllm_v2/grimjim/Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B/167c937c-66c7-45a8-bbd9-97d98531bf7d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B", - "id": "grimjim/Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3999 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magnolia-v1-Gemma2-8k-9B/9587c35c-1def-46e7-8642-7acb0340be5e.json b/data/hfopenllm_v2/grimjim/Magnolia-v1-Gemma2-8k-9B/9587c35c-1def-46e7-8642-7acb0340be5e.json deleted file mode 100644 index 51f562ec4..000000000 --- a/data/hfopenllm_v2/grimjim/Magnolia-v1-Gemma2-8k-9B/9587c35c-1def-46e7-8642-7acb0340be5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magnolia-v1-Gemma2-8k-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnolia-v1-Gemma2-8k-9B", - "id": "grimjim/Magnolia-v1-Gemma2-8k-9B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3531 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5589 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1684 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magnolia-v2-12B/1c9594fe-03d6-4ec1-9da5-99960da0dcd4.json b/data/hfopenllm_v2/grimjim/Magnolia-v2-12B/1c9594fe-03d6-4ec1-9da5-99960da0dcd4.json deleted file mode 100644 index 4765d94c9..000000000 --- a/data/hfopenllm_v2/grimjim/Magnolia-v2-12B/1c9594fe-03d6-4ec1-9da5-99960da0dcd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magnolia-v2-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnolia-v2-12B", - "id": "grimjim/Magnolia-v2-12B", 
- "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3506 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3601 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magnolia-v2-Gemma2-8k-9B/8ed2c4eb-bc72-4dde-a559-1afd1698d37d.json b/data/hfopenllm_v2/grimjim/Magnolia-v2-Gemma2-8k-9B/8ed2c4eb-bc72-4dde-a559-1afd1698d37d.json deleted file mode 100644 index 3274bce78..000000000 --- a/data/hfopenllm_v2/grimjim/Magnolia-v2-Gemma2-8k-9B/8ed2c4eb-bc72-4dde-a559-1afd1698d37d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magnolia-v2-Gemma2-8k-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnolia-v2-Gemma2-8k-9B", - "id": "grimjim/Magnolia-v2-Gemma2-8k-9B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7384 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6016 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magnolia-v3-12B/a2f9536a-9266-4aee-be90-d04f4dcbe53c.json b/data/hfopenllm_v2/grimjim/Magnolia-v3-12B/a2f9536a-9266-4aee-be90-d04f4dcbe53c.json deleted file mode 100644 index 0565ceb88..000000000 --- a/data/hfopenllm_v2/grimjim/Magnolia-v3-12B/a2f9536a-9266-4aee-be90-d04f4dcbe53c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magnolia-v3-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnolia-v3-12B", - "id": "grimjim/Magnolia-v3-12B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5327 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magnolia-v3-Gemma2-8k-9B/7f116aaa-3880-4e53-948a-4b06e0d26cff.json b/data/hfopenllm_v2/grimjim/Magnolia-v3-Gemma2-8k-9B/7f116aaa-3880-4e53-948a-4b06e0d26cff.json deleted file mode 100644 index 5518b4113..000000000 --- a/data/hfopenllm_v2/grimjim/Magnolia-v3-Gemma2-8k-9B/7f116aaa-3880-4e53-948a-4b06e0d26cff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magnolia-v3-Gemma2-8k-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnolia-v3-Gemma2-8k-9B", - "id": "grimjim/Magnolia-v3-Gemma2-8k-9B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6015 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2319 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4337 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magnolia-v4-12B/7cbe4516-2be2-421b-95f4-c9500ad64ca5.json b/data/hfopenllm_v2/grimjim/Magnolia-v4-12B/7cbe4516-2be2-421b-95f4-c9500ad64ca5.json deleted file mode 100644 index 53c5ccbfc..000000000 --- a/data/hfopenllm_v2/grimjim/Magnolia-v4-12B/7cbe4516-2be2-421b-95f4-c9500ad64ca5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magnolia-v4-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnolia-v4-12B", - "id": "grimjim/Magnolia-v4-12B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3418 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": 
{ - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magnolia-v5a-12B/07df565a-bc30-4a9d-b472-7a85f35938be.json b/data/hfopenllm_v2/grimjim/Magnolia-v5a-12B/07df565a-bc30-4a9d-b472-7a85f35938be.json deleted file mode 100644 index edba3c4ff..000000000 --- a/data/hfopenllm_v2/grimjim/Magnolia-v5a-12B/07df565a-bc30-4a9d-b472-7a85f35938be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magnolia-v5a-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnolia-v5a-12B", - "id": "grimjim/Magnolia-v5a-12B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5312 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3601 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magot-v1-Gemma2-8k-9B/7545f7db-10bb-4d97-9b3f-4346f4f26bad.json b/data/hfopenllm_v2/grimjim/Magot-v1-Gemma2-8k-9B/7545f7db-10bb-4d97-9b3f-4346f4f26bad.json deleted file mode 100644 index e97c90ea3..000000000 --- a/data/hfopenllm_v2/grimjim/Magot-v1-Gemma2-8k-9B/7545f7db-10bb-4d97-9b3f-4346f4f26bad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magot-v1-Gemma2-8k-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magot-v1-Gemma2-8k-9B", - "id": "grimjim/Magot-v1-Gemma2-8k-9B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2997 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6019 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4337 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magot-v2-Gemma2-8k-9B/47384f10-ac6a-4629-92db-86f01a441f7f.json b/data/hfopenllm_v2/grimjim/Magot-v2-Gemma2-8k-9B/47384f10-ac6a-4629-92db-86f01a441f7f.json deleted file mode 100644 index f21dbc7f0..000000000 --- a/data/hfopenllm_v2/grimjim/Magot-v2-Gemma2-8k-9B/47384f10-ac6a-4629-92db-86f01a441f7f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magot-v2-Gemma2-8k-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magot-v2-Gemma2-8k-9B", - "id": "grimjim/Magot-v2-Gemma2-8k-9B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7347 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5897 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { 
- "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4344 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4223 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/SauerHuatuoSkywork-o1-Llama-3.1-8B/3c9f022f-3e2b-48d6-acb9-07f066cfceb6.json b/data/hfopenllm_v2/grimjim/SauerHuatuoSkywork-o1-Llama-3.1-8B/3c9f022f-3e2b-48d6-acb9-07f066cfceb6.json deleted file mode 100644 index 8620f2a75..000000000 --- a/data/hfopenllm_v2/grimjim/SauerHuatuoSkywork-o1-Llama-3.1-8B/3c9f022f-3e2b-48d6-acb9-07f066cfceb6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_SauerHuatuoSkywork-o1-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerHuatuoSkywork-o1-Llama-3.1-8B", - "id": "grimjim/SauerHuatuoSkywork-o1-Llama-3.1-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5222 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v1-8B/1d851cfb-8624-4516-8204-85569c60dc67.json b/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v1-8B/1d851cfb-8624-4516-8204-85569c60dc67.json deleted file mode 100644 index a154af168..000000000 --- a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v1-8B/1d851cfb-8624-4516-8204-85569c60dc67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_llama-3-Nephilim-v1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Nephilim-v1-8B", - "id": "grimjim/llama-3-Nephilim-v1-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5132 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3796 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2-8B/a7990990-7498-4b74-a0aa-9c266910698e.json b/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2-8B/a7990990-7498-4b74-a0aa-9c266910698e.json deleted file mode 100644 index 961e8d5ae..000000000 --- a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2-8B/a7990990-7498-4b74-a0aa-9c266910698e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_llama-3-Nephilim-v2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Nephilim-v2-8B", - "id": "grimjim/llama-3-Nephilim-v2-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5048 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3895 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.3641 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2.1-8B/0b41d37e-0728-4575-9662-c150e2e29bd0.json b/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2.1-8B/0b41d37e-0728-4575-9662-c150e2e29bd0.json deleted file mode 100644 index 2e431db78..000000000 --- a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2.1-8B/0b41d37e-0728-4575-9662-c150e2e29bd0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_llama-3-Nephilim-v2.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Nephilim-v2.1-8B", - "id": "grimjim/llama-3-Nephilim-v2.1-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3895 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5095 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v3-8B/c565a7e9-bd1b-41a5-bff3-3a349553f4e8.json 
b/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v3-8B/c565a7e9-bd1b-41a5-bff3-3a349553f4e8.json deleted file mode 100644 index 123748d61..000000000 --- a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v3-8B/c565a7e9-bd1b-41a5-bff3-3a349553f4e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_llama-3-Nephilim-v3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Nephilim-v3-8B", - "id": "grimjim/llama-3-Nephilim-v3-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5013 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0952 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3989 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3612 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gupta-tanish/llama-7b-dpo-baseline/680a4507-755e-4014-877b-6032f0220270.json b/data/hfopenllm_v2/gupta-tanish/llama-7b-dpo-baseline/680a4507-755e-4014-877b-6032f0220270.json deleted file mode 100644 index 888dfc69c..000000000 --- 
a/data/hfopenllm_v2/gupta-tanish/llama-7b-dpo-baseline/680a4507-755e-4014-877b-6032f0220270.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gupta-tanish_llama-7b-dpo-baseline/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-7b-dpo-baseline", - "id": "gupta-tanish/llama-7b-dpo-baseline", - "developer": "gupta-tanish", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3897 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2028 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.1/5ace8dc6-e348-4267-bb4a-f71a335d074e.json b/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.1/5ace8dc6-e348-4267-bb4a-f71a335d074e.json deleted file mode 100644 index 711f9349b..000000000 --- a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.1/5ace8dc6-e348-4267-bb4a-f71a335d074e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gz987_qwen2.5-7b-cabs-v0.1/1770682486.623709", 
- "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-7b-cabs-v0.1", - "id": "gz987/qwen2.5-7b-cabs-v0.1", - "developer": "gz987", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7506 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5482 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4796 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.2/07549821-db51-4b77-980a-056131b5dd29.json b/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.2/07549821-db51-4b77-980a-056131b5dd29.json deleted file mode 100644 index 3ae09bf09..000000000 --- a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.2/07549821-db51-4b77-980a-056131b5dd29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gz987_qwen2.5-7b-cabs-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"qwen2.5-7b-cabs-v0.2", - "id": "gz987/qwen2.5-7b-cabs-v0.2", - "developer": "gz987", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7418 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4397 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.3/ff12a0a1-a913-441b-955c-bcbd50056acf.json b/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.3/ff12a0a1-a913-441b-955c-bcbd50056acf.json deleted file mode 100644 index 9b5948568..000000000 --- a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.3/ff12a0a1-a913-441b-955c-bcbd50056acf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gz987_qwen2.5-7b-cabs-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-7b-cabs-v0.3", - "id": "gz987/qwen2.5-7b-cabs-v0.3", - "developer": "gz987", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5494 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4932 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.4/947cfc2b-b73c-40eb-9e57-be5278776711.json b/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.4/947cfc2b-b73c-40eb-9e57-be5278776711.json deleted file mode 100644 index bc502245a..000000000 --- a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.4/947cfc2b-b73c-40eb-9e57-be5278776711.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gz987_qwen2.5-7b-cabs-v0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-7b-cabs-v0.4", - "id": "gz987/qwen2.5-7b-cabs-v0.4", - "developer": "gz987", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7583 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4849 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/h2oai/h2o-danube-1.8b-chat/53639078-c50a-4147-bab0-16993f1790b6.json b/data/hfopenllm_v2/h2oai/h2o-danube-1.8b-chat/53639078-c50a-4147-bab0-16993f1790b6.json deleted file mode 100644 index 47446d221..000000000 --- a/data/hfopenllm_v2/h2oai/h2o-danube-1.8b-chat/53639078-c50a-4147-bab0-16993f1790b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/h2oai_h2o-danube-1.8b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "h2o-danube-1.8b-chat", - "id": "h2oai/h2o-danube-1.8b-chat", - "developer": "h2oai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 1.831 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.322 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3989 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/h2oai/h2o-danube3-4b-base/b2cf96e0-382e-4200-a4a4-d66e8a188878.json b/data/hfopenllm_v2/h2oai/h2o-danube3-4b-base/b2cf96e0-382e-4200-a4a4-d66e8a188878.json deleted file mode 100644 index 0acd0b4f9..000000000 --- a/data/hfopenllm_v2/h2oai/h2o-danube3-4b-base/b2cf96e0-382e-4200-a4a4-d66e8a188878.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/h2oai_h2o-danube3-4b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "h2o-danube3-4b-base", - "id": "h2oai/h2o-danube3-4b-base", - "developer": "h2oai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.962 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2338 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - }, - { - "evaluation_name": "MATH 
Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/h2oai/h2o-danube3-4b-chat/d4ed3eb6-f569-4d4b-8da5-50eaaf824128.json b/data/hfopenllm_v2/h2oai/h2o-danube3-4b-chat/d4ed3eb6-f569-4d4b-8da5-50eaaf824128.json deleted file mode 100644 index c17f6d915..000000000 --- a/data/hfopenllm_v2/h2oai/h2o-danube3-4b-chat/d4ed3eb6-f569-4d4b-8da5-50eaaf824128.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/h2oai_h2o-danube3-4b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "h2o-danube3-4b-chat", - "id": "h2oai/h2o-danube3-4b-chat", - "developer": "h2oai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.962 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3629 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/h2oai/h2o-danube3-500m-chat/210f7063-e0d9-424d-94f4-3645e4e1b401.json b/data/hfopenllm_v2/h2oai/h2o-danube3-500m-chat/210f7063-e0d9-424d-94f4-3645e4e1b401.json deleted file mode 100644 index fd3e0551c..000000000 --- a/data/hfopenllm_v2/h2oai/h2o-danube3-500m-chat/210f7063-e0d9-424d-94f4-3645e4e1b401.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/h2oai_h2o-danube3-500m-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "h2o-danube3-500m-chat", - "id": "h2oai/h2o-danube3-500m-chat", - "developer": "h2oai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3035 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/h2oai/h2o-danube3.1-4b-chat/4ecd26d8-8416-4dba-8d53-96f4013cfef0.json b/data/hfopenllm_v2/h2oai/h2o-danube3.1-4b-chat/4ecd26d8-8416-4dba-8d53-96f4013cfef0.json deleted file mode 100644 index 6f2847856..000000000 --- a/data/hfopenllm_v2/h2oai/h2o-danube3.1-4b-chat/4ecd26d8-8416-4dba-8d53-96f4013cfef0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/h2oai_h2o-danube3.1-4b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "h2o-danube3.1-4b-chat", - "id": "h2oai/h2o-danube3.1-4b-chat", - "developer": "h2oai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.962 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3608 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4102 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/haoranxu/ALMA-13B-R/15712b7d-e69f-4a4f-b13c-4e79ce859399.json b/data/hfopenllm_v2/haoranxu/ALMA-13B-R/15712b7d-e69f-4a4f-b13c-4e79ce859399.json deleted file mode 100644 index b4507647e..000000000 --- a/data/hfopenllm_v2/haoranxu/ALMA-13B-R/15712b7d-e69f-4a4f-b13c-4e79ce859399.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/haoranxu_ALMA-13B-R/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ALMA-13B-R", - "id": "haoranxu/ALMA-13B-R", - "developer": "haoranxu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0039 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3457 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3528 - } - 
}, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-CPO-SimPO/9148c375-7c08-4c1c-82ed-5f935b2a4f04.json b/data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-CPO-SimPO/9148c375-7c08-4c1c-82ed-5f935b2a4f04.json deleted file mode 100644 index e8754a2a8..000000000 --- a/data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-CPO-SimPO/9148c375-7c08-4c1c-82ed-5f935b2a4f04.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/haoranxu_Llama-3-Instruct-8B-CPO-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-CPO-SimPO", - "id": "haoranxu/Llama-3-Instruct-8B-CPO-SimPO", - "developer": "haoranxu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7046 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5048 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3567 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-SimPO/fb93274b-b7d8-483a-a95d-96340535febc.json b/data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-SimPO/fb93274b-b7d8-483a-a95d-96340535febc.json deleted file mode 100644 index e99c8aeb7..000000000 --- a/data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-SimPO/fb93274b-b7d8-483a-a95d-96340535febc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/haoranxu_Llama-3-Instruct-8B-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SimPO", - "id": "haoranxu/Llama-3-Instruct-8B-SimPO", - "developer": "haoranxu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7347 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - } - ] -} \ No newline 
at end of file diff --git a/data/hfopenllm_v2/hatemmahmoud/qwen2.5-1.5b-sft-raft-grpo-hra-doc/0818b755-ec49-457c-8635-73f01816f30b.json b/data/hfopenllm_v2/hatemmahmoud/qwen2.5-1.5b-sft-raft-grpo-hra-doc/0818b755-ec49-457c-8635-73f01816f30b.json deleted file mode 100644 index 86fc861f4..000000000 --- a/data/hfopenllm_v2/hatemmahmoud/qwen2.5-1.5b-sft-raft-grpo-hra-doc/0818b755-ec49-457c-8635-73f01816f30b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hatemmahmoud_qwen2.5-1.5b-sft-raft-grpo-hra-doc/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-1.5b-sft-raft-grpo-hra-doc", - "id": "hatemmahmoud/qwen2.5-1.5b-sft-raft-grpo-hra-doc", - "developer": "hatemmahmoud", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4196 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.427 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2175 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2776 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v0.5/77962326-0160-49bd-9ef1-59b403b2bfce.json 
b/data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v0.5/77962326-0160-49bd-9ef1-59b403b2bfce.json deleted file mode 100644 index 4312be247..000000000 --- a/data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v0.5/77962326-0160-49bd-9ef1-59b403b2bfce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hon9kon9ize_CantoneseLLMChat-v0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CantoneseLLMChat-v0.5", - "id": "hon9kon9ize/CantoneseLLMChat-v0.5", - "developer": "hon9kon9ize", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.069 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3231 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4345 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4706 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2504 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v1.0-7B/272abbe5-8b61-442f-9860-d7411e7fec99.json b/data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v1.0-7B/272abbe5-8b61-442f-9860-d7411e7fec99.json deleted file mode 100644 index 265d4e3f0..000000000 --- 
a/data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v1.0-7B/272abbe5-8b61-442f-9860-d7411e7fec99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hon9kon9ize_CantoneseLLMChat-v1.0-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CantoneseLLMChat-v1.0-7B", - "id": "hon9kon9ize/CantoneseLLMChat-v1.0-7B", - "developer": "hon9kon9ize", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4455 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4866 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3785 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hongbai12/li-0.4-pre/14d617a8-18c6-40a7-a4ba-19cf5fc5f4e3.json b/data/hfopenllm_v2/hongbai12/li-0.4-pre/14d617a8-18c6-40a7-a4ba-19cf5fc5f4e3.json deleted file mode 100644 index c385d6d79..000000000 --- a/data/hfopenllm_v2/hongbai12/li-0.4-pre/14d617a8-18c6-40a7-a4ba-19cf5fc5f4e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hongbai12_li-0.4-pre/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "li-0.4-pre", - "id": "hongbai12/li-0.4-pre", - "developer": "hongbai12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4513 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5015 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Deepseek-qwen-modelstock-2B/ef7b5e6d-b5b7-4c7b-9781-6f90eb1ff5dd.json b/data/hfopenllm_v2/hotmailuser/Deepseek-qwen-modelstock-2B/ef7b5e6d-b5b7-4c7b-9781-6f90eb1ff5dd.json deleted file mode 100644 index 2ecbdfd24..000000000 --- a/data/hfopenllm_v2/hotmailuser/Deepseek-qwen-modelstock-2B/ef7b5e6d-b5b7-4c7b-9781-6f90eb1ff5dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Deepseek-qwen-modelstock-2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, 
- "model_info": { - "name": "Deepseek-qwen-modelstock-2B", - "id": "hotmailuser/Deepseek-qwen-modelstock-2B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2149 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3399 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1911 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Falcon3Slerp1-10B/1970e257-7c93-4342-9ff4-a96af21acc67.json b/data/hfopenllm_v2/hotmailuser/Falcon3Slerp1-10B/1970e257-7c93-4342-9ff4-a96af21acc67.json deleted file mode 100644 index 5c04c6bd5..000000000 --- a/data/hfopenllm_v2/hotmailuser/Falcon3Slerp1-10B/1970e257-7c93-4342-9ff4-a96af21acc67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Falcon3Slerp1-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3Slerp1-10B", - "id": "hotmailuser/Falcon3Slerp1-10B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.617 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2598 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4318 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Falcon3Slerp2-10B/15d71696-4b21-41ff-a4c6-0aea92fb844a.json b/data/hfopenllm_v2/hotmailuser/Falcon3Slerp2-10B/15d71696-4b21-41ff-a4c6-0aea92fb844a.json deleted file mode 100644 index 7fc89171e..000000000 --- a/data/hfopenllm_v2/hotmailuser/Falcon3Slerp2-10B/15d71696-4b21-41ff-a4c6-0aea92fb844a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Falcon3Slerp2-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3Slerp2-10B", - "id": "hotmailuser/Falcon3Slerp2-10B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6118 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2319 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Falcon3Slerp4-10B/ccb85394-5252-48d4-8980-8b3a6c67ab1a.json b/data/hfopenllm_v2/hotmailuser/Falcon3Slerp4-10B/ccb85394-5252-48d4-8980-8b3a6c67ab1a.json deleted file mode 100644 index ad19a7974..000000000 --- a/data/hfopenllm_v2/hotmailuser/Falcon3Slerp4-10B/ccb85394-5252-48d4-8980-8b3a6c67ab1a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Falcon3Slerp4-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3Slerp4-10B", - "id": "hotmailuser/Falcon3Slerp4-10B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6072 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6114 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2289 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/FalconSlerp-3B/ea9837ff-f4c7-4bb0-b2af-7ae26371baf0.json b/data/hfopenllm_v2/hotmailuser/FalconSlerp-3B/ea9837ff-f4c7-4bb0-b2af-7ae26371baf0.json deleted file mode 100644 index e5f0a213e..000000000 --- a/data/hfopenllm_v2/hotmailuser/FalconSlerp-3B/ea9837ff-f4c7-4bb0-b2af-7ae26371baf0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_FalconSlerp-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconSlerp-3B", - "id": "hotmailuser/FalconSlerp-3B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.228 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3989 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2968 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/FalconSlerp1-7B/fe9012a7-d07f-48d4-b460-eca256078d8b.json b/data/hfopenllm_v2/hotmailuser/FalconSlerp1-7B/fe9012a7-d07f-48d4-b460-eca256078d8b.json deleted file mode 100644 index 500ef8052..000000000 --- a/data/hfopenllm_v2/hotmailuser/FalconSlerp1-7B/fe9012a7-d07f-48d4-b460-eca256078d8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_FalconSlerp1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconSlerp1-7B", - "id": "hotmailuser/FalconSlerp1-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5355 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2379 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/FalconSlerp2-7B/8e8d2071-8e7d-4dad-8536-4698b2d00316.json b/data/hfopenllm_v2/hotmailuser/FalconSlerp2-7B/8e8d2071-8e7d-4dad-8536-4698b2d00316.json deleted file mode 100644 index 8e8a72c09..000000000 --- a/data/hfopenllm_v2/hotmailuser/FalconSlerp2-7B/8e8d2071-8e7d-4dad-8536-4698b2d00316.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_FalconSlerp2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconSlerp2-7B", - "id": "hotmailuser/FalconSlerp2-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5538 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2983 - } - }, - { - "evaluation_name": "GPQA", 
- "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/FalconSlerp3-10B/dbcb41be-9ed6-4244-ada8-77f363c3487e.json b/data/hfopenllm_v2/hotmailuser/FalconSlerp3-10B/dbcb41be-9ed6-4244-ada8-77f363c3487e.json deleted file mode 100644 index 3031ed1a6..000000000 --- a/data/hfopenllm_v2/hotmailuser/FalconSlerp3-10B/dbcb41be-9ed6-4244-ada8-77f363c3487e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_FalconSlerp3-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconSlerp3-10B", - "id": "hotmailuser/FalconSlerp3-10B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6002 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.606 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/FalconSlerp3-7B/e48e2d7e-6c14-4bb1-bd12-74d93a145ca3.json b/data/hfopenllm_v2/hotmailuser/FalconSlerp3-7B/e48e2d7e-6c14-4bb1-bd12-74d93a145ca3.json deleted file mode 100644 index 5165cca5b..000000000 --- a/data/hfopenllm_v2/hotmailuser/FalconSlerp3-7B/e48e2d7e-6c14-4bb1-bd12-74d93a145ca3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_FalconSlerp3-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconSlerp3-7B", - "id": "hotmailuser/FalconSlerp3-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6096 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5533 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4507 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/FalconSlerp4-7B/30c2d908-3eaf-408a-a2b5-301e0cd9e052.json b/data/hfopenllm_v2/hotmailuser/FalconSlerp4-7B/30c2d908-3eaf-408a-a2b5-301e0cd9e052.json deleted file mode 100644 index 7cb786f59..000000000 --- a/data/hfopenllm_v2/hotmailuser/FalconSlerp4-7B/30c2d908-3eaf-408a-a2b5-301e0cd9e052.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_FalconSlerp4-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconSlerp4-7B", - "id": "hotmailuser/FalconSlerp4-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6285 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/FalconSlerp6-7B/f7624d04-66d1-4c05-8c01-d015ecf8412c.json b/data/hfopenllm_v2/hotmailuser/FalconSlerp6-7B/f7624d04-66d1-4c05-8c01-d015ecf8412c.json deleted file mode 100644 index df78b9118..000000000 --- a/data/hfopenllm_v2/hotmailuser/FalconSlerp6-7B/f7624d04-66d1-4c05-8c01-d015ecf8412c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_FalconSlerp6-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconSlerp6-7B", - "id": "hotmailuser/FalconSlerp6-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6027 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2047 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4492 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Gemma2Crono-27B/511e4aad-1e5a-4515-9433-46989fc3945b.json b/data/hfopenllm_v2/hotmailuser/Gemma2Crono-27B/511e4aad-1e5a-4515-9433-46989fc3945b.json deleted file mode 100644 index fbd307ade..000000000 --- a/data/hfopenllm_v2/hotmailuser/Gemma2Crono-27B/511e4aad-1e5a-4515-9433-46989fc3945b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Gemma2Crono-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2Crono-27B", - "id": "hotmailuser/Gemma2Crono-27B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7086 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6505 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4567 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4633 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Gemma2SimPO-27B/863e71ec-03a4-47ed-8bc9-b064d5571162.json 
b/data/hfopenllm_v2/hotmailuser/Gemma2SimPO-27B/863e71ec-03a4-47ed-8bc9-b064d5571162.json deleted file mode 100644 index 710203d42..000000000 --- a/data/hfopenllm_v2/hotmailuser/Gemma2SimPO-27B/863e71ec-03a4-47ed-8bc9-b064d5571162.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Gemma2SimPO-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2SimPO-27B", - "id": "hotmailuser/Gemma2SimPO-27B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7222 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6413 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4642 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Gemma2atlas-27B/6a6dfcb4-192b-44ff-a34f-76b31bbf5ad3.json b/data/hfopenllm_v2/hotmailuser/Gemma2atlas-27B/6a6dfcb4-192b-44ff-a34f-76b31bbf5ad3.json deleted file mode 100644 index b057bd05c..000000000 --- a/data/hfopenllm_v2/hotmailuser/Gemma2atlas-27B/6a6dfcb4-192b-44ff-a34f-76b31bbf5ad3.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Gemma2atlas-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2atlas-27B", - "id": "hotmailuser/Gemma2atlas-27B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6545 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Gemma2magnum-27b/e0dbec0b-a154-448a-be23-ef9b764469ea.json b/data/hfopenllm_v2/hotmailuser/Gemma2magnum-27b/e0dbec0b-a154-448a-be23-ef9b764469ea.json deleted file mode 100644 index 29f7bb0c0..000000000 --- a/data/hfopenllm_v2/hotmailuser/Gemma2magnum-27b/e0dbec0b-a154-448a-be23-ef9b764469ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Gemma2magnum-27b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2magnum-27b", - "id": "hotmailuser/Gemma2magnum-27b", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5051 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4723 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp-8B/ecd91300-b0cf-48ce-9e5c-253a7991f90e.json b/data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp-8B/ecd91300-b0cf-48ce-9e5c-253a7991f90e.json deleted file mode 100644 index 2af872ca1..000000000 --- a/data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp-8B/ecd91300-b0cf-48ce-9e5c-253a7991f90e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Llama-Hermes-slerp-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Hermes-slerp-8B", - "id": "hotmailuser/Llama-Hermes-slerp-8B", - "developer": 
"hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.339 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4078 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp2-8B/e3df71f1-63e1-40f1-918d-07cb3ec939cf.json b/data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp2-8B/e3df71f1-63e1-40f1-918d-07cb3ec939cf.json deleted file mode 100644 index c653551a2..000000000 --- a/data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp2-8B/e3df71f1-63e1-40f1-918d-07cb3ec939cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Llama-Hermes-slerp2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Hermes-slerp2-8B", - "id": "hotmailuser/Llama-Hermes-slerp2-8B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4248 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/LlamaStock-8B/52066a23-9847-490e-90e3-57eee3c63276.json b/data/hfopenllm_v2/hotmailuser/LlamaStock-8B/52066a23-9847-490e-90e3-57eee3c63276.json deleted file mode 100644 index 8711f86d4..000000000 --- a/data/hfopenllm_v2/hotmailuser/LlamaStock-8B/52066a23-9847-490e-90e3-57eee3c63276.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_LlamaStock-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LlamaStock-8B", - "id": "hotmailuser/LlamaStock-8B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5329 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1699 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Mistral-modelstock-24B/91f15ba3-a062-4b01-8a61-6e51fdf5f8d4.json b/data/hfopenllm_v2/hotmailuser/Mistral-modelstock-24B/91f15ba3-a062-4b01-8a61-6e51fdf5f8d4.json deleted file mode 100644 index 656587aa6..000000000 --- a/data/hfopenllm_v2/hotmailuser/Mistral-modelstock-24B/91f15ba3-a062-4b01-8a61-6e51fdf5f8d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Mistral-modelstock-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-modelstock-24B", - "id": "hotmailuser/Mistral-modelstock-24B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4102 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Mistral-modelstock2-24B/323630ee-fbe0-49a7-aa11-816fde38ba2d.json b/data/hfopenllm_v2/hotmailuser/Mistral-modelstock2-24B/323630ee-fbe0-49a7-aa11-816fde38ba2d.json deleted file mode 100644 index 2a8edbdfe..000000000 --- a/data/hfopenllm_v2/hotmailuser/Mistral-modelstock2-24B/323630ee-fbe0-49a7-aa11-816fde38ba2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Mistral-modelstock2-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-modelstock2-24B", - "id": "hotmailuser/Mistral-modelstock2-24B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4318 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.6689 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2402 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4616 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5318 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Phi4-Slerp4-14B/e5c8f97d-1873-4c9d-8bed-50dc592543db.json b/data/hfopenllm_v2/hotmailuser/Phi4-Slerp4-14B/e5c8f97d-1873-4c9d-8bed-50dc592543db.json deleted file mode 100644 index 4377cee38..000000000 --- a/data/hfopenllm_v2/hotmailuser/Phi4-Slerp4-14B/e5c8f97d-1873-4c9d-8bed-50dc592543db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Phi4-Slerp4-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4-Slerp4-14B", - "id": "hotmailuser/Phi4-Slerp4-14B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0629 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6731 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3474 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Qwen2.5-HomerSlerp-7B/7ee2803c-b8f8-4156-8472-bab4baab8863.json b/data/hfopenllm_v2/hotmailuser/Qwen2.5-HomerSlerp-7B/7ee2803c-b8f8-4156-8472-bab4baab8863.json deleted file mode 100644 index 1e3e4c4b1..000000000 --- a/data/hfopenllm_v2/hotmailuser/Qwen2.5-HomerSlerp-7B/7ee2803c-b8f8-4156-8472-bab4baab8863.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Qwen2.5-HomerSlerp-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-HomerSlerp-7B", - "id": "hotmailuser/Qwen2.5-HomerSlerp-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5633 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4549 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenModelStock-1.8B/78573f63-3073-4be4-93a7-0ea00b1383fd.json b/data/hfopenllm_v2/hotmailuser/QwenModelStock-1.8B/78573f63-3073-4be4-93a7-0ea00b1383fd.json deleted file mode 100644 index 818c900fe..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenModelStock-1.8B/78573f63-3073-4be4-93a7-0ea00b1383fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenModelStock-1.8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenModelStock-1.8B", - "id": "hotmailuser/QwenModelStock-1.8B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4359 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2959 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenSlerp-14B/42da7295-d78d-49a4-9279-8406063240c4.json b/data/hfopenllm_v2/hotmailuser/QwenSlerp-14B/42da7295-d78d-49a4-9279-8406063240c4.json deleted file mode 100644 index 448439df9..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenSlerp-14B/42da7295-d78d-49a4-9279-8406063240c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenSlerp-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp-14B", - "id": "hotmailuser/QwenSlerp-14B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7025 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6491 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3837 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4634 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenSlerp-3B/b61c5735-53ca-4dda-a223-79921eee7f3e.json b/data/hfopenllm_v2/hotmailuser/QwenSlerp-3B/b61c5735-53ca-4dda-a223-79921eee7f3e.json deleted file mode 100644 index ab9d3d420..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenSlerp-3B/b61c5735-53ca-4dda-a223-79921eee7f3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenSlerp-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp-3B", - "id": "hotmailuser/QwenSlerp-3B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4334 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4892 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenSlerp-7B/310124ef-e33f-49de-83eb-e665a5143aaa.json b/data/hfopenllm_v2/hotmailuser/QwenSlerp-7B/310124ef-e33f-49de-83eb-e665a5143aaa.json deleted file mode 100644 index 14cc9dac6..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenSlerp-7B/310124ef-e33f-49de-83eb-e665a5143aaa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenSlerp-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp-7B", - "id": "hotmailuser/QwenSlerp-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5636 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4509 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenSlerp2-14B/c9b056df-8bbe-4959-ab44-85813157c95c.json b/data/hfopenllm_v2/hotmailuser/QwenSlerp2-14B/c9b056df-8bbe-4959-ab44-85813157c95c.json deleted file mode 100644 index 746637a2e..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenSlerp2-14B/c9b056df-8bbe-4959-ab44-85813157c95c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenSlerp2-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp2-14B", - "id": "hotmailuser/QwenSlerp2-14B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7037 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenSlerp2-3B/7a60385f-48dd-4926-8b66-3d42a1631db3.json 
b/data/hfopenllm_v2/hotmailuser/QwenSlerp2-3B/7a60385f-48dd-4926-8b66-3d42a1631db3.json deleted file mode 100644 index 4a1951fcf..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenSlerp2-3B/7a60385f-48dd-4926-8b66-3d42a1631db3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenSlerp2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp2-3B", - "id": "hotmailuser/QwenSlerp2-3B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4802 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2606 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4252 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenSlerp3-14B/da365c7b-74d0-4a9f-a8fd-cf4049ec4de6.json b/data/hfopenllm_v2/hotmailuser/QwenSlerp3-14B/da365c7b-74d0-4a9f-a8fd-cf4049ec4de6.json deleted file mode 100644 index 812599df3..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenSlerp3-14B/da365c7b-74d0-4a9f-a8fd-cf4049ec4de6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenSlerp3-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp3-14B", - "id": "hotmailuser/QwenSlerp3-14B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6632 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6267 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4305 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenSparse-7B/e2930715-b616-49a4-83bc-53e92fc3580f.json b/data/hfopenllm_v2/hotmailuser/QwenSparse-7B/e2930715-b616-49a4-83bc-53e92fc3580f.json deleted file mode 100644 index 8f33a3dbf..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenSparse-7B/e2930715-b616-49a4-83bc-53e92fc3580f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenSparse-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSparse-7B", - "id": "hotmailuser/QwenSparse-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1086 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2896 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3562 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenStock-0.5B/543f45e0-a158-4fdb-bbb1-8deb38f4515b.json b/data/hfopenllm_v2/hotmailuser/QwenStock-0.5B/543f45e0-a158-4fdb-bbb1-8deb38f4515b.json deleted file mode 100644 index be59d9fa4..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenStock-0.5B/543f45e0-a158-4fdb-bbb1-8deb38f4515b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenStock-0.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenStock-0.5B", - "id": "hotmailuser/QwenStock-0.5B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2049 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenStock-1.7B/b96a20e0-d044-4a66-8909-437aeaef569c.json b/data/hfopenllm_v2/hotmailuser/QwenStock-1.7B/b96a20e0-d044-4a66-8909-437aeaef569c.json deleted file mode 100644 index efbdd247d..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenStock-1.7B/b96a20e0-d044-4a66-8909-437aeaef569c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenStock-1.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenStock-1.7B", - "id": "hotmailuser/QwenStock-1.7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2955 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenStock1-14B/408742ff-4b21-46dc-b4d6-4c78d652d228.json b/data/hfopenllm_v2/hotmailuser/QwenStock1-14B/408742ff-4b21-46dc-b4d6-4c78d652d228.json deleted file mode 100644 index 2d5a4767d..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenStock1-14B/408742ff-4b21-46dc-b4d6-4c78d652d228.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenStock1-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenStock1-14B", - "id": "hotmailuser/QwenStock1-14B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6693 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6502 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/RombosBeagle-v2beta-MGS-32B/496a9fbe-376c-4546-bd90-b42f583924ce.json b/data/hfopenllm_v2/hotmailuser/RombosBeagle-v2beta-MGS-32B/496a9fbe-376c-4546-bd90-b42f583924ce.json deleted file mode 100644 index 3e888318d..000000000 --- a/data/hfopenllm_v2/hotmailuser/RombosBeagle-v2beta-MGS-32B/496a9fbe-376c-4546-bd90-b42f583924ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_RombosBeagle-v2beta-MGS-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RombosBeagle-v2beta-MGS-32B", - "id": "hotmailuser/RombosBeagle-v2beta-MGS-32B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5157 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7037 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5908 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huggyllama/llama-13b/f32c07b4-21a8-4cd2-91f8-f0f26d0b1b38.json b/data/hfopenllm_v2/huggyllama/llama-13b/f32c07b4-21a8-4cd2-91f8-f0f26d0b1b38.json deleted file mode 100644 index a7410ce47..000000000 --- a/data/hfopenllm_v2/huggyllama/llama-13b/f32c07b4-21a8-4cd2-91f8-f0f26d0b1b38.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huggyllama_llama-13b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-13b", - "id": "huggyllama/llama-13b", - "developer": "huggyllama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2411 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3462 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1952 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huggyllama/llama-65b/cc36cc37-0f41-42aa-8051-54cc135820ef.json b/data/hfopenllm_v2/huggyllama/llama-65b/cc36cc37-0f41-42aa-8051-54cc135820ef.json deleted file mode 100644 index 89a0f62be..000000000 --- a/data/hfopenllm_v2/huggyllama/llama-65b/cc36cc37-0f41-42aa-8051-54cc135820ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huggyllama_llama-65b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-65b", - "id": "huggyllama/llama-65b", - "developer": "huggyllama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 65.286 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2526 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4703 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3078 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huggyllama/llama-7b/20d3dac4-9f8c-431c-b20f-364dd860e37f.json b/data/hfopenllm_v2/huggyllama/llama-7b/20d3dac4-9f8c-431c-b20f-364dd860e37f.json deleted file mode 100644 index 18ff58949..000000000 --- a/data/hfopenllm_v2/huggyllama/llama-7b/20d3dac4-9f8c-431c-b20f-364dd860e37f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huggyllama_llama-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-7b", - "id": "huggyllama/llama-7b", - "developer": "huggyllama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2501 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1313 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huihui-ai/DeepSeek-R1-Distill-Qwen-14B-abliterated-v2/89022ea8-2a5b-4eba-8d7a-320ba13d30a4.json b/data/hfopenllm_v2/huihui-ai/DeepSeek-R1-Distill-Qwen-14B-abliterated-v2/89022ea8-2a5b-4eba-8d7a-320ba13d30a4.json deleted file mode 100644 index 3494d0b6e..000000000 --- a/data/hfopenllm_v2/huihui-ai/DeepSeek-R1-Distill-Qwen-14B-abliterated-v2/89022ea8-2a5b-4eba-8d7a-320ba13d30a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_DeepSeek-R1-Distill-Qwen-14B-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-abliterated-v2", - "id": "huihui-ai/DeepSeek-R1-Distill-Qwen-14B-abliterated-v2", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1915 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-7030/97bfd152-79c6-4c96-8d3e-588275339e41.json b/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-7030/97bfd152-79c6-4c96-8d3e-588275339e41.json deleted file mode 100644 index 20f0742c2..000000000 --- a/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-7030/97bfd152-79c6-4c96-8d3e-588275339e41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_QwQ-32B-Coder-Fusion-7030/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-32B-Coder-Fusion-7030", - "id": "huihui-ai/QwQ-32B-Coder-Fusion-7030", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3865 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2795 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-8020/93061947-2bcf-482e-ab22-38ef8ee33bcf.json b/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-8020/93061947-2bcf-482e-ab22-38ef8ee33bcf.json deleted file mode 100644 index 10b206b42..000000000 --- a/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-8020/93061947-2bcf-482e-ab22-38ef8ee33bcf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_QwQ-32B-Coder-Fusion-8020/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-32B-Coder-Fusion-8020", - "id": "huihui-ai/QwQ-32B-Coder-Fusion-8020", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6665 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4592 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-9010/8f65748b-1251-49f8-bfed-d1e4a937d5ba.json b/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-9010/8f65748b-1251-49f8-bfed-d1e4a937d5ba.json deleted file mode 100644 index 9d7bf036a..000000000 --- a/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-9010/8f65748b-1251-49f8-bfed-d1e4a937d5ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_QwQ-32B-Coder-Fusion-9010/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-32B-Coder-Fusion-9010", - "id": "huihui-ai/QwQ-32B-Coder-Fusion-9010", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6727 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.56 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huihui-ai/Qwen2.5-14B-Instruct-abliterated-v2/4f278881-69d3-42b5-b72c-ff8627a6ef44.json b/data/hfopenllm_v2/huihui-ai/Qwen2.5-14B-Instruct-abliterated-v2/4f278881-69d3-42b5-b72c-ff8627a6ef44.json deleted file mode 100644 index ca8850291..000000000 --- a/data/hfopenllm_v2/huihui-ai/Qwen2.5-14B-Instruct-abliterated-v2/4f278881-69d3-42b5-b72c-ff8627a6ef44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_Qwen2.5-14B-Instruct-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Instruct-abliterated-v2", - "id": "huihui-ai/Qwen2.5-14B-Instruct-abliterated-v2", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6324 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/huihui-ai/Qwen2.5-72B-Instruct-abliterated/d88e85c5-73df-46cc-9234-f0556592ad5a.json b/data/hfopenllm_v2/huihui-ai/Qwen2.5-72B-Instruct-abliterated/d88e85c5-73df-46cc-9234-f0556592ad5a.json deleted file mode 100644 index 716aaba17..000000000 --- a/data/hfopenllm_v2/huihui-ai/Qwen2.5-72B-Instruct-abliterated/d88e85c5-73df-46cc-9234-f0556592ad5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_Qwen2.5-72B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-72B-Instruct-abliterated", - "id": "huihui-ai/Qwen2.5-72B-Instruct-abliterated", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8593 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated-v2/44d2a20d-e867-4fa5-af3d-087f9c1b4067.json 
b/data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated-v2/44d2a20d-e867-4fa5-af3d-087f9c1b4067.json deleted file mode 100644 index da59c59b9..000000000 --- a/data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated-v2/44d2a20d-e867-4fa5-af3d-087f9c1b4067.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_Qwen2.5-7B-Instruct-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-abliterated-v2", - "id": "huihui-ai/Qwen2.5-7B-Instruct-abliterated-v2", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5377 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated/e83b3e7e-dc34-4b06-bcfe-95b3ba28aab4.json b/data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated/e83b3e7e-dc34-4b06-bcfe-95b3ba28aab4.json deleted file mode 100644 index 092cf89ac..000000000 --- 
a/data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated/e83b3e7e-dc34-4b06-bcfe-95b3ba28aab4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_Qwen2.5-7B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-abliterated", - "id": "huihui-ai/Qwen2.5-7B-Instruct-abliterated", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7546 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4577 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3967 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huu-ontocord/wide_3b_orpo_stage1.1-ss1-orpo3/44f2948c-4564-44cc-98d8-4f82a30e1f09.json b/data/hfopenllm_v2/huu-ontocord/wide_3b_orpo_stage1.1-ss1-orpo3/44f2948c-4564-44cc-98d8-4f82a30e1f09.json deleted file mode 100644 index a707ad657..000000000 --- a/data/hfopenllm_v2/huu-ontocord/wide_3b_orpo_stage1.1-ss1-orpo3/44f2948c-4564-44cc-98d8-4f82a30e1f09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", 
- "evaluation_id": "hfopenllm_v2/huu-ontocord_wide_3b_orpo_stage1.1-ss1-orpo3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_orpo_stage1.1-ss1-orpo3", - "id": "huu-ontocord/wide_3b_orpo_stage1.1-ss1-orpo3", - "developer": "huu-ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama31_8B_en_emo_v4/846cf1ff-62c3-44e7-b6dd-0135ec77451a.json b/data/hfopenllm_v2/iFaz/llama31_8B_en_emo_v4/846cf1ff-62c3-44e7-b6dd-0135ec77451a.json deleted file mode 100644 index 099d88b45..000000000 --- a/data/hfopenllm_v2/iFaz/llama31_8B_en_emo_v4/846cf1ff-62c3-44e7-b6dd-0135ec77451a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama31_8B_en_emo_v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama31_8B_en_emo_v4", - "id": "iFaz/llama31_8B_en_emo_v4", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "", - "params_billions": 4.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4916 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0884 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3643 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3049 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama32_1B_en_emo_v1/d2054469-b38b-4b1d-bd40-7324319f8eca.json b/data/hfopenllm_v2/iFaz/llama32_1B_en_emo_v1/d2054469-b38b-4b1d-bd40-7324319f8eca.json deleted file mode 100644 index 03f4a931f..000000000 --- a/data/hfopenllm_v2/iFaz/llama32_1B_en_emo_v1/d2054469-b38b-4b1d-bd40-7324319f8eca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama32_1B_en_emo_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama32_1B_en_emo_v1", - "id": "iFaz/llama32_1B_en_emo_v1", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.765 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1761 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_1000_stp/ce60608d-5b52-49d4-bbce-4b20e8272cef.json b/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_1000_stp/ce60608d-5b52-49d4-bbce-4b20e8272cef.json deleted file mode 100644 index d3b7b81e6..000000000 --- a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_1000_stp/ce60608d-5b52-49d4-bbce-4b20e8272cef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama32_3B_en_emo_1000_stp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama32_3B_en_emo_1000_stp", - "id": "iFaz/llama32_3B_en_emo_1000_stp", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.848 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7295 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4522 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1465 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_2000_stp/f177bb70-fb7c-4b57-965d-acbcb4936bfa.json b/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_2000_stp/f177bb70-fb7c-4b57-965d-acbcb4936bfa.json deleted file mode 100644 index 3a25847fc..000000000 --- a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_2000_stp/f177bb70-fb7c-4b57-965d-acbcb4936bfa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama32_3B_en_emo_2000_stp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama32_3B_en_emo_2000_stp", - "id": "iFaz/llama32_3B_en_emo_2000_stp", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.848 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.7369 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4535 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_300_stp/a5b2ab3d-1f12-4a5a-a110-2514185568b6.json b/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_300_stp/a5b2ab3d-1f12-4a5a-a110-2514185568b6.json deleted file mode 100644 index b84be7a8b..000000000 --- a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_300_stp/a5b2ab3d-1f12-4a5a-a110-2514185568b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama32_3B_en_emo_300_stp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama32_3B_en_emo_300_stp", - "id": "iFaz/llama32_3B_en_emo_300_stp", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.848 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7256 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3148 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_5000_stp/63b887a1-a0b9-46db-a563-b9bd67a0805a.json b/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_5000_stp/63b887a1-a0b9-46db-a563-b9bd67a0805a.json deleted file mode 100644 index 2c79dd41d..000000000 --- a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_5000_stp/63b887a1-a0b9-46db-a563-b9bd67a0805a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama32_3B_en_emo_5000_stp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama32_3B_en_emo_5000_stp", - "id": "iFaz/llama32_3B_en_emo_5000_stp", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.848 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4568 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 
5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3067 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v2/92d122f7-f29d-49e3-99da-bf20edf377a2.json b/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v2/92d122f7-f29d-49e3-99da-bf20edf377a2.json deleted file mode 100644 index bc0aeebe3..000000000 --- a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v2/92d122f7-f29d-49e3-99da-bf20edf377a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama32_3B_en_emo_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama32_3B_en_emo_v2", - "id": "iFaz/llama32_3B_en_emo_v2", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.848 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v3/a0b71344-f3a8-4ad0-87c5-6393148488b1.json b/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v3/a0b71344-f3a8-4ad0-87c5-6393148488b1.json deleted file mode 100644 index 6e61f1e41..000000000 --- a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v3/a0b71344-f3a8-4ad0-87c5-6393148488b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama32_3B_en_emo_v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama32_3B_en_emo_v3", - "id": "iFaz/llama32_3B_en_emo_v3", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.848 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5759 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4301 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3553 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iRyanBell/ARC1-II/821ff784-c48a-4623-9fb5-b77b7114b625.json b/data/hfopenllm_v2/iRyanBell/ARC1-II/821ff784-c48a-4623-9fb5-b77b7114b625.json deleted file mode 100644 index 2ccabcdfe..000000000 --- a/data/hfopenllm_v2/iRyanBell/ARC1-II/821ff784-c48a-4623-9fb5-b77b7114b625.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iRyanBell_ARC1-II/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ARC1-II", - "id": "iRyanBell/ARC1-II", - "developer": "iRyanBell", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1708 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4913 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1686 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iRyanBell/ARC1/ed251513-4807-4e31-bc8e-3ab0217ae4f3.json b/data/hfopenllm_v2/iRyanBell/ARC1/ed251513-4807-4e31-bc8e-3ab0217ae4f3.json deleted file mode 100644 index 0ff922243..000000000 --- a/data/hfopenllm_v2/iRyanBell/ARC1/ed251513-4807-4e31-bc8e-3ab0217ae4f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iRyanBell_ARC1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ARC1", - "id": "iRyanBell/ARC1", - "developer": "iRyanBell", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4411 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4903 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibivibiv/colossus_120b/e7fa3baa-07b4-4f10-aa9c-8424d8fea303.json b/data/hfopenllm_v2/ibivibiv/colossus_120b/e7fa3baa-07b4-4f10-aa9c-8424d8fea303.json deleted file mode 100644 index 7022b74fb..000000000 --- a/data/hfopenllm_v2/ibivibiv/colossus_120b/e7fa3baa-07b4-4f10-aa9c-8424d8fea303.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibivibiv_colossus_120b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "colossus_120b", - "id": "ibivibiv/colossus_120b", - "developer": "ibivibiv", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 117.749 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4276 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6061 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4733 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3961 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/ibivibiv/multimaster-7b-v6/11dfd131-00bf-4561-a913-f1c0cb15bf9c.json b/data/hfopenllm_v2/ibivibiv/multimaster-7b-v6/11dfd131-00bf-4561-a913-f1c0cb15bf9c.json deleted file mode 100644 index 6b86a8133..000000000 --- a/data/hfopenllm_v2/ibivibiv/multimaster-7b-v6/11dfd131-00bf-4561-a913-f1c0cb15bf9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibivibiv_multimaster-7b-v6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "multimaster-7b-v6", - "id": "ibivibiv/multimaster-7b-v6", - "developer": "ibivibiv", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 35.428 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-base/3ba34f38-2340-407f-a7b5-82749f8a0ee6.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-base/3ba34f38-2340-407f-a7b5-82749f8a0ee6.json deleted file mode 100644 index f51a2ca74..000000000 --- 
a/data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-base/3ba34f38-2340-407f-a7b5-82749f8a0ee6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.0-1b-a400m-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-1b-a400m-base", - "id": "ibm-granite/granite-3.0-1b-a400m-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 1.335 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-instruct/91b9649b-bdf6-4b15-a038-47edc2e79ef6.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-instruct/91b9649b-bdf6-4b15-a038-47edc2e79ef6.json deleted file mode 100644 index d041957dc..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-instruct/91b9649b-bdf6-4b15-a038-47edc2e79ef6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/ibm-granite_granite-3.0-1b-a400m-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-1b-a400m-instruct", - "id": "ibm-granite/granite-3.0-1b-a400m-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 1.335 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3332 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3623 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1244 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-2b-base/24670e63-32e1-4c5d-82fe-0d0c45a4e165.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-2b-base/24670e63-32e1-4c5d-82fe-0d0c45a4e165.json deleted file mode 100644 index 875fbc380..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.0-2b-base/24670e63-32e1-4c5d-82fe-0d0c45a4e165.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.0-2b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", 
- "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-2b-base", - "id": "ibm-granite/granite-3.0-2b-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 2.634 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-2b-instruct/198d1441-1d13-468a-a998-c8cf9f1e7a57.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-2b-instruct/198d1441-1d13-468a-a998-c8cf9f1e7a57.json deleted file mode 100644 index 3d240c282..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.0-2b-instruct/198d1441-1d13-468a-a998-c8cf9f1e7a57.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.0-2b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-2b-instruct", - "id": "ibm-granite/granite-3.0-2b-instruct", - "developer": 
"ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 2.634 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2814 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-base/e9eb1499-835c-4a70-b531-4be5a9718c34.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-base/e9eb1499-835c-4a70-b531-4be5a9718c34.json deleted file mode 100644 index 226473a3b..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-base/e9eb1499-835c-4a70-b531-4be5a9718c34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.0-3b-a800m-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-3b-a800m-base", - "id": "ibm-granite/granite-3.0-3b-a800m-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 3.374 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2732 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3667 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1891 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-instruct/b1fd95ad-767d-4c13-a936-00b08c74ca3d.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-instruct/b1fd95ad-767d-4c13-a936-00b08c74ca3d.json deleted file mode 100644 index b7c39baf2..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-instruct/b1fd95ad-767d-4c13-a936-00b08c74ca3d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.0-3b-a800m-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-3b-a800m-instruct", - "id": "ibm-granite/granite-3.0-3b-a800m-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 3.374 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-8b-base/f87bd357-535e-4450-b01d-b41e1b7571e0.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-8b-base/f87bd357-535e-4450-b01d-b41e1b7571e0.json deleted file mode 100644 index fba09f2d6..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.0-8b-base/f87bd357-535e-4450-b01d-b41e1b7571e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.0-8b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-8b-base", - "id": "ibm-granite/granite-3.0-8b-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 8.171 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4583 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4944 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3313 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-8b-instruct/300fd27e-4dce-441f-91da-f38bd14ffe5e.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-8b-instruct/300fd27e-4dce-441f-91da-f38bd14ffe5e.json deleted file mode 100644 index b5c6943a0..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.0-8b-instruct/300fd27e-4dce-441f-91da-f38bd14ffe5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.0-8b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-8b-instruct", - "id": "ibm-granite/granite-3.0-8b-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 8.171 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5192 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.142 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3457 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-base/1fd9a2e5-856f-4303-8ac1-611311f3e7b5.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-base/1fd9a2e5-856f-4303-8ac1-611311f3e7b5.json deleted file mode 100644 index 81e9a2dd8..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-base/1fd9a2e5-856f-4303-8ac1-611311f3e7b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-1b-a400m-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-1b-a400m-base", - "id": "ibm-granite/granite-3.1-1b-a400m-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteMoeForCausalLM", - "params_billions": 1.335 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2519 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-instruct/4c34d5c6-af1b-4519-8d08-67bd837e9b97.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-instruct/4c34d5c6-af1b-4519-8d08-67bd837e9b97.json deleted file mode 100644 index 18f994f0f..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-instruct/4c34d5c6-af1b-4519-8d08-67bd837e9b97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-1b-a400m-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-1b-a400m-instruct", - "id": "ibm-granite/granite-3.1-1b-a400m-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GraniteMoeForCausalLM", - "params_billions": 1.335 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1217 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-2b-base/ddc27df7-1c4c-4563-92b2-5a39380423a8.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-2b-base/ddc27df7-1c4c-4563-92b2-5a39380423a8.json deleted file mode 100644 index d36351d16..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-2b-base/ddc27df7-1c4c-4563-92b2-5a39380423a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-2b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-2b-base", - "id": "ibm-granite/granite-3.1-2b-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 2.534 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3522 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2251 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-2b-instruct/3e606ef8-9caa-43d4-81d6-8eae9936ab4c.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-2b-instruct/3e606ef8-9caa-43d4-81d6-8eae9936ab4c.json deleted file mode 100644 index 8f8fd088d..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-2b-instruct/3e606ef8-9caa-43d4-81d6-8eae9936ab4c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-2b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-2b-instruct", - "id": "ibm-granite/granite-3.1-2b-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GraniteForCausalLM", - "params_billions": 2.534 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6286 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4409 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1526 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3605 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-base/b9053559-3b90-4de0-981a-dbb49db38eb5.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-base/b9053559-3b90-4de0-981a-dbb49db38eb5.json deleted file mode 100644 index d42475062..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-base/b9053559-3b90-4de0-981a-dbb49db38eb5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-3b-a800m-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-3b-a800m-base", - "id": "ibm-granite/granite-3.1-3b-a800m-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteMoeForCausalLM", - "params_billions": 3.299 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2996 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3628 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1793 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-instruct/cea89bc6-b1a1-4b67-a136-45e097563a5b.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-instruct/cea89bc6-b1a1-4b67-a136-45e097563a5b.json deleted file mode 100644 index cd1998b51..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-instruct/cea89bc6-b1a1-4b67-a136-45e097563a5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-3b-a800m-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-3b-a800m-instruct", - "id": "ibm-granite/granite-3.1-3b-a800m-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GraniteMoeForCausalLM", - "params_billions": 3.299 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5516 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4009 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2148 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-8b-base/5eb16113-7d0d-47a0-91d8-ec7dab35efdd.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-8b-base/5eb16113-7d0d-47a0-91d8-ec7dab35efdd.json deleted file mode 100644 index 0a47fdba2..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-8b-base/5eb16113-7d0d-47a0-91d8-ec7dab35efdd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-8b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-8b-base", - "id": "ibm-granite/granite-3.1-8b-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GraniteForCausalLM", - "params_billions": 8.171 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4221 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4777 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", 
- "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3232 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-8b-instruct/45aa6545-d20a-4dfb-a8a6-01f2fd34c9f5.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-8b-instruct/45aa6545-d20a-4dfb-a8a6-01f2fd34c9f5.json deleted file mode 100644 index 5014bf7fd..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-8b-instruct/45aa6545-d20a-4dfb-a8a6-01f2fd34c9f5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-8b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-8b-instruct", - "id": "ibm-granite/granite-3.1-8b-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GraniteForCausalLM", - "params_billions": 8.171 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5364 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4707 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.2-2b-instruct/c94079d1-d8b1-4198-8129-8c5a11c310ca.json b/data/hfopenllm_v2/ibm-granite/granite-3.2-2b-instruct/c94079d1-d8b1-4198-8129-8c5a11c310ca.json deleted file mode 100644 index b97bdbfbb..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.2-2b-instruct/c94079d1-d8b1-4198-8129-8c5a11c310ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.2-2b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.2-2b-instruct", - "id": "ibm-granite/granite-3.2-2b-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 2.534 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6152 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1443 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3646 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2783 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/ibm-granite/granite-3.2-8b-instruct/cb45306a-096c-4ed5-a028-6d720b26afe9.json b/data/hfopenllm_v2/ibm-granite/granite-3.2-8b-instruct/cb45306a-096c-4ed5-a028-6d720b26afe9.json deleted file mode 100644 index 15fc669a2..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.2-8b-instruct/cb45306a-096c-4ed5-a028-6d720b26afe9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.2-8b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.2-8b-instruct", - "id": "ibm-granite/granite-3.2-8b-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 8.171 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5402 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2379 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3512 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-7b-base/f301908e-474b-4ba2-a873-610ca1b6c2bd.json b/data/hfopenllm_v2/ibm-granite/granite-7b-base/f301908e-474b-4ba2-a873-610ca1b6c2bd.json deleted file mode 100644 
index 678678ab3..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-7b-base/f301908e-474b-4ba2-a873-610ca1b6c2bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-7b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-7b-base", - "id": "ibm-granite/granite-7b-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2414 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.348 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1834 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-7b-instruct/06f5865d-a62a-48da-b33f-486fe29e3685.json b/data/hfopenllm_v2/ibm-granite/granite-7b-instruct/06f5865d-a62a-48da-b33f-486fe29e3685.json deleted file mode 100644 index d82e7bb12..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-7b-instruct/06f5865d-a62a-48da-b33f-486fe29e3685.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/ibm-granite_granite-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-7b-instruct", - "id": "ibm-granite/granite-7b-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2972 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2286 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm/PowerLM-3b/4f952c51-91dc-446e-bda1-43ed66e1ca3e.json b/data/hfopenllm_v2/ibm/PowerLM-3b/4f952c51-91dc-446e-bda1-43ed66e1ca3e.json deleted file mode 100644 index fb5d9a315..000000000 --- a/data/hfopenllm_v2/ibm/PowerLM-3b/4f952c51-91dc-446e-bda1-43ed66e1ca3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm_PowerLM-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "PowerLM-3b", - "id": "ibm/PowerLM-3b", - "developer": "ibm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 3.512 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2016 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm/merlinite-7b/dcba3a6f-8f4f-49f6-af74-541de16be435.json b/data/hfopenllm_v2/ibm/merlinite-7b/dcba3a6f-8f4f-49f6-af74-541de16be435.json deleted file mode 100644 index 240a78e34..000000000 --- a/data/hfopenllm_v2/ibm/merlinite-7b/dcba3a6f-8f4f-49f6-af74-541de16be435.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm_merlinite-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merlinite-7b", - "id": "ibm/merlinite-7b", - "developer": "ibm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2499 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.15-02.10-RP/b5d39bcb-dab4-4880-9cb1-68dbd20a3ce5.json b/data/hfopenllm_v2/icefog72/Ice0.15-02.10-RP/b5d39bcb-dab4-4880-9cb1-68dbd20a3ce5.json deleted file mode 100644 index 920868959..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.15-02.10-RP/b5d39bcb-dab4-4880-9cb1-68dbd20a3ce5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.15-02.10-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.15-02.10-RP", - "id": "icefog72/Ice0.15-02.10-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4976 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.16-02.10-RP/1e597e9b-4e75-4981-842b-dad6f1c15ed7.json b/data/hfopenllm_v2/icefog72/Ice0.16-02.10-RP/1e597e9b-4e75-4981-842b-dad6f1c15ed7.json deleted file mode 100644 index 6d89ee60d..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.16-02.10-RP/1e597e9b-4e75-4981-842b-dad6f1c15ed7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.16-02.10-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.16-02.10-RP", - "id": "icefog72/Ice0.16-02.10-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5069 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4946 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4334 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.17-03.10-RP/18752dc4-76d1-40dc-9f43-62b8087b7a88.json b/data/hfopenllm_v2/icefog72/Ice0.17-03.10-RP/18752dc4-76d1-40dc-9f43-62b8087b7a88.json deleted file mode 100644 index 1e5cdb6bc..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.17-03.10-RP/18752dc4-76d1-40dc-9f43-62b8087b7a88.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.17-03.10-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.17-03.10-RP", - "id": "icefog72/Ice0.17-03.10-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4334 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3085 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.27-06.11-RP/fa30c36e-20f1-41ee-a59d-0044f2b76dfb.json b/data/hfopenllm_v2/icefog72/Ice0.27-06.11-RP/fa30c36e-20f1-41ee-a59d-0044f2b76dfb.json deleted file mode 100644 index ce00faedd..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.27-06.11-RP/fa30c36e-20f1-41ee-a59d-0044f2b76dfb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.27-06.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.27-06.11-RP", - "id": "icefog72/Ice0.27-06.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4918 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.29-06.11-RP/5391ae8f-41b0-41cb-9365-b5cb7649c8b7.json b/data/hfopenllm_v2/icefog72/Ice0.29-06.11-RP/5391ae8f-41b0-41cb-9365-b5cb7649c8b7.json deleted file mode 100644 index 8a1fccbff..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.29-06.11-RP/5391ae8f-41b0-41cb-9365-b5cb7649c8b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.29-06.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.29-06.11-RP", - "id": "icefog72/Ice0.29-06.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4861 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5088 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.31-08.11-RP/a95ab4cf-456f-4b3d-9bab-2b755649758d.json b/data/hfopenllm_v2/icefog72/Ice0.31-08.11-RP/a95ab4cf-456f-4b3d-9bab-2b755649758d.json deleted file mode 100644 index b2b7ce69b..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.31-08.11-RP/a95ab4cf-456f-4b3d-9bab-2b755649758d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.31-08.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.31-08.11-RP", - "id": "icefog72/Ice0.31-08.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5146 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.32-10.11-RP/9840baa9-2ddf-4dd9-b3b0-3ec3075089bc.json b/data/hfopenllm_v2/icefog72/Ice0.32-10.11-RP/9840baa9-2ddf-4dd9-b3b0-3ec3075089bc.json deleted file mode 100644 index c07a2e5fe..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.32-10.11-RP/9840baa9-2ddf-4dd9-b3b0-3ec3075089bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.32-10.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.32-10.11-RP", - "id": "icefog72/Ice0.32-10.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4915 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5048 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.34b-14.11-RP/26ff113c-95ca-4716-83f7-4792b46be246.json b/data/hfopenllm_v2/icefog72/Ice0.34b-14.11-RP/26ff113c-95ca-4716-83f7-4792b46be246.json deleted file mode 100644 index e450135ff..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.34b-14.11-RP/26ff113c-95ca-4716-83f7-4792b46be246.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.34b-14.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.34b-14.11-RP", - "id": "icefog72/Ice0.34b-14.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5067 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.442 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.34n-14.11-RP/285e1d08-15a0-4d8b-a844-e4cad923ea9b.json b/data/hfopenllm_v2/icefog72/Ice0.34n-14.11-RP/285e1d08-15a0-4d8b-a844-e4cad923ea9b.json deleted file mode 100644 index 8fc9467cf..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.34n-14.11-RP/285e1d08-15a0-4d8b-a844-e4cad923ea9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.34n-14.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.34n-14.11-RP", - "id": "icefog72/Ice0.34n-14.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5091 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.37-18.11-RP/0462269d-94a3-4991-9af5-e55592f344e5.json 
b/data/hfopenllm_v2/icefog72/Ice0.37-18.11-RP/0462269d-94a3-4991-9af5-e55592f344e5.json deleted file mode 100644 index 11c77c634..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.37-18.11-RP/0462269d-94a3-4991-9af5-e55592f344e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.37-18.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.37-18.11-RP", - "id": "icefog72/Ice0.37-18.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4972 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.38-19.11-RP/c47c4cd6-90b6-42df-a3b9-4fc8f1b3c980.json b/data/hfopenllm_v2/icefog72/Ice0.38-19.11-RP/c47c4cd6-90b6-42df-a3b9-4fc8f1b3c980.json deleted file mode 100644 index d940d4450..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.38-19.11-RP/c47c4cd6-90b6-42df-a3b9-4fc8f1b3c980.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.38-19.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.38-19.11-RP", - "id": "icefog72/Ice0.38-19.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5101 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.39-19.11-RP/0fecafe4-f8f0-4f97-ab2d-589a3856e1af.json b/data/hfopenllm_v2/icefog72/Ice0.39-19.11-RP/0fecafe4-f8f0-4f97-ab2d-589a3856e1af.json deleted file mode 100644 index 07a2c60d3..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.39-19.11-RP/0fecafe4-f8f0-4f97-ab2d-589a3856e1af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.39-19.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.39-19.11-RP", - "id": "icefog72/Ice0.39-19.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.40-20.11-RP/4b5529b9-0800-4cd6-b720-a905ab5e6c9a.json b/data/hfopenllm_v2/icefog72/Ice0.40-20.11-RP/4b5529b9-0800-4cd6-b720-a905ab5e6c9a.json deleted file mode 100644 index ef805d1b3..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.40-20.11-RP/4b5529b9-0800-4cd6-b720-a905ab5e6c9a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.40-20.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.40-20.11-RP", - "id": "icefog72/Ice0.40-20.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3099 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.41-22.11-RP/84783e4d-5eed-474d-9463-a01a0890850e.json b/data/hfopenllm_v2/icefog72/Ice0.41-22.11-RP/84783e4d-5eed-474d-9463-a01a0890850e.json deleted file mode 100644 index baa053681..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.41-22.11-RP/84783e4d-5eed-474d-9463-a01a0890850e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.41-22.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.41-22.11-RP", - "id": "icefog72/Ice0.41-22.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4723 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2618 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.50-16.01-RP/d9fe39c5-24a5-4240-bfc9-59860fcb3911.json b/data/hfopenllm_v2/icefog72/Ice0.50-16.01-RP/d9fe39c5-24a5-4240-bfc9-59860fcb3911.json deleted file mode 100644 index eb81cb8cd..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.50-16.01-RP/d9fe39c5-24a5-4240-bfc9-59860fcb3911.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.50-16.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.50-16.01-RP", - "id": "icefog72/Ice0.50-16.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4385 - } - }, - { - "evaluation_name": "BBH", - "source_data": 
{ - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3069 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.50.1-16.01-RP/2ddf850e-36dc-41b2-92da-e2b45d1544c6.json b/data/hfopenllm_v2/icefog72/Ice0.50.1-16.01-RP/2ddf850e-36dc-41b2-92da-e2b45d1544c6.json deleted file mode 100644 index bcbafb408..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.50.1-16.01-RP/2ddf850e-36dc-41b2-92da-e2b45d1544c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.50.1-16.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.50.1-16.01-RP", - "id": "icefog72/Ice0.50.1-16.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4829 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4327 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.51-16.01-RP/b10a9284-fa5e-4a4e-8240-edc98cea6d9c.json b/data/hfopenllm_v2/icefog72/Ice0.51-16.01-RP/b10a9284-fa5e-4a4e-8240-edc98cea6d9c.json deleted file mode 100644 index a158fdd3f..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.51-16.01-RP/b10a9284-fa5e-4a4e-8240-edc98cea6d9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.51-16.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.51-16.01-RP", - "id": "icefog72/Ice0.51-16.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4431 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.51.1-16.01-RP/2c51bd1d-ebe8-4de9-9749-5f42f7ba3d5a.json b/data/hfopenllm_v2/icefog72/Ice0.51.1-16.01-RP/2c51bd1d-ebe8-4de9-9749-5f42f7ba3d5a.json deleted file mode 100644 index 1cd3c840d..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.51.1-16.01-RP/2c51bd1d-ebe8-4de9-9749-5f42f7ba3d5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.51.1-16.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.51.1-16.01-RP", - "id": "icefog72/Ice0.51.1-16.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4573 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5121 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { 
- "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.52-16.01-RP/425e6f1e-50dd-444f-b0da-5a0c47d5bf06.json b/data/hfopenllm_v2/icefog72/Ice0.52-16.01-RP/425e6f1e-50dd-444f-b0da-5a0c47d5bf06.json deleted file mode 100644 index 10d868dab..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.52-16.01-RP/425e6f1e-50dd-444f-b0da-5a0c47d5bf06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.52-16.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.52-16.01-RP", - "id": "icefog72/Ice0.52-16.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4503 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.308 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.52.1-16.01-RP/7e1fcf4e-9f64-4112-934c-4808f07d32b2.json b/data/hfopenllm_v2/icefog72/Ice0.52.1-16.01-RP/7e1fcf4e-9f64-4112-934c-4808f07d32b2.json deleted file mode 100644 index 51094c005..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.52.1-16.01-RP/7e1fcf4e-9f64-4112-934c-4808f07d32b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.52.1-16.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.52.1-16.01-RP", - "id": "icefog72/Ice0.52.1-16.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5106 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.53-16.01-RP/d3666566-09dc-4d53-9996-2301c6fb2721.json b/data/hfopenllm_v2/icefog72/Ice0.53-16.01-RP/d3666566-09dc-4d53-9996-2301c6fb2721.json deleted file mode 100644 index 2b83a20cc..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.53-16.01-RP/d3666566-09dc-4d53-9996-2301c6fb2721.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.53-16.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.53-16.01-RP", - "id": "icefog72/Ice0.53-16.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4741 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5102 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4327 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.313 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.54-17.01-RP/36e5efb9-e3f0-4903-a9f1-3d51453bfdc4.json b/data/hfopenllm_v2/icefog72/Ice0.54-17.01-RP/36e5efb9-e3f0-4903-a9f1-3d51453bfdc4.json deleted file mode 100644 index aac00db76..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.54-17.01-RP/36e5efb9-e3f0-4903-a9f1-3d51453bfdc4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.54-17.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.54-17.01-RP", - "id": "icefog72/Ice0.54-17.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4853 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2326 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.55-17.01-RP/a6dba337-81d2-40c6-89c2-aee6de82282e.json b/data/hfopenllm_v2/icefog72/Ice0.55-17.01-RP/a6dba337-81d2-40c6-89c2-aee6de82282e.json deleted file mode 100644 index d93982f8a..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.55-17.01-RP/a6dba337-81d2-40c6-89c2-aee6de82282e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.55-17.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.55-17.01-RP", - "id": "icefog72/Ice0.55-17.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5077 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.57-17.01-RP/e44b8d9a-f270-45c8-b126-6a8911c35436.json b/data/hfopenllm_v2/icefog72/Ice0.57-17.01-RP/e44b8d9a-f270-45c8-b126-6a8911c35436.json deleted file mode 100644 index 100730827..000000000 --- 
a/data/hfopenllm_v2/icefog72/Ice0.57-17.01-RP/e44b8d9a-f270-45c8-b126-6a8911c35436.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.57-17.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.57-17.01-RP", - "id": "icefog72/Ice0.57-17.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5152 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.60-18.01-RP/44d5e1ac-45d5-42aa-b9fa-f18112cf6676.json b/data/hfopenllm_v2/icefog72/Ice0.60-18.01-RP/44d5e1ac-45d5-42aa-b9fa-f18112cf6676.json deleted file mode 100644 index 79e0d8058..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.60-18.01-RP/44d5e1ac-45d5-42aa-b9fa-f18112cf6676.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.60-18.01-RP/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.60-18.01-RP", - "id": "icefog72/Ice0.60-18.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.467 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2837 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.60.1-18.01-RP/4246401d-9049-4c83-83d4-e2d9efa4dded.json b/data/hfopenllm_v2/icefog72/Ice0.60.1-18.01-RP/4246401d-9049-4c83-83d4-e2d9efa4dded.json deleted file mode 100644 index a75134cde..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.60.1-18.01-RP/4246401d-9049-4c83-83d4-e2d9efa4dded.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.60.1-18.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.60.1-18.01-RP", - 
"id": "icefog72/Ice0.60.1-18.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5188 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4498 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2914 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.61-18.01-RP/26c4785a-0caf-4b01-be5d-1e421bfeb698.json b/data/hfopenllm_v2/icefog72/Ice0.61-18.01-RP/26c4785a-0caf-4b01-be5d-1e421bfeb698.json deleted file mode 100644 index de443599c..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.61-18.01-RP/26c4785a-0caf-4b01-be5d-1e421bfeb698.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.61-18.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.61-18.01-RP", - "id": "icefog72/Ice0.61-18.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5441 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4697 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2709 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.62-18.01-RP/cc9b9a25-18f9-4cc3-a756-3975a3a3be7d.json b/data/hfopenllm_v2/icefog72/Ice0.62-18.01-RP/cc9b9a25-18f9-4cc3-a756-3975a3a3be7d.json deleted file mode 100644 index 1c76bab97..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.62-18.01-RP/cc9b9a25-18f9-4cc3-a756-3975a3a3be7d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.62-18.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.62-18.01-RP", - "id": "icefog72/Ice0.62-18.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5103 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2877 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.62.1-24.01-RP/b4edb7f5-a675-4627-af96-7ed0909da1e5.json b/data/hfopenllm_v2/icefog72/Ice0.62.1-24.01-RP/b4edb7f5-a675-4627-af96-7ed0909da1e5.json deleted file mode 100644 index da0bcab1f..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.62.1-24.01-RP/b4edb7f5-a675-4627-af96-7ed0909da1e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.62.1-24.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.62.1-24.01-RP", - "id": "icefog72/Ice0.62.1-24.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5182 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4551 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2871 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.64-24.01-RP/461b6f40-6f19-48b1-857e-f0fb37f929f9.json b/data/hfopenllm_v2/icefog72/Ice0.64-24.01-RP/461b6f40-6f19-48b1-857e-f0fb37f929f9.json deleted file mode 100644 index 3cff42e22..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.64-24.01-RP/461b6f40-6f19-48b1-857e-f0fb37f929f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.64-24.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.64-24.01-RP", - "id": "icefog72/Ice0.64-24.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5441 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2933 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.64.1-24.01-RP/e924270d-a655-4093-91b2-f73b7f12eefd.json b/data/hfopenllm_v2/icefog72/Ice0.64.1-24.01-RP/e924270d-a655-4093-91b2-f73b7f12eefd.json deleted file mode 100644 index 63500c106..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.64.1-24.01-RP/e924270d-a655-4093-91b2-f73b7f12eefd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.64.1-24.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.64.1-24.01-RP", - "id": "icefog72/Ice0.64.1-24.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5447 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2933 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.65-25.01-RP/af8905e0-e969-45bd-8e09-e7316fff0914.json b/data/hfopenllm_v2/icefog72/Ice0.65-25.01-RP/af8905e0-e969-45bd-8e09-e7316fff0914.json deleted file mode 100644 index 6575a19cf..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.65-25.01-RP/af8905e0-e969-45bd-8e09-e7316fff0914.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.65-25.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.65-25.01-RP", - "id": "icefog72/Ice0.65-25.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5029 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2997 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.66-25.01-RP/e92a6d31-2277-4093-8fae-b3dfaa2d47dd.json b/data/hfopenllm_v2/icefog72/Ice0.66-25.01-RP/e92a6d31-2277-4093-8fae-b3dfaa2d47dd.json deleted file mode 100644 index f0e31f665..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.66-25.01-RP/e92a6d31-2277-4093-8fae-b3dfaa2d47dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.66-25.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.66-25.01-RP", - "id": "icefog72/Ice0.66-25.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5325 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5129 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3039 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.67-25.01-RP/47472cd9-36d3-4074-83d4-af53b9c23758.json b/data/hfopenllm_v2/icefog72/Ice0.67-25.01-RP/47472cd9-36d3-4074-83d4-af53b9c23758.json deleted file mode 100644 index 46ca836b5..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.67-25.01-RP/47472cd9-36d3-4074-83d4-af53b9c23758.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.67-25.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.67-25.01-RP", - "id": "icefog72/Ice0.67-25.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5361 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5113 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3097 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.68-25.01-RP/b922f4e1-1fd9-4a32-94ce-4784430cef51.json b/data/hfopenllm_v2/icefog72/Ice0.68-25.01-RP/b922f4e1-1fd9-4a32-94ce-4784430cef51.json deleted file mode 100644 index 02b0eca7c..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.68-25.01-RP/b922f4e1-1fd9-4a32-94ce-4784430cef51.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.68-25.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.68-25.01-RP", - "id": "icefog72/Ice0.68-25.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5514 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.513 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.69-25.01-RP/5bb2e77f-7709-4eb8-bd08-3c8da4a56310.json b/data/hfopenllm_v2/icefog72/Ice0.69-25.01-RP/5bb2e77f-7709-4eb8-bd08-3c8da4a56310.json deleted file mode 100644 index 87263c5f0..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.69-25.01-RP/5bb2e77f-7709-4eb8-bd08-3c8da4a56310.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.69-25.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.69-25.01-RP", - "id": "icefog72/Ice0.69-25.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5438 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2965 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/icefog72/Ice0.7-29.09-RP/35937213-bb16-4935-9d92-9fa8fd61aac3.json b/data/hfopenllm_v2/icefog72/Ice0.7-29.09-RP/35937213-bb16-4935-9d92-9fa8fd61aac3.json deleted file mode 100644 index 88704a4a5..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.7-29.09-RP/35937213-bb16-4935-9d92-9fa8fd61aac3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.7-29.09-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.7-29.09-RP", - "id": "icefog72/Ice0.7-29.09-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5176 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5048 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4238 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.70-25.01-RP/04122d1b-929d-439c-bb8d-f08508f7a00e.json b/data/hfopenllm_v2/icefog72/Ice0.70-25.01-RP/04122d1b-929d-439c-bb8d-f08508f7a00e.json deleted file mode 100644 index 57317f5c1..000000000 --- 
a/data/hfopenllm_v2/icefog72/Ice0.70-25.01-RP/04122d1b-929d-439c-bb8d-f08508f7a00e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.70-25.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.70-25.01-RP", - "id": "icefog72/Ice0.70-25.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5498 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5136 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4512 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2996 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.70.1-01.02-RP/03beb242-2628-4ea0-a2f3-c3ec43d379de.json b/data/hfopenllm_v2/icefog72/Ice0.70.1-01.02-RP/03beb242-2628-4ea0-a2f3-c3ec43d379de.json deleted file mode 100644 index 72031f33b..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.70.1-01.02-RP/03beb242-2628-4ea0-a2f3-c3ec43d379de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.70.1-01.02-RP/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.70.1-01.02-RP", - "id": "icefog72/Ice0.70.1-01.02-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.73-01.02-RP/46d55b7b-1972-4cb0-97ca-e04d306282a7.json b/data/hfopenllm_v2/icefog72/Ice0.73-01.02-RP/46d55b7b-1972-4cb0-97ca-e04d306282a7.json deleted file mode 100644 index 6f985e2ba..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.73-01.02-RP/46d55b7b-1972-4cb0-97ca-e04d306282a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.73-01.02-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.73-01.02-RP", - "id": 
"icefog72/Ice0.73-01.02-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5292 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5103 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4664 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2702 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.74-02.02-RP/32730d82-cfac-481f-9a22-9cbe40646218.json b/data/hfopenllm_v2/icefog72/Ice0.74-02.02-RP/32730d82-cfac-481f-9a22-9cbe40646218.json deleted file mode 100644 index db1bba27d..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.74-02.02-RP/32730d82-cfac-481f-9a22-9cbe40646218.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.74-02.02-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.74-02.02-RP", - "id": "icefog72/Ice0.74-02.02-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2935 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4646 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.76-02.02-RP/a290a75f-753b-489d-87a2-ce0637c09f41.json b/data/hfopenllm_v2/icefog72/Ice0.76-02.02-RP/a290a75f-753b-489d-87a2-ce0637c09f41.json deleted file mode 100644 index 9c170c9cd..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.76-02.02-RP/a290a75f-753b-489d-87a2-ce0637c09f41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.76-02.02-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.76-02.02-RP", - "id": "icefog72/Ice0.76-02.02-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5086 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4362 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2652 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.77-02.02-RP/54032eb0-c4cd-4c76-be2e-f0c81bd26365.json b/data/hfopenllm_v2/icefog72/Ice0.77-02.02-RP/54032eb0-c4cd-4c76-be2e-f0c81bd26365.json deleted file mode 100644 index d50d5a736..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.77-02.02-RP/54032eb0-c4cd-4c76-be2e-f0c81bd26365.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.77-02.02-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.77-02.02-RP", - "id": "icefog72/Ice0.77-02.02-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4765 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2999 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.78-02.02-RP/73b59506-cc1d-413c-a28b-d25e0e6bf413.json b/data/hfopenllm_v2/icefog72/Ice0.78-02.02-RP/73b59506-cc1d-413c-a28b-d25e0e6bf413.json deleted file mode 100644 index cbf1db27b..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.78-02.02-RP/73b59506-cc1d-413c-a28b-d25e0e6bf413.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.78-02.02-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.78-02.02-RP", - "id": "icefog72/Ice0.78-02.02-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4053 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5002 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2955 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.80-03.02-RP/bea2dcd6-4772-4aac-bcbc-4802cfb33495.json b/data/hfopenllm_v2/icefog72/Ice0.80-03.02-RP/bea2dcd6-4772-4aac-bcbc-4802cfb33495.json deleted file mode 100644 index 1c2303ea6..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.80-03.02-RP/bea2dcd6-4772-4aac-bcbc-4802cfb33495.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.80-03.02-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.80-03.02-RP", - "id": "icefog72/Ice0.80-03.02-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5516 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4923 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceCocoaRP-7b/66275215-28e6-42bc-bc22-5d152682ce53.json b/data/hfopenllm_v2/icefog72/IceCocoaRP-7b/66275215-28e6-42bc-bc22-5d152682ce53.json deleted file mode 100644 index 481d7723a..000000000 --- a/data/hfopenllm_v2/icefog72/IceCocoaRP-7b/66275215-28e6-42bc-bc22-5d152682ce53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceCocoaRP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceCocoaRP-7b", - "id": "icefog72/IceCocoaRP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4938 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceCoffeeRP-7b/9015365c-400b-4fa3-85f2-a1033b030cf7.json b/data/hfopenllm_v2/icefog72/IceCoffeeRP-7b/9015365c-400b-4fa3-85f2-a1033b030cf7.json deleted file mode 100644 index dd02fc052..000000000 --- a/data/hfopenllm_v2/icefog72/IceCoffeeRP-7b/9015365c-400b-4fa3-85f2-a1033b030cf7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceCoffeeRP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceCoffeeRP-7b", - "id": "icefog72/IceCoffeeRP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4959 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4889 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2975 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceDrinkByFrankensteinV3RP/55d52914-0904-4e6e-8b37-c22b06f5f2bf.json b/data/hfopenllm_v2/icefog72/IceDrinkByFrankensteinV3RP/55d52914-0904-4e6e-8b37-c22b06f5f2bf.json deleted file mode 100644 index ccf1b184c..000000000 --- a/data/hfopenllm_v2/icefog72/IceDrinkByFrankensteinV3RP/55d52914-0904-4e6e-8b37-c22b06f5f2bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceDrinkByFrankensteinV3RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceDrinkByFrankensteinV3RP", - "id": "icefog72/IceDrinkByFrankensteinV3RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4975 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4833 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2927 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceDrinkNameGoesHereRP-7b-Model_Stock/3677260a-2fd5-41bf-9010-f1b31cedacbc.json b/data/hfopenllm_v2/icefog72/IceDrinkNameGoesHereRP-7b-Model_Stock/3677260a-2fd5-41bf-9010-f1b31cedacbc.json deleted file mode 100644 index 24025b247..000000000 --- a/data/hfopenllm_v2/icefog72/IceDrinkNameGoesHereRP-7b-Model_Stock/3677260a-2fd5-41bf-9010-f1b31cedacbc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceDrinkNameGoesHereRP-7b-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceDrinkNameGoesHereRP-7b-Model_Stock", - "id": "icefog72/IceDrinkNameGoesHereRP-7b-Model_Stock", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4067 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceDrinkNameNotFoundRP-7b-Model_Stock/fc54f87a-2e4a-4f3f-b407-e268c4487d16.json b/data/hfopenllm_v2/icefog72/IceDrinkNameNotFoundRP-7b-Model_Stock/fc54f87a-2e4a-4f3f-b407-e268c4487d16.json deleted file mode 100644 index 2746a283f..000000000 --- a/data/hfopenllm_v2/icefog72/IceDrinkNameNotFoundRP-7b-Model_Stock/fc54f87a-2e4a-4f3f-b407-e268c4487d16.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceDrinkNameNotFoundRP-7b-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceDrinkNameNotFoundRP-7b-Model_Stock", - "id": "icefog72/IceDrinkNameNotFoundRP-7b-Model_Stock", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.513 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3064 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceDrunkCherryRP-7b/8d893736-1707-4c0b-860d-16c62ec26d78.json b/data/hfopenllm_v2/icefog72/IceDrunkCherryRP-7b/8d893736-1707-4c0b-860d-16c62ec26d78.json deleted file mode 100644 index fdf69bdb5..000000000 --- a/data/hfopenllm_v2/icefog72/IceDrunkCherryRP-7b/8d893736-1707-4c0b-860d-16c62ec26d78.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceDrunkCherryRP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceDrunkCherryRP-7b", - "id": "icefog72/IceDrunkCherryRP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4898 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4292 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3009 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/icefog72/IceDrunkenCherryRP-7b/d3d2728f-74bf-4196-a909-43797d8b628a.json b/data/hfopenllm_v2/icefog72/IceDrunkenCherryRP-7b/d3d2728f-74bf-4196-a909-43797d8b628a.json deleted file mode 100644 index 2dca8d588..000000000 --- a/data/hfopenllm_v2/icefog72/IceDrunkenCherryRP-7b/d3d2728f-74bf-4196-a909-43797d8b628a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceDrunkenCherryRP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceDrunkenCherryRP-7b", - "id": "icefog72/IceDrunkenCherryRP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3099 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceEspressoRPv2-7b/ed241e67-8718-48be-a6e8-19e295a2b5cd.json b/data/hfopenllm_v2/icefog72/IceEspressoRPv2-7b/ed241e67-8718-48be-a6e8-19e295a2b5cd.json deleted file mode 100644 index 1ca2a66f9..000000000 --- 
a/data/hfopenllm_v2/icefog72/IceEspressoRPv2-7b/ed241e67-8718-48be-a6e8-19e295a2b5cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceEspressoRPv2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceEspressoRPv2-7b", - "id": "icefog72/IceEspressoRPv2-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4977 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5055 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4331 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3061 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceLemonTeaRP-32k-7b/05aafad3-e07a-453b-a70b-f18fbd4eb218.json b/data/hfopenllm_v2/icefog72/IceLemonTeaRP-32k-7b/05aafad3-e07a-453b-a70b-f18fbd4eb218.json deleted file mode 100644 index 751ec205c..000000000 --- a/data/hfopenllm_v2/icefog72/IceLemonTeaRP-32k-7b/05aafad3-e07a-453b-a70b-f18fbd4eb218.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceLemonTeaRP-32k-7b/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceLemonTeaRP-32k-7b", - "id": "icefog72/IceLemonTeaRP-32k-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4997 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceMartiniRP-7b/f79ac32e-ab83-40c3-9c18-35623f5ae1d4.json b/data/hfopenllm_v2/icefog72/IceMartiniRP-7b/f79ac32e-ab83-40c3-9c18-35623f5ae1d4.json deleted file mode 100644 index e94d36e4e..000000000 --- a/data/hfopenllm_v2/icefog72/IceMartiniRP-7b/f79ac32e-ab83-40c3-9c18-35623f5ae1d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceMartiniRP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"IceMartiniRP-7b", - "id": "icefog72/IceMartiniRP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4972 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4345 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3073 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceNalyvkaRP-7b/cec76b15-1069-4d37-b8bc-74dde28101f6.json b/data/hfopenllm_v2/icefog72/IceNalyvkaRP-7b/cec76b15-1069-4d37-b8bc-74dde28101f6.json deleted file mode 100644 index f70b98ac1..000000000 --- a/data/hfopenllm_v2/icefog72/IceNalyvkaRP-7b/cec76b15-1069-4d37-b8bc-74dde28101f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceNalyvkaRP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceNalyvkaRP-7b", - "id": "icefog72/IceNalyvkaRP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { 
- "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5498 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5136 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4512 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2996 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceSakeRP-7b/e4ac0d0c-65ea-4b43-bb4b-7371c6cd5d61.json b/data/hfopenllm_v2/icefog72/IceSakeRP-7b/e4ac0d0c-65ea-4b43-bb4b-7371c6cd5d61.json deleted file mode 100644 index 70a43f1aa..000000000 --- a/data/hfopenllm_v2/icefog72/IceSakeRP-7b/e4ac0d0c-65ea-4b43-bb4b-7371c6cd5d61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceSakeRP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceSakeRP-7b", - "id": "icefog72/IceSakeRP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5228 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5119 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.413 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3177 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceSakeV4RP-7b/f8d629bf-df0b-4c6a-8c18-17dda002b089.json b/data/hfopenllm_v2/icefog72/IceSakeV4RP-7b/f8d629bf-df0b-4c6a-8c18-17dda002b089.json deleted file mode 100644 index 620f3fe56..000000000 --- a/data/hfopenllm_v2/icefog72/IceSakeV4RP-7b/f8d629bf-df0b-4c6a-8c18-17dda002b089.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceSakeV4RP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceSakeV4RP-7b", - "id": "icefog72/IceSakeV4RP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4634 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4082 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceSakeV6RP-7b/6739d8e3-f4bd-4fd5-98f3-887f5ed3f9c0.json b/data/hfopenllm_v2/icefog72/IceSakeV6RP-7b/6739d8e3-f4bd-4fd5-98f3-887f5ed3f9c0.json deleted file mode 100644 index 477e0dbce..000000000 --- a/data/hfopenllm_v2/icefog72/IceSakeV6RP-7b/6739d8e3-f4bd-4fd5-98f3-887f5ed3f9c0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceSakeV6RP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceSakeV6RP-7b", - "id": "icefog72/IceSakeV6RP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4976 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceSakeV8RP-7b/a51722f4-29f4-47a5-acba-4c8b5355551b.json b/data/hfopenllm_v2/icefog72/IceSakeV8RP-7b/a51722f4-29f4-47a5-acba-4c8b5355551b.json deleted file mode 100644 index 7a5af5dc8..000000000 --- a/data/hfopenllm_v2/icefog72/IceSakeV8RP-7b/a51722f4-29f4-47a5-acba-4c8b5355551b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceSakeV8RP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceSakeV8RP-7b", - "id": "icefog72/IceSakeV8RP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6086 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - 
{ - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.301 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3.5/06d0a21f-f6e4-4ca9-a679-8c4502aaaad1.json b/data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3.5/06d0a21f-f6e4-4ca9-a679-8c4502aaaad1.json deleted file mode 100644 index bf984e1a4..000000000 --- a/data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3.5/06d0a21f-f6e4-4ca9-a679-8c4502aaaad1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceTea21EnergyDrinkRPV13-DPOv3.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceTea21EnergyDrinkRPV13-DPOv3.5", - "id": "icefog72/IceTea21EnergyDrinkRPV13-DPOv3.5", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4871 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2498 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3/04a4dcc9-3784-4aea-9faf-9db49c2e4c43.json b/data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3/04a4dcc9-3784-4aea-9faf-9db49c2e4c43.json deleted file mode 100644 index 87ff25f1f..000000000 --- a/data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3/04a4dcc9-3784-4aea-9faf-9db49c2e4c43.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceTea21EnergyDrinkRPV13-DPOv3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceTea21EnergyDrinkRPV13-DPOv3", - "id": "icefog72/IceTea21EnergyDrinkRPV13-DPOv3", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 
- } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3056 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ifable/gemma-2-Ifable-9B/e4668365-d3dd-4996-9bb1-5b4e6f510264.json b/data/hfopenllm_v2/ifable/gemma-2-Ifable-9B/e4668365-d3dd-4996-9bb1-5b4e6f510264.json deleted file mode 100644 index 186f9833c..000000000 --- a/data/hfopenllm_v2/ifable/gemma-2-Ifable-9B/e4668365-d3dd-4996-9bb1-5b4e6f510264.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ifable_gemma-2-Ifable-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-Ifable-9B", - "id": "ifable/gemma-2-Ifable-9B", - "developer": "ifable", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2984 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5866 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4053 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ilsp/Llama-Krikri-8B-Instruct/4d743678-e14d-4866-b1bf-0d660787847b.json b/data/hfopenllm_v2/ilsp/Llama-Krikri-8B-Instruct/4d743678-e14d-4866-b1bf-0d660787847b.json deleted file mode 100644 index 4ae435b53..000000000 --- a/data/hfopenllm_v2/ilsp/Llama-Krikri-8B-Instruct/4d743678-e14d-4866-b1bf-0d660787847b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ilsp_Llama-Krikri-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Krikri-8B-Instruct", - "id": "ilsp/Llama-Krikri-8B-Instruct", - "developer": "ilsp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.202 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6079 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3313 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/inflatebot/MN-12B-Mag-Mell-R1/720b1476-876c-47d1-bf46-d037389b4b2f.json b/data/hfopenllm_v2/inflatebot/MN-12B-Mag-Mell-R1/720b1476-876c-47d1-bf46-d037389b4b2f.json deleted file mode 100644 index 848ed7790..000000000 --- a/data/hfopenllm_v2/inflatebot/MN-12B-Mag-Mell-R1/720b1476-876c-47d1-bf46-d037389b4b2f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/inflatebot_MN-12B-Mag-Mell-R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mag-Mell-R1", - "id": "inflatebot/MN-12B-Mag-Mell-R1", - "developer": "inflatebot", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4613 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5304 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4002 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - } 
- } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/informatiker/Qwen2-7B-Instruct-abliterated/4e4f3b2d-5b17-486a-a2ab-c2e89194c765.json b/data/hfopenllm_v2/informatiker/Qwen2-7B-Instruct-abliterated/4e4f3b2d-5b17-486a-a2ab-c2e89194c765.json deleted file mode 100644 index 69b688c62..000000000 --- a/data/hfopenllm_v2/informatiker/Qwen2-7B-Instruct-abliterated/4e4f3b2d-5b17-486a-a2ab-c2e89194c765.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/informatiker_Qwen2-7B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-7B-Instruct-abliterated", - "id": "informatiker/Qwen2-7B-Instruct-abliterated", - "developer": "informatiker", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5822 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5534 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2636 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3888 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/insightfactory/Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model/b738668e-3ac1-4a36-ad71-ad7d2a5256ae.json b/data/hfopenllm_v2/insightfactory/Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model/b738668e-3ac1-4a36-ad71-ad7d2a5256ae.json deleted file mode 100644 index 35ea90e61..000000000 --- a/data/hfopenllm_v2/insightfactory/Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model/b738668e-3ac1-4a36-ad71-ad7d2a5256ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/insightfactory_Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model", - "id": "insightfactory/Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model", - "developer": "insightfactory", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "", - "params_billions": 1.933 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4588 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3499 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.296 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/instruction-pretrain/InstructLM-500M/623f1b73-1505-4527-b41c-dcb2b711226d.json b/data/hfopenllm_v2/instruction-pretrain/InstructLM-500M/623f1b73-1505-4527-b41c-dcb2b711226d.json deleted file mode 100644 index 250a5012f..000000000 --- a/data/hfopenllm_v2/instruction-pretrain/InstructLM-500M/623f1b73-1505-4527-b41c-dcb2b711226d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/instruction-pretrain_InstructLM-500M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "InstructLM-500M", - "id": "instruction-pretrain/InstructLM-500M", - "developer": "instruction-pretrain", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 0.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1028 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2941 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1141 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/internlm/internlm2-1_8b/53f03454-9587-4208-bc01-21de62f59195.json b/data/hfopenllm_v2/internlm/internlm2-1_8b/53f03454-9587-4208-bc01-21de62f59195.json deleted file mode 100644 index 
112c6e318..000000000 --- a/data/hfopenllm_v2/internlm/internlm2-1_8b/53f03454-9587-4208-bc01-21de62f59195.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/internlm_internlm2-1_8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm2-1_8b", - "id": "internlm/internlm2-1_8b", - "developer": "internlm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3813 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1588 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/internlm/internlm2-7b/fb38d8b4-6320-4b8d-bf3d-e3d22bb0ed83.json b/data/hfopenllm_v2/internlm/internlm2-7b/fb38d8b4-6320-4b8d-bf3d-e3d22bb0ed83.json deleted file mode 100644 index 0954fe225..000000000 --- a/data/hfopenllm_v2/internlm/internlm2-7b/fb38d8b4-6320-4b8d-bf3d-e3d22bb0ed83.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/internlm_internlm2-7b/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm2-7b", - "id": "internlm/internlm2-7b", - "developer": "internlm", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Unknown", - "params_billions": 0.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.228 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5825 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0857 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.19 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/internlm/internlm2-chat-1_8b/b127a923-3bf2-4cad-9225-d738efe800e3.json b/data/hfopenllm_v2/internlm/internlm2-chat-1_8b/b127a923-3bf2-4cad-9225-d738efe800e3.json deleted file mode 100644 index 0ea7fed94..000000000 --- a/data/hfopenllm_v2/internlm/internlm2-chat-1_8b/b127a923-3bf2-4cad-9225-d738efe800e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/internlm_internlm2-chat-1_8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm2-chat-1_8b", - "id": 
"internlm/internlm2-chat-1_8b", - "developer": "internlm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 1.889 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2387 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1839 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/internlm/internlm2_5-1_8b-chat/a94ae52a-7936-4750-83f5-4740f23adf15.json b/data/hfopenllm_v2/internlm/internlm2_5-1_8b-chat/a94ae52a-7936-4750-83f5-4740f23adf15.json deleted file mode 100644 index 89f784e4a..000000000 --- a/data/hfopenllm_v2/internlm/internlm2_5-1_8b-chat/a94ae52a-7936-4750-83f5-4740f23adf15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/internlm_internlm2_5-1_8b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm2_5-1_8b-chat", - "id": "internlm/internlm2_5-1_8b-chat", - "developer": "internlm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 1.89 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3849 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/internlm/internlm2_5-20b-chat/95e689c6-cd19-4114-b3b5-1672ab849214.json b/data/hfopenllm_v2/internlm/internlm2_5-20b-chat/95e689c6-cd19-4114-b3b5-1672ab849214.json deleted file mode 100644 index c3237ef07..000000000 --- a/data/hfopenllm_v2/internlm/internlm2_5-20b-chat/95e689c6-cd19-4114-b3b5-1672ab849214.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/internlm_internlm2_5-20b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm2_5-20b-chat", - "id": "internlm/internlm2_5-20b-chat", - "developer": "internlm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 19.86 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7474 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4558 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/internlm/internlm2_5-7b-chat/890a8414-bccf-4a66-8013-6c270d017965.json b/data/hfopenllm_v2/internlm/internlm2_5-7b-chat/890a8414-bccf-4a66-8013-6c270d017965.json deleted file mode 100644 index 39a2746e0..000000000 --- a/data/hfopenllm_v2/internlm/internlm2_5-7b-chat/890a8414-bccf-4a66-8013-6c270d017965.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/internlm_internlm2_5-7b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm2_5-7b-chat", - "id": "internlm/internlm2_5-7b-chat", - "developer": "internlm", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 7.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5539 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/intervitens/mini-magnum-12b-v1.1/0f8ce410-cf3b-4f78-81b9-a0a1fe91b963.json b/data/hfopenllm_v2/intervitens/mini-magnum-12b-v1.1/0f8ce410-cf3b-4f78-81b9-a0a1fe91b963.json deleted file mode 100644 index 37988d1da..000000000 --- a/data/hfopenllm_v2/intervitens/mini-magnum-12b-v1.1/0f8ce410-cf3b-4f78-81b9-a0a1fe91b963.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/intervitens_mini-magnum-12b-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mini-magnum-12b-v1.1", - "id": "intervitens/mini-magnum-12b-v1.1", - "developer": "intervitens", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5156 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5062 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3291 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/inumulaisk/eval_model/121096cf-356b-4069-a0a3-8cf6aad52b81.json b/data/hfopenllm_v2/inumulaisk/eval_model/121096cf-356b-4069-a0a3-8cf6aad52b81.json deleted file mode 100644 index fd8c4c747..000000000 --- a/data/hfopenllm_v2/inumulaisk/eval_model/121096cf-356b-4069-a0a3-8cf6aad52b81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/inumulaisk_eval_model/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "eval_model", - "id": "inumulaisk/eval_model", - "developer": "inumulaisk", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1931 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH 
Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2976 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/invalid-coder/Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp/fb0bcadf-32a0-4320-909f-2c38ba7d9372.json b/data/hfopenllm_v2/invalid-coder/Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp/fb0bcadf-32a0-4320-909f-2c38ba7d9372.json deleted file mode 100644 index 8bfc59d66..000000000 --- a/data/hfopenllm_v2/invalid-coder/Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp/fb0bcadf-32a0-4320-909f-2c38ba7d9372.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/invalid-coder_Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp", - "id": "invalid-coder/Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp", - "developer": "invalid-coder", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5158 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3992 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/invisietch/EtherealRainbow-v0.2-8B/ab941c52-cf33-4b8e-87af-4a73930cf72a.json b/data/hfopenllm_v2/invisietch/EtherealRainbow-v0.2-8B/ab941c52-cf33-4b8e-87af-4a73930cf72a.json deleted file mode 100644 index a6bf367ef..000000000 --- a/data/hfopenllm_v2/invisietch/EtherealRainbow-v0.2-8B/ab941c52-cf33-4b8e-87af-4a73930cf72a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/invisietch_EtherealRainbow-v0.2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EtherealRainbow-v0.2-8B", - "id": "invisietch/EtherealRainbow-v0.2-8B", - "developer": "invisietch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3903 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5102 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3827 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3653 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/invisietch/EtherealRainbow-v0.3-8B/08c242fd-0258-4817-970a-668584ed9385.json b/data/hfopenllm_v2/invisietch/EtherealRainbow-v0.3-8B/08c242fd-0258-4817-970a-668584ed9385.json deleted file mode 100644 index d5ccbfe08..000000000 --- a/data/hfopenllm_v2/invisietch/EtherealRainbow-v0.3-8B/08c242fd-0258-4817-970a-668584ed9385.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/invisietch_EtherealRainbow-v0.3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EtherealRainbow-v0.3-8B", - "id": "invisietch/EtherealRainbow-v0.3-8B", - "developer": "invisietch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3682 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3904 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3626 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/invisietch/MiS-Firefly-v0.2-22B/2171af9a-be5e-4daf-8e67-a5239ccec7bd.json b/data/hfopenllm_v2/invisietch/MiS-Firefly-v0.2-22B/2171af9a-be5e-4daf-8e67-a5239ccec7bd.json deleted file mode 100644 index 501989bf4..000000000 --- a/data/hfopenllm_v2/invisietch/MiS-Firefly-v0.2-22B/2171af9a-be5e-4daf-8e67-a5239ccec7bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/invisietch_MiS-Firefly-v0.2-22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiS-Firefly-v0.2-22B", - "id": "invisietch/MiS-Firefly-v0.2-22B", - "developer": "invisietch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5514 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1654 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4694 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.362 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/invisietch/Nimbus-Miqu-v0.1-70B/706f75a1-2f6b-47dd-809e-a830e739b574.json b/data/hfopenllm_v2/invisietch/Nimbus-Miqu-v0.1-70B/706f75a1-2f6b-47dd-809e-a830e739b574.json deleted file mode 100644 index 4802b7933..000000000 --- a/data/hfopenllm_v2/invisietch/Nimbus-Miqu-v0.1-70B/706f75a1-2f6b-47dd-809e-a830e739b574.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/invisietch_Nimbus-Miqu-v0.1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nimbus-Miqu-v0.1-70B", - "id": "invisietch/Nimbus-Miqu-v0.1-70B", - "developer": "invisietch", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 68.977 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.601 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4133 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/irahulpandey/mistralai-7B-slerp-v0.1/a9cd0399-4670-4f5c-8c64-c82dac97cd8c.json b/data/hfopenllm_v2/irahulpandey/mistralai-7B-slerp-v0.1/a9cd0399-4670-4f5c-8c64-c82dac97cd8c.json deleted file mode 100644 index 42cef2a10..000000000 --- a/data/hfopenllm_v2/irahulpandey/mistralai-7B-slerp-v0.1/a9cd0399-4670-4f5c-8c64-c82dac97cd8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/irahulpandey_mistralai-7B-slerp-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistralai-7B-slerp-v0.1", - "id": "irahulpandey/mistralai-7B-slerp-v0.1", - "developer": "irahulpandey", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4966 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5011 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2951 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaredjoss/pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model/67cfd12d-0551-406d-bd1d-8ced75c69478.json b/data/hfopenllm_v2/jaredjoss/pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model/67cfd12d-0551-406d-bd1d-8ced75c69478.json deleted file mode 100644 index 6cad09ae3..000000000 --- a/data/hfopenllm_v2/jaredjoss/pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model/67cfd12d-0551-406d-bd1d-8ced75c69478.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaredjoss_pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model", - "id": "jaredjoss/pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model", - "developer": "jaredjoss", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 0.407 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1572 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2863 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2-8B/0a31d2f0-196b-4508-861a-1ba7bd28ea23.json b/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2-8B/0a31d2f0-196b-4508-861a-1ba7bd28ea23.json deleted file mode 100644 index 8ebe97770..000000000 --- a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2-8B/0a31d2f0-196b-4508-861a-1ba7bd28ea23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Auro-Kosmos-EVAA-v2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Auro-Kosmos-EVAA-v2-8B", - "id": "jaspionjader/Auro-Kosmos-EVAA-v2-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5447 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3858 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.1-8B/57576999-2749-441a-91d6-5a976e83a658.json b/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.1-8B/57576999-2749-441a-91d6-5a976e83a658.json deleted file mode 100644 index 315886d52..000000000 --- a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.1-8B/57576999-2749-441a-91d6-5a976e83a658.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Auro-Kosmos-EVAA-v2.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Auro-Kosmos-EVAA-v2.1-8B", - "id": "jaspionjader/Auro-Kosmos-EVAA-v2.1-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4666 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5444 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.2-8B/e44792e6-0329-4784-832b-3043478e70a4.json b/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.2-8B/e44792e6-0329-4784-832b-3043478e70a4.json 
deleted file mode 100644 index 3152352b6..000000000 --- a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.2-8B/e44792e6-0329-4784-832b-3043478e70a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Auro-Kosmos-EVAA-v2.2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Auro-Kosmos-EVAA-v2.2-8B", - "id": "jaspionjader/Auro-Kosmos-EVAA-v2.2-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.3-8B/8b3789d6-51be-472a-95d3-2ae7c34ad140.json b/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.3-8B/8b3789d6-51be-472a-95d3-2ae7c34ad140.json deleted file mode 100644 index f7a617f03..000000000 --- a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.3-8B/8b3789d6-51be-472a-95d3-2ae7c34ad140.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Auro-Kosmos-EVAA-v2.3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Auro-Kosmos-EVAA-v2.3-8B", - "id": "jaspionjader/Auro-Kosmos-EVAA-v2.3-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4271 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4278 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-Aurora_faustus-8B/3f4765f2-551b-485f-9020-0cf17a36a887.json b/data/hfopenllm_v2/jaspionjader/Kosmos-Aurora_faustus-8B/3f4765f2-551b-485f-9020-0cf17a36a887.json deleted file mode 100644 index 0ae45d0a4..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-Aurora_faustus-8B/3f4765f2-551b-485f-9020-0cf17a36a887.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-Aurora_faustus-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-Aurora_faustus-8B", - "id": "jaspionjader/Kosmos-Aurora_faustus-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4117 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3813 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-8B/6375a845-5d86-4dcf-bfd2-e836daa4ca11.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-8B/6375a845-5d86-4dcf-bfd2-e836daa4ca11.json deleted file mode 100644 index f9aae6a67..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-8B/6375a845-5d86-4dcf-bfd2-e836daa4ca11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-8B", - "id": 
"jaspionjader/Kosmos-EVAA-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4405 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5312 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-Immersive-v39-8B/65a74446-6964-4f5f-8ea6-aeb1b09595ae.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-Immersive-v39-8B/65a74446-6964-4f5f-8ea6-aeb1b09595ae.json deleted file mode 100644 index 79154e34e..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-Immersive-v39-8B/65a74446-6964-4f5f-8ea6-aeb1b09595ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-Franken-Immersive-v39-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-Franken-Immersive-v39-8B", - "id": "jaspionjader/Kosmos-EVAA-Franken-Immersive-v39-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-v38-8B/dcba5998-3b84-4753-a4fa-2558ffe3e69b.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-v38-8B/dcba5998-3b84-4753-a4fa-2558ffe3e69b.json deleted file mode 100644 index 394557379..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-v38-8B/dcba5998-3b84-4753-a4fa-2558ffe3e69b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-Franken-v38-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-Franken-v38-8B", - "id": "jaspionjader/Kosmos-EVAA-Franken-v38-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4212 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/0af6b3c0-6638-4bd8-bdd9-349e2b9ca71c.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/0af6b3c0-6638-4bd8-bdd9-349e2b9ca71c.json deleted file mode 100644 index d05e11595..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/0af6b3c0-6638-4bd8-bdd9-349e2b9ca71c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-Fusion-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-Fusion-8B", - "id": "jaspionjader/Kosmos-EVAA-Fusion-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4345 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3854 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/4e332594-d0b9-4913-9950-208abe4faab7.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/4e332594-d0b9-4913-9950-208abe4faab7.json deleted file mode 100644 index eb9755b5c..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/4e332594-d0b9-4913-9950-208abe4faab7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-Fusion-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-Fusion-8B", - "id": "jaspionjader/Kosmos-EVAA-Fusion-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4418 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-8B/5ad2ad73-47ed-465d-b4c0-b358e6b6435f.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-8B/5ad2ad73-47ed-465d-b4c0-b358e6b6435f.json deleted file mode 100644 index b95e94419..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-8B/5ad2ad73-47ed-465d-b4c0-b358e6b6435f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3405 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - 
}, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0884 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-light-8B/c9f716ef-0aa6-445f-8fc9-b102f3a0ea2a.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-light-8B/c9f716ef-0aa6-445f-8fc9-b102f3a0ea2a.json deleted file mode 100644 index 8874fb70f..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-light-8B/c9f716ef-0aa6-445f-8fc9-b102f3a0ea2a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-light-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-light-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-light-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3824 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v23-8B/a2e32a77-867c-4921-ada4-c7b169efbebe.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v23-8B/a2e32a77-867c-4921-ada4-c7b169efbebe.json deleted file mode 100644 index 3407edf36..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v23-8B/a2e32a77-867c-4921-ada4-c7b169efbebe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v23-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v23-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v23-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4041 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v24-8B/f76f759f-d05d-4eb6-a2b9-3b1dfbe840f0.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v24-8B/f76f759f-d05d-4eb6-a2b9-3b1dfbe840f0.json deleted file mode 100644 index 269ec5b3a..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v24-8B/f76f759f-d05d-4eb6-a2b9-3b1dfbe840f0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v24-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v24-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v24-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v25-8B/ece0bd6b-4eec-485c-942b-e23f3295c2f8.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v25-8B/ece0bd6b-4eec-485c-942b-e23f3295c2f8.json deleted file mode 100644 index c4a81c92e..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v25-8B/ece0bd6b-4eec-485c-942b-e23f3295c2f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v25-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v25-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v25-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5291 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v26-8B/ada110bb-0988-4c19-9798-74577dde5ce9.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v26-8B/ada110bb-0988-4c19-9798-74577dde5ce9.json deleted file mode 100644 index fb137b1ab..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v26-8B/ada110bb-0988-4c19-9798-74577dde5ce9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v26-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v26-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v26-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4414 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v27-8B/ed4f994d-d196-40bd-8f8f-f6a7f07c3c90.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v27-8B/ed4f994d-d196-40bd-8f8f-f6a7f07c3c90.json deleted file mode 100644 index 3a868b97e..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v27-8B/ed4f994d-d196-40bd-8f8f-f6a7f07c3c90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v27-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v27-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v27-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3755 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v28-8B/57395f9a-0534-453e-80fc-96e9dc5cd9c3.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v28-8B/57395f9a-0534-453e-80fc-96e9dc5cd9c3.json deleted file mode 100644 index 9e9f72bfb..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v28-8B/57395f9a-0534-453e-80fc-96e9dc5cd9c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v28-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v28-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v28-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v29-8B/f8f70702-9ab4-4e1a-a11d-090627d58f02.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v29-8B/f8f70702-9ab4-4e1a-a11d-090627d58f02.json deleted file mode 100644 index 444db8839..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v29-8B/f8f70702-9ab4-4e1a-a11d-090627d58f02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v29-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v29-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v29-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3765 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v30-8B/3cab8bda-bdf6-4345-b89e-18d34a8f6361.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v30-8B/3cab8bda-bdf6-4345-b89e-18d34a8f6361.json deleted file mode 100644 index 325f0d8d0..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v30-8B/3cab8bda-bdf6-4345-b89e-18d34a8f6361.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v30-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v30-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v30-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4295 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5328 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4263 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v31-8B/0955fc17-8878-401a-9ec3-149528ee51e1.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v31-8B/0955fc17-8878-401a-9ec3-149528ee51e1.json deleted file 
mode 100644 index 4cfc98e57..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v31-8B/0955fc17-8878-401a-9ec3-149528ee51e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v31-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v31-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v31-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4399 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v32-8B/c63bf49a-e7d4-4853-8684-9cc03eaa7840.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v32-8B/c63bf49a-e7d4-4853-8684-9cc03eaa7840.json deleted file mode 100644 index ddd83d684..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v32-8B/c63bf49a-e7d4-4853-8684-9cc03eaa7840.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v32-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v32-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v32-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v33-8B/65e6a3b6-4291-4591-bc0b-576930061c68.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v33-8B/65e6a3b6-4291-4591-bc0b-576930061c68.json deleted file mode 100644 index 48e58506f..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v33-8B/65e6a3b6-4291-4591-bc0b-576930061c68.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v33-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v33-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v33-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5321 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v34-8B/1ddf9e02-4066-440e-a777-fcd3f96bc4b3.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v34-8B/1ddf9e02-4066-440e-a777-fcd3f96bc4b3.json deleted file mode 100644 index 89f19eb09..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v34-8B/1ddf9e02-4066-440e-a777-fcd3f96bc4b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v34-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v34-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v34-8B", - 
"developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4563 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5333 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-8B/f9f96bb2-edbc-4112-97aa-a7420dea32a1.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-8B/f9f96bb2-edbc-4112-97aa-a7420dea32a1.json deleted file mode 100644 index f496d05c5..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-8B/f9f96bb2-edbc-4112-97aa-a7420dea32a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-TSN-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-TSN-8B", - "id": "jaspionjader/Kosmos-EVAA-TSN-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4329 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-light-8B/3a24b30f-7698-4ecb-ac26-3537a0b38616.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-light-8B/3a24b30f-7698-4ecb-ac26-3537a0b38616.json deleted file mode 100644 index 6959b532b..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-light-8B/3a24b30f-7698-4ecb-ac26-3537a0b38616.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-TSN-light-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-TSN-light-8B", - "id": "jaspionjader/Kosmos-EVAA-TSN-light-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3806 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v19-8B/d4030df6-2be6-4f46-9c9b-ce3037b9a004.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v19-8B/d4030df6-2be6-4f46-9c9b-ce3037b9a004.json deleted file mode 100644 index e3426247a..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v19-8B/d4030df6-2be6-4f46-9c9b-ce3037b9a004.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-TSN-v19-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-TSN-v19-8B", - "id": "jaspionjader/Kosmos-EVAA-TSN-v19-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { 
- "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v20-8B/ec234403-f43d-46a0-84a4-ab47673226b3.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v20-8B/ec234403-f43d-46a0-84a4-ab47673226b3.json deleted file mode 100644 index 3081940fd..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v20-8B/ec234403-f43d-46a0-84a4-ab47673226b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-TSN-v20-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-TSN-v20-8B", - "id": "jaspionjader/Kosmos-EVAA-TSN-v20-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3936 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v21-8B/805379f4-784f-4602-92e8-180df4da9fc3.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v21-8B/805379f4-784f-4602-92e8-180df4da9fc3.json deleted file mode 100644 index 3d18c5f8b..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v21-8B/805379f4-784f-4602-92e8-180df4da9fc3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-TSN-v21-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-TSN-v21-8B", - "id": "jaspionjader/Kosmos-EVAA-TSN-v21-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.467 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5248 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v22-8B/9f3920aa-9400-46f1-bcfa-969f69b3335c.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v22-8B/9f3920aa-9400-46f1-bcfa-969f69b3335c.json deleted file mode 100644 index c54dd019f..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v22-8B/9f3920aa-9400-46f1-bcfa-969f69b3335c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-TSN-v22-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-TSN-v22-8B", - "id": "jaspionjader/Kosmos-EVAA-TSN-v22-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5246 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-8B/26cbf444-ab93-409a-b85d-e2bd267eae5e.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-8B/26cbf444-ab93-409a-b85d-e2bd267eae5e.json deleted file mode 100644 index 92fa310e3..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-8B/26cbf444-ab93-409a-b85d-e2bd267eae5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5322 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4306 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-alt-8B/7c2b17a8-1de2-4441-a281-fe3fd043f831.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-alt-8B/7c2b17a8-1de2-4441-a281-fe3fd043f831.json deleted file mode 100644 index 90cb19b6b..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-alt-8B/7c2b17a8-1de2-4441-a281-fe3fd043f831.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-alt-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-alt-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-alt-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4292 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-8B/94c5756c-cbde-46e2-90d2-207678373061.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-8B/94c5756c-cbde-46e2-90d2-207678373061.json deleted file mode 100644 index 980ba2f9f..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-8B/94c5756c-cbde-46e2-90d2-207678373061.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-light-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-light-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-light-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4291 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-alt-8B/e0048124-89bf-4327-88a8-00aa51ee29af.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-alt-8B/e0048124-89bf-4327-88a8-00aa51ee29af.json deleted file mode 100644 index e1956821f..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-alt-8B/e0048124-89bf-4327-88a8-00aa51ee29af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-light-alt-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-light-alt-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-light-alt-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5327 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4305 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-ultra-light-8B/9d776307-43af-43bb-ab64-52fb7f331cfe.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-ultra-light-8B/9d776307-43af-43bb-ab64-52fb7f331cfe.json deleted file mode 100644 index 2646a813c..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-ultra-light-8B/9d776307-43af-43bb-ab64-52fb7f331cfe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-ultra-light-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-ultra-light-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-ultra-light-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4563 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3915 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v13-8B/d8d41981-a7c8-48e9-a63c-86520a0f23d5.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v13-8B/d8d41981-a7c8-48e9-a63c-86520a0f23d5.json deleted file mode 100644 index d1b28b8b8..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v13-8B/d8d41981-a7c8-48e9-a63c-86520a0f23d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-v13-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-v13-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-v13-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5359 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4278 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v14-8B/1355985c-fbcb-4eac-8435-417d6034f2f0.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v14-8B/1355985c-fbcb-4eac-8435-417d6034f2f0.json deleted file mode 100644 index 8087ad74c..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v14-8B/1355985c-fbcb-4eac-8435-417d6034f2f0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-v14-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-v14-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-v14-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3931 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v15-8B/44486b02-7bdd-4f59-8d4e-5c8deeb1fd60.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v15-8B/44486b02-7bdd-4f59-8d4e-5c8deeb1fd60.json deleted file mode 100644 index a02407d4d..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v15-8B/44486b02-7bdd-4f59-8d4e-5c8deeb1fd60.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-v15-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-v15-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-v15-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4654 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3941 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v16-8B/45ae3dc3-6dc0-4d10-99cb-a7f330110906.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v16-8B/45ae3dc3-6dc0-4d10-99cb-a7f330110906.json 
deleted file mode 100644 index 703c586b6..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v16-8B/45ae3dc3-6dc0-4d10-99cb-a7f330110906.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-v16-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-v16-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-v16-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4557 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5344 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3917 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v17-8B/6b54763a-6329-47fb-bf50-296604251b47.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v17-8B/6b54763a-6329-47fb-bf50-296604251b47.json deleted file mode 100644 index d04ea2204..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v17-8B/6b54763a-6329-47fb-bf50-296604251b47.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-v17-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-v17-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-v17-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5347 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4291 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v18-8B/96a26bf3-b4b2-465f-8ce6-a2ef943c001a.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v18-8B/96a26bf3-b4b2-465f-8ce6-a2ef943c001a.json deleted file mode 100644 index c9226fc57..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v18-8B/96a26bf3-b4b2-465f-8ce6-a2ef943c001a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-v18-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-v18-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-v18-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4341 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3905 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-immersive-sof-v44-8B/655b047f-c3a8-4c9c-b864-81d318b2f506.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-immersive-sof-v44-8B/655b047f-c3a8-4c9c-b864-81d318b2f506.json deleted file mode 100644 index 53ac1daaf..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-immersive-sof-v44-8B/655b047f-c3a8-4c9c-b864-81d318b2f506.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-immersive-sof-v44-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Kosmos-EVAA-immersive-sof-v44-8B", - "id": "jaspionjader/Kosmos-EVAA-immersive-sof-v44-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4144 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3888 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v10-8B/f62fed77-e166-422d-b5ce-c50b7bccbf4c.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v10-8B/f62fed77-e166-422d-b5ce-c50b7bccbf4c.json deleted file mode 100644 index 5c80d82d6..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v10-8B/f62fed77-e166-422d-b5ce-c50b7bccbf4c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v10-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v10-8B", - "id": "jaspionjader/Kosmos-EVAA-v10-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v11-8B/7ffdabf3-0a8e-4316-b6bd-85b10a81db53.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v11-8B/7ffdabf3-0a8e-4316-b6bd-85b10a81db53.json deleted file mode 100644 index ce89b68ef..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v11-8B/7ffdabf3-0a8e-4316-b6bd-85b10a81db53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v11-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v11-8B", - "id": "jaspionjader/Kosmos-EVAA-v11-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4426 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5359 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3836 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v12-8B/2c93c987-b32d-4a02-8df4-949cc45b8eb2.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v12-8B/2c93c987-b32d-4a02-8df4-949cc45b8eb2.json deleted file mode 100644 index cc567f9c9..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v12-8B/2c93c987-b32d-4a02-8df4-949cc45b8eb2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v12-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v12-8B", - "id": "jaspionjader/Kosmos-EVAA-v12-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } 
- }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5349 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3836 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v2-8B/02e7c1d6-9db1-4de8-b13e-afd752b3669a.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v2-8B/02e7c1d6-9db1-4de8-b13e-afd752b3669a.json deleted file mode 100644 index 308d1927e..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v2-8B/02e7c1d6-9db1-4de8-b13e-afd752b3669a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v2-8B", - "id": "jaspionjader/Kosmos-EVAA-v2-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5341 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v3-8B/580a3045-338a-47b2-8ed7-54c993d5aa90.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v3-8B/580a3045-338a-47b2-8ed7-54c993d5aa90.json deleted file mode 100644 index f03f0b4c8..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v3-8B/580a3045-338a-47b2-8ed7-54c993d5aa90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v3-8B", - "id": "jaspionjader/Kosmos-EVAA-v3-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4411 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3821 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v4-8B/e71d3be5-ea9d-4426-aa58-5806b7541aa6.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v4-8B/e71d3be5-ea9d-4426-aa58-5806b7541aa6.json deleted file mode 100644 index c7bfce252..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v4-8B/e71d3be5-ea9d-4426-aa58-5806b7541aa6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v4-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v4-8B", - "id": "jaspionjader/Kosmos-EVAA-v4-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5337 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.1254 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v5-8B/1174683a-9488-4c6b-be6b-e5a96328a96f.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v5-8B/1174683a-9488-4c6b-be6b-e5a96328a96f.json deleted file mode 100644 index d9d42b842..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v5-8B/1174683a-9488-4c6b-be6b-e5a96328a96f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v5-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v5-8B", - "id": "jaspionjader/Kosmos-EVAA-v5-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5345 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3821 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v6-8B/3789b37f-daf0-4c21-82b8-309cbf00312e.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v6-8B/3789b37f-daf0-4c21-82b8-309cbf00312e.json deleted file mode 100644 index e333bd78c..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v6-8B/3789b37f-daf0-4c21-82b8-309cbf00312e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v6-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v6-8B", - "id": "jaspionjader/Kosmos-EVAA-v6-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3821 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v7-8B/8586cdc1-dd4e-4112-a59c-f6bc2766701b.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v7-8B/8586cdc1-dd4e-4112-a59c-f6bc2766701b.json deleted file mode 100644 index 7a7e506dc..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v7-8B/8586cdc1-dd4e-4112-a59c-f6bc2766701b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v7-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v7-8B", - "id": "jaspionjader/Kosmos-EVAA-v7-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5335 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3836 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v8-8B/946a7b16-dfa6-42ad-97c1-955bf8a40dae.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v8-8B/946a7b16-dfa6-42ad-97c1-955bf8a40dae.json deleted file mode 100644 index 5b504be25..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v8-8B/946a7b16-dfa6-42ad-97c1-955bf8a40dae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v8-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v8-8B", - "id": "jaspionjader/Kosmos-EVAA-v8-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5359 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3827 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-8B/d9a6cc31-57c4-4480-a019-25a34b31fcc8.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-8B/d9a6cc31-57c4-4480-a019-25a34b31fcc8.json deleted file mode 100644 index 8db1d146b..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-8B/d9a6cc31-57c4-4480-a019-25a34b31fcc8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v9-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v9-8B", - "id": "jaspionjader/Kosmos-EVAA-v9-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4369 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5361 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-TitanFusion-Mix-8B/279bd5fa-0ab1-411b-871b-bd9ff23853f6.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-TitanFusion-Mix-8B/279bd5fa-0ab1-411b-871b-bd9ff23853f6.json deleted file mode 100644 index 9e70660a3..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-TitanFusion-Mix-8B/279bd5fa-0ab1-411b-871b-bd9ff23853f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v9-TitanFusion-Mix-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v9-TitanFusion-Mix-8B", - "id": "jaspionjader/Kosmos-EVAA-v9-TitanFusion-Mix-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3836 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-8b/c26fae10-e65a-49ac-a2da-2dbf024fd10d.json 
b/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-8b/c26fae10-e65a-49ac-a2da-2dbf024fd10d.json deleted file mode 100644 index ab43cc04d..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-8b/c26fae10-e65a-49ac-a2da-2dbf024fd10d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-Elusive-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-Elusive-8b", - "id": "jaspionjader/Kosmos-Elusive-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4078 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-8B/6d37b2b4-630e-4471-b7a8-50f8a58902fe.json b/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-8B/6d37b2b4-630e-4471-b7a8-50f8a58902fe.json deleted file mode 100644 index 47b464069..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-8B/6d37b2b4-630e-4471-b7a8-50f8a58902fe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-Elusive-VENN-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-Elusive-VENN-8B", - "id": "jaspionjader/Kosmos-Elusive-VENN-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5356 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4157 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3797 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Asymmetric-8B/de687865-4297-4130-bcfe-0c5116c9b0d1.json b/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Asymmetric-8B/de687865-4297-4130-bcfe-0c5116c9b0d1.json deleted file mode 100644 index c8e3d1871..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Asymmetric-8B/de687865-4297-4130-bcfe-0c5116c9b0d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-Elusive-VENN-Asymmetric-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-Elusive-VENN-Asymmetric-8B", - "id": "jaspionjader/Kosmos-Elusive-VENN-Asymmetric-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5313 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Aurora_faustus-8B/ee1acad1-5dc4-4d8b-8aca-544af5dc2392.json b/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Aurora_faustus-8B/ee1acad1-5dc4-4d8b-8aca-544af5dc2392.json deleted file mode 100644 index c80a27e40..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Aurora_faustus-8B/ee1acad1-5dc4-4d8b-8aca-544af5dc2392.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-Elusive-VENN-Aurora_faustus-8B/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-Elusive-VENN-Aurora_faustus-8B", - "id": "jaspionjader/Kosmos-Elusive-VENN-Aurora_faustus-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4335 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5304 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3795 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-VENN-8B/52e3f1b1-5a1c-4cca-a36f-9f60284e1883.json b/data/hfopenllm_v2/jaspionjader/Kosmos-VENN-8B/52e3f1b1-5a1c-4cca-a36f-9f60284e1883.json deleted file mode 100644 index 4060c65ec..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-VENN-8B/52e3f1b1-5a1c-4cca-a36f-9f60284e1883.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-VENN-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Kosmos-VENN-8B", - "id": "jaspionjader/Kosmos-VENN-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3801 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-8B/2d54c67e-fad5-4a61-b3ae-0393f16dc1ba.json b/data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-8B/2d54c67e-fad5-4a61-b3ae-0393f16dc1ba.json deleted file mode 100644 index 331d553d5..000000000 --- a/data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-8B/2d54c67e-fad5-4a61-b3ae-0393f16dc1ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_PRP-Kosmos-EVAA-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRP-Kosmos-EVAA-8B", - "id": "jaspionjader/PRP-Kosmos-EVAA-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5237 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3766 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-light-8B/5120e433-f5c7-45fa-be56-566101556271.json b/data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-light-8B/5120e433-f5c7-45fa-be56-566101556271.json deleted file mode 100644 index a17b8d20f..000000000 --- a/data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-light-8B/5120e433-f5c7-45fa-be56-566101556271.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_PRP-Kosmos-EVAA-light-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRP-Kosmos-EVAA-light-8B", - "id": "jaspionjader/PRP-Kosmos-EVAA-light-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-8B/7f4b4668-c3a0-4575-957d-ba321d55f420.json b/data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-8B/7f4b4668-c3a0-4575-957d-ba321d55f420.json deleted file mode 100644 index c4d0d61b7..000000000 --- a/data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-8B/7f4b4668-c3a0-4575-957d-ba321d55f420.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_TSN-Kosmos-EVAA-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TSN-Kosmos-EVAA-8B", - "id": "jaspionjader/TSN-Kosmos-EVAA-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4903 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5347 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-v2-8B/9245b74d-4b9d-4158-a402-0c3742097eba.json b/data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-v2-8B/9245b74d-4b9d-4158-a402-0c3742097eba.json deleted file mode 100644 index 7eb53a06b..000000000 --- a/data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-v2-8B/9245b74d-4b9d-4158-a402-0c3742097eba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_TSN-Kosmos-EVAA-v2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TSN-Kosmos-EVAA-v2-8B", - "id": "jaspionjader/TSN-Kosmos-EVAA-v2-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bbb-1/29a5fcd3-9c22-424c-ab17-70cfe187aea1.json b/data/hfopenllm_v2/jaspionjader/bbb-1/29a5fcd3-9c22-424c-ab17-70cfe187aea1.json deleted file mode 100644 index 3ef1955b2..000000000 --- a/data/hfopenllm_v2/jaspionjader/bbb-1/29a5fcd3-9c22-424c-ab17-70cfe187aea1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bbb-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbb-1", - "id": "jaspionjader/bbb-1", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3897 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bbb-2/af71bfa0-1077-4c96-a4c1-0aa28dc789bf.json b/data/hfopenllm_v2/jaspionjader/bbb-2/af71bfa0-1077-4c96-a4c1-0aa28dc789bf.json deleted file mode 100644 index 6b4a860f0..000000000 --- a/data/hfopenllm_v2/jaspionjader/bbb-2/af71bfa0-1077-4c96-a4c1-0aa28dc789bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bbb-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbb-2", - "id": "jaspionjader/bbb-2", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5067 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3635 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bbb-3/258ebe6d-191d-4804-b5e1-5cd6ce93ba88.json b/data/hfopenllm_v2/jaspionjader/bbb-3/258ebe6d-191d-4804-b5e1-5cd6ce93ba88.json deleted file mode 100644 index 2f10bb3d8..000000000 --- a/data/hfopenllm_v2/jaspionjader/bbb-3/258ebe6d-191d-4804-b5e1-5cd6ce93ba88.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bbb-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbb-3", - "id": "jaspionjader/bbb-3", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4168 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5158 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1405 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4265 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3856 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bbb-4/4765f197-82ed-44b3-9a7c-7cbabc6ecd8e.json b/data/hfopenllm_v2/jaspionjader/bbb-4/4765f197-82ed-44b3-9a7c-7cbabc6ecd8e.json deleted file mode 100644 index 2e270c373..000000000 --- a/data/hfopenllm_v2/jaspionjader/bbb-4/4765f197-82ed-44b3-9a7c-7cbabc6ecd8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bbb-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbb-4", - "id": "jaspionjader/bbb-4", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5212 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3773 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bbb-5/a5d66f97-1f4b-43da-a83a-4a262e297fd9.json b/data/hfopenllm_v2/jaspionjader/bbb-5/a5d66f97-1f4b-43da-a83a-4a262e297fd9.json deleted file mode 100644 index e596a11bf..000000000 --- a/data/hfopenllm_v2/jaspionjader/bbb-5/a5d66f97-1f4b-43da-a83a-4a262e297fd9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bbb-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbb-5", - "id": "jaspionjader/bbb-5", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4703 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bbb-6/5d29cf73-65d6-4965-a504-4caf07108cc8.json b/data/hfopenllm_v2/jaspionjader/bbb-6/5d29cf73-65d6-4965-a504-4caf07108cc8.json deleted file mode 100644 index 33980d484..000000000 --- a/data/hfopenllm_v2/jaspionjader/bbb-6/5d29cf73-65d6-4965-a504-4caf07108cc8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bbb-6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbb-6", - "id": "jaspionjader/bbb-6", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5211 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bbb-7/15ec04ae-30d3-4ffb-9b0c-54ba63410e3d.json 
b/data/hfopenllm_v2/jaspionjader/bbb-7/15ec04ae-30d3-4ffb-9b0c-54ba63410e3d.json deleted file mode 100644 index 36795088a..000000000 --- a/data/hfopenllm_v2/jaspionjader/bbb-7/15ec04ae-30d3-4ffb-9b0c-54ba63410e3d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bbb-7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbb-7", - "id": "jaspionjader/bbb-7", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4828 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5211 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4038 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-1/2ed96c70-390b-44de-aa08-9883a2f33ff3.json b/data/hfopenllm_v2/jaspionjader/bh-1/2ed96c70-390b-44de-aa08-9883a2f33ff3.json deleted file mode 100644 index a17534661..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-1/2ed96c70-390b-44de-aa08-9883a2f33ff3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/jaspionjader_bh-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-1", - "id": "jaspionjader/bh-1", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3449 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-10/67c95889-8a67-40fd-99e2-62e767c16416.json b/data/hfopenllm_v2/jaspionjader/bh-10/67c95889-8a67-40fd-99e2-62e767c16416.json deleted file mode 100644 index bef06887f..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-10/67c95889-8a67-40fd-99e2-62e767c16416.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"bh-10", - "id": "jaspionjader/bh-10", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4618 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5856 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-11/a518f39d-e073-493d-9a4f-9af53fc71abf.json b/data/hfopenllm_v2/jaspionjader/bh-11/a518f39d-e073-493d-9a4f-9af53fc71abf.json deleted file mode 100644 index 517e4b745..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-11/a518f39d-e073-493d-9a4f-9af53fc71abf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-11", - "id": "jaspionjader/bh-11", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5851 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-12/24f0d9bc-d743-4f46-b5a6-e855e39a1daf.json b/data/hfopenllm_v2/jaspionjader/bh-12/24f0d9bc-d743-4f46-b5a6-e855e39a1daf.json deleted file mode 100644 index 1eb93e6a6..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-12/24f0d9bc-d743-4f46-b5a6-e855e39a1daf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-12/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-12", - "id": "jaspionjader/bh-12", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4734 - } - }, - { 
- "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5802 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-13/3d27f6d9-05a0-44bd-a225-6e6a0bf4a35b.json b/data/hfopenllm_v2/jaspionjader/bh-13/3d27f6d9-05a0-44bd-a225-6e6a0bf4a35b.json deleted file mode 100644 index 932f2640d..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-13/3d27f6d9-05a0-44bd-a225-6e6a0bf4a35b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-13/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-13", - "id": "jaspionjader/bh-13", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4698 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5778 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4159 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-15/ad28e7b8-69e6-4fb9-bec4-62c67fae6d58.json b/data/hfopenllm_v2/jaspionjader/bh-15/ad28e7b8-69e6-4fb9-bec4-62c67fae6d58.json deleted file mode 100644 index 87663e79d..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-15/ad28e7b8-69e6-4fb9-bec4-62c67fae6d58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-15/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-15", - "id": "jaspionjader/bh-15", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4745 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5819 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-16/0da639d4-181c-4ee1-808c-3de8003c2471.json b/data/hfopenllm_v2/jaspionjader/bh-16/0da639d4-181c-4ee1-808c-3de8003c2471.json deleted file mode 100644 index 5973a4f9d..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-16/0da639d4-181c-4ee1-808c-3de8003c2471.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-16", - "id": "jaspionjader/bh-16", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4731 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5783 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4159 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-17/480bd62c-bc67-4379-bce0-b28a5d6bdf4f.json b/data/hfopenllm_v2/jaspionjader/bh-17/480bd62c-bc67-4379-bce0-b28a5d6bdf4f.json deleted file mode 100644 index cd717325d..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-17/480bd62c-bc67-4379-bce0-b28a5d6bdf4f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-17/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-17", - "id": "jaspionjader/bh-17", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4722 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5776 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3757 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-18/dd94c18e-b2c3-4135-aa2d-5eb0248315d0.json b/data/hfopenllm_v2/jaspionjader/bh-18/dd94c18e-b2c3-4135-aa2d-5eb0248315d0.json deleted file mode 100644 index 0e961f484..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-18/dd94c18e-b2c3-4135-aa2d-5eb0248315d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-18/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-18", - "id": "jaspionjader/bh-18", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5824 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3757 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-19/a2ae2953-e341-49be-8469-32bd41d780d7.json b/data/hfopenllm_v2/jaspionjader/bh-19/a2ae2953-e341-49be-8469-32bd41d780d7.json deleted file mode 100644 index 4406baf52..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-19/a2ae2953-e341-49be-8469-32bd41d780d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-19/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-19", - "id": "jaspionjader/bh-19", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4584 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5766 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } 
- } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-2/23bdd694-f250-46dd-9b8b-526fda47bc9e.json b/data/hfopenllm_v2/jaspionjader/bh-2/23bdd694-f250-46dd-9b8b-526fda47bc9e.json deleted file mode 100644 index 504f5d126..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-2/23bdd694-f250-46dd-9b8b-526fda47bc9e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-2", - "id": "jaspionjader/bh-2", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4579 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3695 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-20/d600a69d-1952-4e30-abe8-1769ab63ac29.json b/data/hfopenllm_v2/jaspionjader/bh-20/d600a69d-1952-4e30-abe8-1769ab63ac29.json deleted file mode 100644 index 47f8d0aaa..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/bh-20/d600a69d-1952-4e30-abe8-1769ab63ac29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-20/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-20", - "id": "jaspionjader/bh-20", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3768 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-21/afc031d4-852e-4ead-9098-6ce30112b459.json b/data/hfopenllm_v2/jaspionjader/bh-21/afc031d4-852e-4ead-9098-6ce30112b459.json deleted file mode 100644 index 00f6374fa..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-21/afc031d4-852e-4ead-9098-6ce30112b459.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-21/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM 
v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-21", - "id": "jaspionjader/bh-21", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5738 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-22/cb33e29f-e5e1-4bf5-9e20-86d9c3486d2d.json b/data/hfopenllm_v2/jaspionjader/bh-22/cb33e29f-e5e1-4bf5-9e20-86d9c3486d2d.json deleted file mode 100644 index 1e2ca8954..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-22/cb33e29f-e5e1-4bf5-9e20-86d9c3486d2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-22/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-22", - "id": "jaspionjader/bh-22", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5793 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-23/a4b93124-1151-4f69-8a5e-6b916e8cf11f.json b/data/hfopenllm_v2/jaspionjader/bh-23/a4b93124-1151-4f69-8a5e-6b916e8cf11f.json deleted file mode 100644 index 532376bf1..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-23/a4b93124-1151-4f69-8a5e-6b916e8cf11f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-23/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-23", - "id": "jaspionjader/bh-23", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3796 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-24/efe11d8f-65e6-4ba6-8148-fdd43c9346be.json b/data/hfopenllm_v2/jaspionjader/bh-24/efe11d8f-65e6-4ba6-8148-fdd43c9346be.json deleted file mode 100644 index 22202dd5f..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-24/efe11d8f-65e6-4ba6-8148-fdd43c9346be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-24", - "id": "jaspionjader/bh-24", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4715 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5717 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-25/923da7be-2ec8-46b2-8187-fe08eb86d5a0.json b/data/hfopenllm_v2/jaspionjader/bh-25/923da7be-2ec8-46b2-8187-fe08eb86d5a0.json deleted file mode 100644 index ae523eebf..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-25/923da7be-2ec8-46b2-8187-fe08eb86d5a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-25/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-25", - "id": "jaspionjader/bh-25", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4752 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5706 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-26/1652b9fe-640a-48f9-b7a5-20ae28fb5985.json b/data/hfopenllm_v2/jaspionjader/bh-26/1652b9fe-640a-48f9-b7a5-20ae28fb5985.json deleted file mode 100644 index 4a003c489..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-26/1652b9fe-640a-48f9-b7a5-20ae28fb5985.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-26/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-26", - "id": "jaspionjader/bh-26", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4691 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5735 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, 
- { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3772 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-27/572463ed-f6b9-460d-9c38-0e0ee5327511.json b/data/hfopenllm_v2/jaspionjader/bh-27/572463ed-f6b9-460d-9c38-0e0ee5327511.json deleted file mode 100644 index 041f63301..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-27/572463ed-f6b9-460d-9c38-0e0ee5327511.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-27/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-27", - "id": "jaspionjader/bh-27", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4819 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5714 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-28/5f6bbbfd-16a8-4ea8-b9d9-b436a882700a.json b/data/hfopenllm_v2/jaspionjader/bh-28/5f6bbbfd-16a8-4ea8-b9d9-b436a882700a.json deleted file mode 100644 index 9d000a51b..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-28/5f6bbbfd-16a8-4ea8-b9d9-b436a882700a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-28/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-28", - "id": "jaspionjader/bh-28", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4785 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5703 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-29/32322361-f18d-480d-9475-cd11a45bc4bc.json b/data/hfopenllm_v2/jaspionjader/bh-29/32322361-f18d-480d-9475-cd11a45bc4bc.json deleted file mode 100644 index 015e4ce5c..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-29/32322361-f18d-480d-9475-cd11a45bc4bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-29/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-29", - "id": "jaspionjader/bh-29", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-3/f62d1aee-2d9e-466e-85e2-002fae5d2504.json b/data/hfopenllm_v2/jaspionjader/bh-3/f62d1aee-2d9e-466e-85e2-002fae5d2504.json deleted file mode 100644 index 7c0e9e2c4..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-3/f62d1aee-2d9e-466e-85e2-002fae5d2504.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-3", - "id": "jaspionjader/bh-3", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4664 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5891 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-30/af389bf1-da63-49a9-9e49-32613d8d05b8.json 
b/data/hfopenllm_v2/jaspionjader/bh-30/af389bf1-da63-49a9-9e49-32613d8d05b8.json deleted file mode 100644 index 8e6b682db..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-30/af389bf1-da63-49a9-9e49-32613d8d05b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-30/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-30", - "id": "jaspionjader/bh-30", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4666 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5706 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4144 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-31/ea13ae62-d050-4cc4-9cbe-99eedfc206e2.json b/data/hfopenllm_v2/jaspionjader/bh-31/ea13ae62-d050-4cc4-9cbe-99eedfc206e2.json deleted file mode 100644 index da3e5651c..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-31/ea13ae62-d050-4cc4-9cbe-99eedfc206e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/jaspionjader_bh-31/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-31", - "id": "jaspionjader/bh-31", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5665 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4104 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-32/1e697620-36a7-459c-b88c-405febb57c3a.json b/data/hfopenllm_v2/jaspionjader/bh-32/1e697620-36a7-459c-b88c-405febb57c3a.json deleted file mode 100644 index ab338397a..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-32/1e697620-36a7-459c-b88c-405febb57c3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"bh-32", - "id": "jaspionjader/bh-32", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5662 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4157 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-33/532723e8-a9b7-4f72-a015-c2bd9363b5d8.json b/data/hfopenllm_v2/jaspionjader/bh-33/532723e8-a9b7-4f72-a015-c2bd9363b5d8.json deleted file mode 100644 index cb0a494fb..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-33/532723e8-a9b7-4f72-a015-c2bd9363b5d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-33/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-33", - "id": "jaspionjader/bh-33", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5653 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4157 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3808 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-34/be096a57-7d81-4999-919a-ed8a243012b2.json b/data/hfopenllm_v2/jaspionjader/bh-34/be096a57-7d81-4999-919a-ed8a243012b2.json deleted file mode 100644 index e3c52f212..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-34/be096a57-7d81-4999-919a-ed8a243012b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-34/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-34", - "id": "jaspionjader/bh-34", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - 
{ - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5681 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-35/cadeb016-e158-4a49-921c-efe0e4eb0cb2.json b/data/hfopenllm_v2/jaspionjader/bh-35/cadeb016-e158-4a49-921c-efe0e4eb0cb2.json deleted file mode 100644 index 3cf6f3543..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-35/cadeb016-e158-4a49-921c-efe0e4eb0cb2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-35/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-35", - "id": "jaspionjader/bh-35", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.564 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4183 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-36/c606d7b9-3ea3-49d4-9ecc-9610ed4b4eac.json b/data/hfopenllm_v2/jaspionjader/bh-36/c606d7b9-3ea3-49d4-9ecc-9610ed4b4eac.json deleted file mode 100644 index c44481774..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-36/c606d7b9-3ea3-49d4-9ecc-9610ed4b4eac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-36/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-36", - "id": "jaspionjader/bh-36", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4666 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5664 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-37/04a5eed3-7eea-4d9f-acc6-5a96ec987e2b.json b/data/hfopenllm_v2/jaspionjader/bh-37/04a5eed3-7eea-4d9f-acc6-5a96ec987e2b.json deleted file mode 100644 index 00efc0c32..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-37/04a5eed3-7eea-4d9f-acc6-5a96ec987e2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-37/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-37", - "id": "jaspionjader/bh-37", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5625 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3828 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-38/a1c60d74-dabe-423d-9e40-3dd8112d7d8e.json b/data/hfopenllm_v2/jaspionjader/bh-38/a1c60d74-dabe-423d-9e40-3dd8112d7d8e.json deleted file mode 100644 index 8a76bd6cd..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-38/a1c60d74-dabe-423d-9e40-3dd8112d7d8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-38/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-38", - "id": "jaspionjader/bh-38", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4618 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5658 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4117 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3811 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-39/29c7bc9b-6833-497b-a553-2941026efea5.json b/data/hfopenllm_v2/jaspionjader/bh-39/29c7bc9b-6833-497b-a553-2941026efea5.json deleted file mode 100644 index 8f546e7ae..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-39/29c7bc9b-6833-497b-a553-2941026efea5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-39/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-39", - "id": "jaspionjader/bh-39", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4576 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5633 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1254 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-4/09a60955-978e-4136-bdde-d5459e37ad2c.json b/data/hfopenllm_v2/jaspionjader/bh-4/09a60955-978e-4136-bdde-d5459e37ad2c.json deleted file mode 100644 index 76f667723..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-4/09a60955-978e-4136-bdde-d5459e37ad2c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-4", - "id": "jaspionjader/bh-4", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5892 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3705 - } - } 
- ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-40/501744a2-070a-4378-9232-f7ccd9b2a67e.json b/data/hfopenllm_v2/jaspionjader/bh-40/501744a2-070a-4378-9232-f7ccd9b2a67e.json deleted file mode 100644 index e827aaca1..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-40/501744a2-070a-4378-9232-f7ccd9b2a67e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-40/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-40", - "id": "jaspionjader/bh-40", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5634 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3835 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-41/369efdc6-6529-477c-b5f0-d229c8102491.json b/data/hfopenllm_v2/jaspionjader/bh-41/369efdc6-6529-477c-b5f0-d229c8102491.json deleted file mode 100644 index 559005960..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/bh-41/369efdc6-6529-477c-b5f0-d229c8102491.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-41/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-41", - "id": "jaspionjader/bh-41", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5614 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1254 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4183 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-42/906645f3-2041-4380-8118-ac26b92297ba.json b/data/hfopenllm_v2/jaspionjader/bh-42/906645f3-2041-4380-8118-ac26b92297ba.json deleted file mode 100644 index 8e371ce85..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-42/906645f3-2041-4380-8118-ac26b92297ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-42/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM 
v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-42", - "id": "jaspionjader/bh-42", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.466 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5646 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-43/57fe8deb-02dc-43a8-8a92-14bdaf61dd67.json b/data/hfopenllm_v2/jaspionjader/bh-43/57fe8deb-02dc-43a8-8a92-14bdaf61dd67.json deleted file mode 100644 index 2479e939a..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-43/57fe8deb-02dc-43a8-8a92-14bdaf61dd67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-43/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-43", - "id": "jaspionjader/bh-43", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-44/95f2fa22-3da9-4876-ace3-50763f2b2453.json b/data/hfopenllm_v2/jaspionjader/bh-44/95f2fa22-3da9-4876-ace3-50763f2b2453.json deleted file mode 100644 index e04123e5c..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-44/95f2fa22-3da9-4876-ace3-50763f2b2453.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-44/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-44", - "id": "jaspionjader/bh-44", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4706 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-46/b2f9e38f-c2a1-4e5f-a7ce-4e33a05b503b.json b/data/hfopenllm_v2/jaspionjader/bh-46/b2f9e38f-c2a1-4e5f-a7ce-4e33a05b503b.json deleted file mode 100644 index b0fd30662..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-46/b2f9e38f-c2a1-4e5f-a7ce-4e33a05b503b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-46/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-46", - "id": "jaspionjader/bh-46", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5632 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3822 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-47/b3173a2a-8309-498d-961b-0167d5d5dea6.json b/data/hfopenllm_v2/jaspionjader/bh-47/b3173a2a-8309-498d-961b-0167d5d5dea6.json deleted file mode 100644 index 12bf1bc95..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-47/b3173a2a-8309-498d-961b-0167d5d5dea6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-47/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-47", - "id": "jaspionjader/bh-47", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5546 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3855 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-48/0d59dd75-c999-4a7e-919a-fd084202fc9c.json b/data/hfopenllm_v2/jaspionjader/bh-48/0d59dd75-c999-4a7e-919a-fd084202fc9c.json deleted file mode 100644 index 0b617599c..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-48/0d59dd75-c999-4a7e-919a-fd084202fc9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-48/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-48", - "id": "jaspionjader/bh-48", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5541 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1254 - } - }, 
- { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-49/639e91d9-ebbf-4ba2-bce3-6953e7c91e32.json b/data/hfopenllm_v2/jaspionjader/bh-49/639e91d9-ebbf-4ba2-bce3-6953e7c91e32.json deleted file mode 100644 index be5e9a125..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-49/639e91d9-ebbf-4ba2-bce3-6953e7c91e32.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-49/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-49", - "id": "jaspionjader/bh-49", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3808 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-5/56a5fb9b-a4b7-4290-9ec9-6864b3efaa82.json b/data/hfopenllm_v2/jaspionjader/bh-5/56a5fb9b-a4b7-4290-9ec9-6864b3efaa82.json deleted file mode 100644 index f712b9f05..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-5/56a5fb9b-a4b7-4290-9ec9-6864b3efaa82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-5", - "id": "jaspionjader/bh-5", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5882 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1057 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-50/d03fb481-be0b-4dfb-bb4d-54067e058e99.json b/data/hfopenllm_v2/jaspionjader/bh-50/d03fb481-be0b-4dfb-bb4d-54067e058e99.json deleted file mode 100644 index fab8832f6..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-50/d03fb481-be0b-4dfb-bb4d-54067e058e99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-50/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-50", - "id": "jaspionjader/bh-50", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-51/d8fc3475-83e9-4790-a472-72b442087562.json b/data/hfopenllm_v2/jaspionjader/bh-51/d8fc3475-83e9-4790-a472-72b442087562.json deleted file mode 100644 index 82a672fc1..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-51/d8fc3475-83e9-4790-a472-72b442087562.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-51/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-51", - "id": "jaspionjader/bh-51", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5557 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4168 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-52/57efd335-4873-4e01-bfc3-0d704b3d482a.json 
b/data/hfopenllm_v2/jaspionjader/bh-52/57efd335-4873-4e01-bfc3-0d704b3d482a.json deleted file mode 100644 index 5d9a2c2b6..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-52/57efd335-4873-4e01-bfc3-0d704b3d482a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-52/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-52", - "id": "jaspionjader/bh-52", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5444 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-53/25fdcc8a-0e7d-4148-8508-2631ea6deb05.json b/data/hfopenllm_v2/jaspionjader/bh-53/25fdcc8a-0e7d-4148-8508-2631ea6deb05.json deleted file mode 100644 index c656925be..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-53/25fdcc8a-0e7d-4148-8508-2631ea6deb05.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/jaspionjader_bh-53/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-53", - "id": "jaspionjader/bh-53", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5494 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3858 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-54/f5f63d06-7e51-4b91-8814-ecbda604fe6b.json b/data/hfopenllm_v2/jaspionjader/bh-54/f5f63d06-7e51-4b91-8814-ecbda604fe6b.json deleted file mode 100644 index b9fb7dee3..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-54/f5f63d06-7e51-4b91-8814-ecbda604fe6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-54/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"bh-54", - "id": "jaspionjader/bh-54", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4841 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-55/5326c33b-6b8a-472a-9058-a9e9fe83b599.json b/data/hfopenllm_v2/jaspionjader/bh-55/5326c33b-6b8a-472a-9058-a9e9fe83b599.json deleted file mode 100644 index f9f52d25d..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-55/5326c33b-6b8a-472a-9058-a9e9fe83b599.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-55/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-55", - "id": "jaspionjader/bh-55", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4709 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-56/28674053-e1b6-4f0a-a90e-5dd5082ec164.json b/data/hfopenllm_v2/jaspionjader/bh-56/28674053-e1b6-4f0a-a90e-5dd5082ec164.json deleted file mode 100644 index 4876a15a1..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-56/28674053-e1b6-4f0a-a90e-5dd5082ec164.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-56/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-56", - "id": "jaspionjader/bh-56", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5447 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4116 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-57/fd27bfa7-11b3-46d3-915c-373ddf5a9865.json b/data/hfopenllm_v2/jaspionjader/bh-57/fd27bfa7-11b3-46d3-915c-373ddf5a9865.json deleted file mode 100644 index f2fb7ade4..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-57/fd27bfa7-11b3-46d3-915c-373ddf5a9865.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-57/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-57", - "id": "jaspionjader/bh-57", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4405 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5425 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-58/91f190ba-39c8-47af-8351-73d1f382dd99.json b/data/hfopenllm_v2/jaspionjader/bh-58/91f190ba-39c8-47af-8351-73d1f382dd99.json deleted file mode 100644 index 3b78d4b75..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-58/91f190ba-39c8-47af-8351-73d1f382dd99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-58/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-58", - "id": "jaspionjader/bh-58", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5446 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4183 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-59/b637b55c-dd05-4060-bf33-e63e9de7fac9.json b/data/hfopenllm_v2/jaspionjader/bh-59/b637b55c-dd05-4060-bf33-e63e9de7fac9.json deleted file mode 100644 index 40582b054..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-59/b637b55c-dd05-4060-bf33-e63e9de7fac9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-59/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-59", - "id": "jaspionjader/bh-59", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4341 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3838 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-6/bcacef79-d7c0-46e7-9194-43541c2f01fc.json b/data/hfopenllm_v2/jaspionjader/bh-6/bcacef79-d7c0-46e7-9194-43541c2f01fc.json deleted file mode 100644 index 66016308e..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-6/bcacef79-d7c0-46e7-9194-43541c2f01fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-6", - "id": "jaspionjader/bh-6", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5891 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-60/77a358c7-59fa-4b22-a190-dfca86c5166b.json b/data/hfopenllm_v2/jaspionjader/bh-60/77a358c7-59fa-4b22-a190-dfca86c5166b.json deleted file mode 100644 index 60b6bf47a..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-60/77a358c7-59fa-4b22-a190-dfca86c5166b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-60/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-60", - "id": "jaspionjader/bh-60", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5369 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1579 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3689 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-61/ad4c8922-7079-4383-8f42-d3de6326a1e1.json b/data/hfopenllm_v2/jaspionjader/bh-61/ad4c8922-7079-4383-8f42-d3de6326a1e1.json deleted file mode 100644 index 1e6b5f4bd..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-61/ad4c8922-7079-4383-8f42-d3de6326a1e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-61/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-61", - "id": "jaspionjader/bh-61", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4247 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - } - ] -} \ 
No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-62/7f89eded-e5fc-4b3b-9afd-dcd71b7b44d5.json b/data/hfopenllm_v2/jaspionjader/bh-62/7f89eded-e5fc-4b3b-9afd-dcd71b7b44d5.json deleted file mode 100644 index ba9efa108..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-62/7f89eded-e5fc-4b3b-9afd-dcd71b7b44d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-62/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-62", - "id": "jaspionjader/bh-62", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5379 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1624 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3719 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-63/07cb94ab-0aea-4ce2-89b0-4378cb892c7e.json b/data/hfopenllm_v2/jaspionjader/bh-63/07cb94ab-0aea-4ce2-89b0-4378cb892c7e.json deleted file mode 100644 index 61c277886..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/bh-63/07cb94ab-0aea-4ce2-89b0-4378cb892c7e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-63/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-63", - "id": "jaspionjader/bh-63", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4308 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4917 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-64/5fb04756-c7bb-4772-b209-0d9a300bbf7d.json b/data/hfopenllm_v2/jaspionjader/bh-64/5fb04756-c7bb-4772-b209-0d9a300bbf7d.json deleted file mode 100644 index 4fa30fb6a..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-64/5fb04756-c7bb-4772-b209-0d9a300bbf7d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-64/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM 
v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-64", - "id": "jaspionjader/bh-64", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-7/0c02d1b6-2d31-4c54-b881-588cbfb0c686.json b/data/hfopenllm_v2/jaspionjader/bh-7/0c02d1b6-2d31-4c54-b881-588cbfb0c686.json deleted file mode 100644 index 898316818..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-7/0c02d1b6-2d31-4c54-b881-588cbfb0c686.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-7", - "id": "jaspionjader/bh-7", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5861 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3715 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-8/a32e4d22-8096-4537-a68a-98ff9171ac8c.json b/data/hfopenllm_v2/jaspionjader/bh-8/a32e4d22-8096-4537-a68a-98ff9171ac8c.json deleted file mode 100644 index 2f29f8cdb..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-8/a32e4d22-8096-4537-a68a-98ff9171ac8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-8", - "id": "jaspionjader/bh-8", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4597 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4265 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-9/4e45b666-fa7e-4a38-8b6b-65846876c8d9.json b/data/hfopenllm_v2/jaspionjader/bh-9/4e45b666-fa7e-4a38-8b6b-65846876c8d9.json deleted file mode 100644 index 038a2b90e..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-9/4e45b666-fa7e-4a38-8b6b-65846876c8d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-9", - "id": "jaspionjader/bh-9", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.585 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3703 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/dp-6-8b/d9cb1d13-2af5-4385-aa78-5c053e00e6c6.json b/data/hfopenllm_v2/jaspionjader/dp-6-8b/d9cb1d13-2af5-4385-aa78-5c053e00e6c6.json deleted file mode 100644 index d490d1352..000000000 --- a/data/hfopenllm_v2/jaspionjader/dp-6-8b/d9cb1d13-2af5-4385-aa78-5c053e00e6c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_dp-6-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dp-6-8b", - "id": "jaspionjader/dp-6-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4806 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3897 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/dp-7-8b/6afaec07-ebb8-4f3f-af48-c679f38f4917.json b/data/hfopenllm_v2/jaspionjader/dp-7-8b/6afaec07-ebb8-4f3f-af48-c679f38f4917.json deleted file mode 100644 index 2aba7ace1..000000000 --- a/data/hfopenllm_v2/jaspionjader/dp-7-8b/6afaec07-ebb8-4f3f-af48-c679f38f4917.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_dp-7-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dp-7-8b", - "id": "jaspionjader/dp-7-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4498 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5291 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { 
- "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3934 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/ek-6/bf8370c9-baed-4034-ac38-c6f796baca15.json b/data/hfopenllm_v2/jaspionjader/ek-6/bf8370c9-baed-4034-ac38-c6f796baca15.json deleted file mode 100644 index 3351de25f..000000000 --- a/data/hfopenllm_v2/jaspionjader/ek-6/bf8370c9-baed-4034-ac38-c6f796baca15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_ek-6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ek-6", - "id": "jaspionjader/ek-6", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4642 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4144 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3861 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/ek-7/d397c078-6fe3-44a8-859c-a0f7c551dc3a.json b/data/hfopenllm_v2/jaspionjader/ek-7/d397c078-6fe3-44a8-859c-a0f7c551dc3a.json deleted file mode 100644 index 7e3d311e0..000000000 --- a/data/hfopenllm_v2/jaspionjader/ek-7/d397c078-6fe3-44a8-859c-a0f7c551dc3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_ek-7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ek-7", - "id": "jaspionjader/ek-7", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4767 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3887 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-1-8b/ed61cd6a-bbf0-45f2-9536-a7a262d5d6fb.json b/data/hfopenllm_v2/jaspionjader/f-1-8b/ed61cd6a-bbf0-45f2-9536-a7a262d5d6fb.json deleted file mode 100644 index 1d3bd64ed..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-1-8b/ed61cd6a-bbf0-45f2-9536-a7a262d5d6fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-1-8b", - "id": "jaspionjader/f-1-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3907 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-2-8b/6be795f4-0784-44bf-8926-e3060ec37dcf.json b/data/hfopenllm_v2/jaspionjader/f-2-8b/6be795f4-0784-44bf-8926-e3060ec37dcf.json deleted file mode 100644 index cdc2207e8..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-2-8b/6be795f4-0784-44bf-8926-e3060ec37dcf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-2-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-2-8b", - "id": "jaspionjader/f-2-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4824 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5294 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3962 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-3-8b/d4d808f5-3b79-43b5-8076-d3f785083789.json 
b/data/hfopenllm_v2/jaspionjader/f-3-8b/d4d808f5-3b79-43b5-8076-d3f785083789.json deleted file mode 100644 index 9a8bac39d..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-3-8b/d4d808f5-3b79-43b5-8076-d3f785083789.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-3-8b", - "id": "jaspionjader/f-3-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4803 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3954 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-4-8b/370f5923-91d7-40d2-bd06-bf2b657b8ef2.json b/data/hfopenllm_v2/jaspionjader/f-4-8b/370f5923-91d7-40d2-bd06-bf2b657b8ef2.json deleted file mode 100644 index 85c9a3a2a..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-4-8b/370f5923-91d7-40d2-bd06-bf2b657b8ef2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/jaspionjader_f-4-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-4-8b", - "id": "jaspionjader/f-4-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4797 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5289 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3956 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-5-8b/5334e5e4-d243-4c20-912c-d0ded74d6ea5.json b/data/hfopenllm_v2/jaspionjader/f-5-8b/5334e5e4-d243-4c20-912c-d0ded74d6ea5.json deleted file mode 100644 index dff79aa05..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-5-8b/5334e5e4-d243-4c20-912c-d0ded74d6ea5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-5-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "f-5-8b", - "id": "jaspionjader/f-5-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5313 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3949 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-6-8b/7306f2cd-4fd2-4dd4-b06b-8c9aa558388b.json b/data/hfopenllm_v2/jaspionjader/f-6-8b/7306f2cd-4fd2-4dd4-b06b-8c9aa558388b.json deleted file mode 100644 index 64a06c461..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-6-8b/7306f2cd-4fd2-4dd4-b06b-8c9aa558388b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-6-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-6-8b", - "id": "jaspionjader/f-6-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4846 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-7-8b/68cc19eb-423b-4d6d-a3bf-eac6f666bc4b.json b/data/hfopenllm_v2/jaspionjader/f-7-8b/68cc19eb-423b-4d6d-a3bf-eac6f666bc4b.json deleted file mode 100644 index e277f31b0..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-7-8b/68cc19eb-423b-4d6d-a3bf-eac6f666bc4b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-7-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-7-8b", - "id": "jaspionjader/f-7-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3936 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-8-8b/59aa26a8-93b3-43fc-8c38-ef67cd8efd80.json b/data/hfopenllm_v2/jaspionjader/f-8-8b/59aa26a8-93b3-43fc-8c38-ef67cd8efd80.json deleted file mode 100644 index 5cbc566d2..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-8-8b/59aa26a8-93b3-43fc-8c38-ef67cd8efd80.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-8-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-8-8b", - "id": "jaspionjader/f-8-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4739 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5259 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-9-8b/220cd306-0613-4c8f-9848-4af812a1d37f.json b/data/hfopenllm_v2/jaspionjader/f-9-8b/220cd306-0613-4c8f-9848-4af812a1d37f.json deleted file mode 100644 index e31f7fa11..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-9-8b/220cd306-0613-4c8f-9848-4af812a1d37f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-9-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-9-8b", - "id": "jaspionjader/f-9-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4602 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3944 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/fct-14-8b/39a6a40c-3fa0-41ba-9d13-da9381263d4a.json b/data/hfopenllm_v2/jaspionjader/fct-14-8b/39a6a40c-3fa0-41ba-9d13-da9381263d4a.json deleted file mode 100644 index 4829d4d11..000000000 --- a/data/hfopenllm_v2/jaspionjader/fct-14-8b/39a6a40c-3fa0-41ba-9d13-da9381263d4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_fct-14-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fct-14-8b", - "id": "jaspionjader/fct-14-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5206 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", 
- "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/fct-9-8b/4d037b71-5d03-41a1-bf23-c0aea0cdcbbb.json b/data/hfopenllm_v2/jaspionjader/fct-9-8b/4d037b71-5d03-41a1-bf23-c0aea0cdcbbb.json deleted file mode 100644 index 6ed9866d8..000000000 --- a/data/hfopenllm_v2/jaspionjader/fct-9-8b/4d037b71-5d03-41a1-bf23-c0aea0cdcbbb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_fct-9-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fct-9-8b", - "id": "jaspionjader/fct-9-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5205 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4291 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3932 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/fr-1-8b/16baf620-7dcc-49f3-a787-b431e11ad4f6.json b/data/hfopenllm_v2/jaspionjader/fr-1-8b/16baf620-7dcc-49f3-a787-b431e11ad4f6.json deleted file mode 100644 index e4d8bb492..000000000 --- a/data/hfopenllm_v2/jaspionjader/fr-1-8b/16baf620-7dcc-49f3-a787-b431e11ad4f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_fr-1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fr-1-8b", - "id": "jaspionjader/fr-1-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/fr-10-8b/4745add2-7bcb-4c05-8b12-6bd30856890b.json b/data/hfopenllm_v2/jaspionjader/fr-10-8b/4745add2-7bcb-4c05-8b12-6bd30856890b.json deleted file mode 100644 index 9d8f40937..000000000 --- a/data/hfopenllm_v2/jaspionjader/fr-10-8b/4745add2-7bcb-4c05-8b12-6bd30856890b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_fr-10-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fr-10-8b", - "id": "jaspionjader/fr-10-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4402 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3863 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/fr-3-8b/f68b122d-4dec-4d5c-ac22-198da3d3e96b.json b/data/hfopenllm_v2/jaspionjader/fr-3-8b/f68b122d-4dec-4d5c-ac22-198da3d3e96b.json deleted file mode 100644 index 4ae5f5614..000000000 --- a/data/hfopenllm_v2/jaspionjader/fr-3-8b/f68b122d-4dec-4d5c-ac22-198da3d3e96b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_fr-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fr-3-8b", - "id": "jaspionjader/fr-3-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5255 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3863 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-8B/2e20f780-ceab-4d1d-a1ab-35f4f0ac44aa.json 
b/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-8B/2e20f780-ceab-4d1d-a1ab-35f4f0ac44aa.json deleted file mode 100644 index 84a9fa596..000000000 --- a/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-8B/2e20f780-ceab-4d1d-a1ab-35f4f0ac44aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_gamma-Kosmos-EVAA-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gamma-Kosmos-EVAA-8B", - "id": "jaspionjader/gamma-Kosmos-EVAA-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5253 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v2-8B/f21bcd75-fc9f-4266-8976-3227b18b6b32.json b/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v2-8B/f21bcd75-fc9f-4266-8976-3227b18b6b32.json deleted file mode 100644 index fdf1ccc99..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v2-8B/f21bcd75-fc9f-4266-8976-3227b18b6b32.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_gamma-Kosmos-EVAA-v2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gamma-Kosmos-EVAA-v2-8B", - "id": "jaspionjader/gamma-Kosmos-EVAA-v2-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1057 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4344 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3756 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v3-8B/7c1a81ec-1cb7-4858-8f1f-23b3ee49b73f.json b/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v3-8B/7c1a81ec-1cb7-4858-8f1f-23b3ee49b73f.json deleted file mode 100644 index 9aa3757ce..000000000 --- a/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v3-8B/7c1a81ec-1cb7-4858-8f1f-23b3ee49b73f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/jaspionjader_gamma-Kosmos-EVAA-v3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gamma-Kosmos-EVAA-v3-8B", - "id": "jaspionjader/gamma-Kosmos-EVAA-v3-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4333 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4263 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3898 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/knf-2-8b/1cbfd1ad-237d-4cd3-8b5d-3135c194fcc0.json b/data/hfopenllm_v2/jaspionjader/knf-2-8b/1cbfd1ad-237d-4cd3-8b5d-3135c194fcc0.json deleted file mode 100644 index f0dcdddf2..000000000 --- a/data/hfopenllm_v2/jaspionjader/knf-2-8b/1cbfd1ad-237d-4cd3-8b5d-3135c194fcc0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_knf-2-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "knf-2-8b", - "id": "jaspionjader/knf-2-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/knfp-2-8b/ef5c1813-a74d-4b3d-9911-c27a46c1c84e.json b/data/hfopenllm_v2/jaspionjader/knfp-2-8b/ef5c1813-a74d-4b3d-9911-c27a46c1c84e.json deleted file mode 100644 index 2ee41f38d..000000000 --- a/data/hfopenllm_v2/jaspionjader/knfp-2-8b/ef5c1813-a74d-4b3d-9911-c27a46c1c84e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_knfp-2-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "knfp-2-8b", - "id": "jaspionjader/knfp-2-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5327 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1427 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3726 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/knfp-3-8b/df50857d-c90e-4ec8-a9b6-96a6d2f894b1.json b/data/hfopenllm_v2/jaspionjader/knfp-3-8b/df50857d-c90e-4ec8-a9b6-96a6d2f894b1.json deleted file mode 100644 index 3ed3a2d85..000000000 --- a/data/hfopenllm_v2/jaspionjader/knfp-3-8b/df50857d-c90e-4ec8-a9b6-96a6d2f894b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_knfp-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "knfp-3-8b", - "id": "jaspionjader/knfp-3-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4946 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/kstc-1-8b/774d54fb-a445-4ed9-b79a-9c1346537e98.json b/data/hfopenllm_v2/jaspionjader/kstc-1-8b/774d54fb-a445-4ed9-b79a-9c1346537e98.json deleted file mode 100644 index 3c1fc3df6..000000000 --- a/data/hfopenllm_v2/jaspionjader/kstc-1-8b/774d54fb-a445-4ed9-b79a-9c1346537e98.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_kstc-1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "kstc-1-8b", - "id": "jaspionjader/kstc-1-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4643 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3892 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/kstc-11-8b/420b8be3-3560-48e8-8ab3-bb55338a9069.json b/data/hfopenllm_v2/jaspionjader/kstc-11-8b/420b8be3-3560-48e8-8ab3-bb55338a9069.json deleted file mode 100644 index cbbf1f3a0..000000000 --- a/data/hfopenllm_v2/jaspionjader/kstc-11-8b/420b8be3-3560-48e8-8ab3-bb55338a9069.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_kstc-11-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "kstc-11-8b", - "id": "jaspionjader/kstc-11-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/kstc-4-8b/c118b75c-597f-48a7-a4eb-675af72c9930.json b/data/hfopenllm_v2/jaspionjader/kstc-4-8b/c118b75c-597f-48a7-a4eb-675af72c9930.json deleted file mode 100644 index 3b5882807..000000000 --- a/data/hfopenllm_v2/jaspionjader/kstc-4-8b/c118b75c-597f-48a7-a4eb-675af72c9930.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_kstc-4-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "kstc-4-8b", - "id": "jaspionjader/kstc-4-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5216 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/kstc-5-8b/e75534d3-b994-4e88-9274-7b62f61916cf.json b/data/hfopenllm_v2/jaspionjader/kstc-5-8b/e75534d3-b994-4e88-9274-7b62f61916cf.json deleted file mode 100644 index 4dec5cab3..000000000 --- a/data/hfopenllm_v2/jaspionjader/kstc-5-8b/e75534d3-b994-4e88-9274-7b62f61916cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_kstc-5-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "kstc-5-8b", - "id": "jaspionjader/kstc-5-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5211 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3892 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/kstc-6-8b/770a1ff1-057f-49a7-9402-c6dd881ac03d.json b/data/hfopenllm_v2/jaspionjader/kstc-6-8b/770a1ff1-057f-49a7-9402-c6dd881ac03d.json deleted file mode 100644 index 2000e4ffd..000000000 --- a/data/hfopenllm_v2/jaspionjader/kstc-6-8b/770a1ff1-057f-49a7-9402-c6dd881ac03d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_kstc-6-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "kstc-6-8b", - "id": "jaspionjader/kstc-6-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4944 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5231 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/kstc-8-8b/6cc9790d-9b02-437e-8ac7-be4152f5b17d.json b/data/hfopenllm_v2/jaspionjader/kstc-8-8b/6cc9790d-9b02-437e-8ac7-be4152f5b17d.json deleted file mode 100644 index 3637c9325..000000000 --- a/data/hfopenllm_v2/jaspionjader/kstc-8-8b/6cc9790d-9b02-437e-8ac7-be4152f5b17d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_kstc-8-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "kstc-8-8b", - "id": "jaspionjader/kstc-8-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.491 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5239 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3889 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/kstc-9-8b/264f5b42-a3ac-4af1-8145-c5763b8e7fa6.json b/data/hfopenllm_v2/jaspionjader/kstc-9-8b/264f5b42-a3ac-4af1-8145-c5763b8e7fa6.json deleted file mode 100644 index 23d708f9b..000000000 --- a/data/hfopenllm_v2/jaspionjader/kstc-9-8b/264f5b42-a3ac-4af1-8145-c5763b8e7fa6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_kstc-9-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "kstc-9-8b", - "id": "jaspionjader/kstc-9-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4861 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5238 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.3872 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-10/549db368-437a-4982-ba5b-5c4d7bf203ae.json b/data/hfopenllm_v2/jaspionjader/slu-10/549db368-437a-4982-ba5b-5c4d7bf203ae.json deleted file mode 100644 index bde42f6e8..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-10/549db368-437a-4982-ba5b-5c4d7bf203ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-10", - "id": "jaspionjader/slu-10", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-11/0d098a19-7e8f-4a52-8466-729be91388d8.json b/data/hfopenllm_v2/jaspionjader/slu-11/0d098a19-7e8f-4a52-8466-729be91388d8.json deleted file mode 100644 index 00f33a8c5..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/slu-11/0d098a19-7e8f-4a52-8466-729be91388d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-11", - "id": "jaspionjader/slu-11", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3919 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-13/83335f65-25a4-4bec-a901-587567ed0e99.json b/data/hfopenllm_v2/jaspionjader/slu-13/83335f65-25a4-4bec-a901-587567ed0e99.json deleted file mode 100644 index 7dba81194..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-13/83335f65-25a4-4bec-a901-587567ed0e99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-13/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF 
Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-13", - "id": "jaspionjader/slu-13", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-14/02fb24c3-927f-4c21-bd47-b883521162a3.json b/data/hfopenllm_v2/jaspionjader/slu-14/02fb24c3-927f-4c21-bd47-b883521162a3.json deleted file mode 100644 index ff8e6a9e1..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-14/02fb24c3-927f-4c21-bd47-b883521162a3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-14/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-14", - "id": "jaspionjader/slu-14", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5089 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3627 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-17/2a6507c7-44c1-4416-9ff1-36abd6af3b73.json b/data/hfopenllm_v2/jaspionjader/slu-17/2a6507c7-44c1-4416-9ff1-36abd6af3b73.json deleted file mode 100644 index 5214d57f6..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-17/2a6507c7-44c1-4416-9ff1-36abd6af3b73.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-17/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-17", - "id": "jaspionjader/slu-17", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-2/327a146a-8cfd-4480-8342-46afde530677.json b/data/hfopenllm_v2/jaspionjader/slu-2/327a146a-8cfd-4480-8342-46afde530677.json deleted file mode 100644 index 746324375..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-2/327a146a-8cfd-4480-8342-46afde530677.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-2", - "id": "jaspionjader/slu-2", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3506 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-20/0700fb7a-e722-432f-a64d-c040bba4deee.json b/data/hfopenllm_v2/jaspionjader/slu-20/0700fb7a-e722-432f-a64d-c040bba4deee.json deleted file mode 100644 index b57d6ab6a..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-20/0700fb7a-e722-432f-a64d-c040bba4deee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-20/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-20", - "id": "jaspionjader/slu-20", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5061 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3933 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3665 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-22/131d3a7e-43dd-4189-8466-6562703b3bdd.json b/data/hfopenllm_v2/jaspionjader/slu-22/131d3a7e-43dd-4189-8466-6562703b3bdd.json deleted file mode 100644 index 3b02267ec..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-22/131d3a7e-43dd-4189-8466-6562703b3bdd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-22/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-22", - "id": "jaspionjader/slu-22", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-23/8f6d7008-b8de-4a76-94aa-bbecc93ef3f7.json b/data/hfopenllm_v2/jaspionjader/slu-23/8f6d7008-b8de-4a76-94aa-bbecc93ef3f7.json deleted file mode 100644 index 12d59c50d..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-23/8f6d7008-b8de-4a76-94aa-bbecc93ef3f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-23/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-23", - "id": "jaspionjader/slu-23", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4478 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5132 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-25/aadb0ce5-a1aa-4b0d-bec4-8bb0e8e54a1d.json b/data/hfopenllm_v2/jaspionjader/slu-25/aadb0ce5-a1aa-4b0d-bec4-8bb0e8e54a1d.json deleted file mode 100644 index f8a9592be..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-25/aadb0ce5-a1aa-4b0d-bec4-8bb0e8e54a1d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-25/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-25", - "id": "jaspionjader/slu-25", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5095 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-29/a73250f1-399a-4afa-bf83-4036dce78ef3.json b/data/hfopenllm_v2/jaspionjader/slu-29/a73250f1-399a-4afa-bf83-4036dce78ef3.json deleted file mode 100644 index 56321b0e4..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-29/a73250f1-399a-4afa-bf83-4036dce78ef3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-29/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-29", - "id": "jaspionjader/slu-29", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4431 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3933 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-32/f68bf680-9626-4952-b95e-12a18fd60820.json b/data/hfopenllm_v2/jaspionjader/slu-32/f68bf680-9626-4952-b95e-12a18fd60820.json deleted file mode 100644 index 3e62ecf10..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-32/f68bf680-9626-4952-b95e-12a18fd60820.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-32", - "id": "jaspionjader/slu-32", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3766 - } - } - ] -} \ No newline at end of file 
diff --git a/data/hfopenllm_v2/jaspionjader/slu-33/d6a78a5c-4a2e-4370-88f2-d8627a94f1ea.json b/data/hfopenllm_v2/jaspionjader/slu-33/d6a78a5c-4a2e-4370-88f2-d8627a94f1ea.json deleted file mode 100644 index c25031e6e..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-33/d6a78a5c-4a2e-4370-88f2-d8627a94f1ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-33/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-33", - "id": "jaspionjader/slu-33", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5081 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-34/7b5eab2e-fba3-47d5-9839-02249c2568c5.json b/data/hfopenllm_v2/jaspionjader/slu-34/7b5eab2e-fba3-47d5-9839-02249c2568c5.json deleted file mode 100644 index 53e4bfd25..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/slu-34/7b5eab2e-fba3-47d5-9839-02249c2568c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-34/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-34", - "id": "jaspionjader/slu-34", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5077 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-35/2acee2c3-4322-4152-8151-c1d571475b7c.json b/data/hfopenllm_v2/jaspionjader/slu-35/2acee2c3-4322-4152-8151-c1d571475b7c.json deleted file mode 100644 index 3720fab6c..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-35/2acee2c3-4322-4152-8151-c1d571475b7c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-35/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF 
Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-35", - "id": "jaspionjader/slu-35", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5103 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-36/67ffb2de-0410-44a2-aad7-4a32e2c49c7d.json b/data/hfopenllm_v2/jaspionjader/slu-36/67ffb2de-0410-44a2-aad7-4a32e2c49c7d.json deleted file mode 100644 index 54b88f06a..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-36/67ffb2de-0410-44a2-aad7-4a32e2c49c7d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-36/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-36", - "id": "jaspionjader/slu-36", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5087 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3933 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-37/2923aeb3-982f-400d-9588-707583c75a1d.json b/data/hfopenllm_v2/jaspionjader/slu-37/2923aeb3-982f-400d-9588-707583c75a1d.json deleted file mode 100644 index c1501c97b..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-37/2923aeb3-982f-400d-9588-707583c75a1d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-37/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-37", - "id": "jaspionjader/slu-37", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4534 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3695 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-6/b6a622da-5ce8-4ea5-a82a-f3a2a299ddf2.json b/data/hfopenllm_v2/jaspionjader/slu-6/b6a622da-5ce8-4ea5-a82a-f3a2a299ddf2.json deleted file mode 100644 index bdbe422b6..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-6/b6a622da-5ce8-4ea5-a82a-f3a2a299ddf2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-6", - "id": "jaspionjader/slu-6", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4117 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5099 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4066 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3611 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-mix-1/7b06ac17-bfc6-43d5-99e6-d2b7a31290fb.json b/data/hfopenllm_v2/jaspionjader/slu-mix-1/7b06ac17-bfc6-43d5-99e6-d2b7a31290fb.json deleted file mode 100644 index d2cd73f98..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-mix-1/7b06ac17-bfc6-43d5-99e6-d2b7a31290fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-mix-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-mix-1", - "id": "jaspionjader/slu-mix-1", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/sof-1/fd481b93-55b2-4831-9be9-1b1b2886fda3.json b/data/hfopenllm_v2/jaspionjader/sof-1/fd481b93-55b2-4831-9be9-1b1b2886fda3.json deleted file mode 100644 index 23e9adbfe..000000000 --- a/data/hfopenllm_v2/jaspionjader/sof-1/fd481b93-55b2-4831-9be9-1b1b2886fda3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_sof-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sof-1", - "id": "jaspionjader/sof-1", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4314 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.501 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4082 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/sof-10/f159748f-234e-4962-b582-cd5805448f33.json b/data/hfopenllm_v2/jaspionjader/sof-10/f159748f-234e-4962-b582-cd5805448f33.json deleted file mode 100644 index a665ef820..000000000 --- a/data/hfopenllm_v2/jaspionjader/sof-10/f159748f-234e-4962-b582-cd5805448f33.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_sof-10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sof-10", - "id": "jaspionjader/sof-10", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4648 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5197 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/sof-3/044d53dd-d134-4959-a70c-46f11cc0b300.json b/data/hfopenllm_v2/jaspionjader/sof-3/044d53dd-d134-4959-a70c-46f11cc0b300.json deleted file mode 100644 index 02de1aae3..000000000 --- a/data/hfopenllm_v2/jaspionjader/sof-3/044d53dd-d134-4959-a70c-46f11cc0b300.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_sof-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sof-3", - "id": "jaspionjader/sof-3", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5206 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/sof-6/f05501fd-7c06-46d5-bc20-a9d0cc5c2e0f.json b/data/hfopenllm_v2/jaspionjader/sof-6/f05501fd-7c06-46d5-bc20-a9d0cc5c2e0f.json deleted file mode 100644 index f3a8226cf..000000000 --- a/data/hfopenllm_v2/jaspionjader/sof-6/f05501fd-7c06-46d5-bc20-a9d0cc5c2e0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_sof-6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sof-6", - "id": "jaspionjader/sof-6", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-10/5c44a2f2-23e3-4c9f-9b7c-9012ca8b15e9.json b/data/hfopenllm_v2/jaspionjader/test-10/5c44a2f2-23e3-4c9f-9b7c-9012ca8b15e9.json deleted file mode 100644 index 6d72f6368..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-10/5c44a2f2-23e3-4c9f-9b7c-9012ca8b15e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-10", - "id": "jaspionjader/test-10", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4578 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3936 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/jaspionjader/test-11/80e5134b-0733-41cc-8b4f-ef32fbe57066.json b/data/hfopenllm_v2/jaspionjader/test-11/80e5134b-0733-41cc-8b4f-ef32fbe57066.json deleted file mode 100644 index 91ee0a934..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-11/80e5134b-0733-41cc-8b4f-ef32fbe57066.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-11", - "id": "jaspionjader/test-11", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4541 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.535 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-12/61123e41-7b2a-40da-9f7f-b830c27d7f12.json b/data/hfopenllm_v2/jaspionjader/test-12/61123e41-7b2a-40da-9f7f-b830c27d7f12.json deleted file mode 100644 index f704e5a09..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/test-12/61123e41-7b2a-40da-9f7f-b830c27d7f12.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-12/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-12", - "id": "jaspionjader/test-12", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5347 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-13/b93c31d7-54c3-47b9-a267-3f8fdb796805.json b/data/hfopenllm_v2/jaspionjader/test-13/b93c31d7-54c3-47b9-a267-3f8fdb796805.json deleted file mode 100644 index d5aea42d0..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-13/b93c31d7-54c3-47b9-a267-3f8fdb796805.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-13/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-13", - "id": "jaspionjader/test-13", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1057 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-14/b3eaa4c5-7abc-4e2d-9c11-c70ecb8a843b.json b/data/hfopenllm_v2/jaspionjader/test-14/b3eaa4c5-7abc-4e2d-9c11-c70ecb8a843b.json deleted file mode 100644 index d1c730295..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-14/b3eaa4c5-7abc-4e2d-9c11-c70ecb8a843b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-14/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-14", - "id": "jaspionjader/test-14", - "developer": "jaspionjader", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4444 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5323 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-15/3b06f75e-3d22-4428-8d4f-2e704b96961e.json b/data/hfopenllm_v2/jaspionjader/test-15/3b06f75e-3d22-4428-8d4f-2e704b96961e.json deleted file mode 100644 index 1f7a65dff..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-15/3b06f75e-3d22-4428-8d4f-2e704b96961e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-15/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-15", - "id": "jaspionjader/test-15", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4365 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5328 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-16/dfda4aab-f8d4-49ee-b141-78539b69007c.json b/data/hfopenllm_v2/jaspionjader/test-16/dfda4aab-f8d4-49ee-b141-78539b69007c.json deleted file mode 100644 index e35aa96fb..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-16/dfda4aab-f8d4-49ee-b141-78539b69007c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-16", - "id": "jaspionjader/test-16", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.533 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-17/690f3c19-c148-458d-b4c5-87761d72b851.json b/data/hfopenllm_v2/jaspionjader/test-17/690f3c19-c148-458d-b4c5-87761d72b851.json deleted file mode 100644 index ecdd22a51..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-17/690f3c19-c148-458d-b4c5-87761d72b851.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-17/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-17", - "id": "jaspionjader/test-17", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4267 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5329 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3929 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-18/b6a18246-776d-463f-80d5-140df74e9704.json b/data/hfopenllm_v2/jaspionjader/test-18/b6a18246-776d-463f-80d5-140df74e9704.json deleted file mode 100644 index ae471a895..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-18/b6a18246-776d-463f-80d5-140df74e9704.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-18/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-18", - "id": "jaspionjader/test-18", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-19/9831abdc-ad08-48c0-8384-86240e7350b5.json b/data/hfopenllm_v2/jaspionjader/test-19/9831abdc-ad08-48c0-8384-86240e7350b5.json deleted file mode 100644 index acc7c5b32..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-19/9831abdc-ad08-48c0-8384-86240e7350b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-19/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-19", - "id": "jaspionjader/test-19", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3929 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-20/96a572e5-4751-46ce-9202-deb223ef4dfe.json b/data/hfopenllm_v2/jaspionjader/test-20/96a572e5-4751-46ce-9202-deb223ef4dfe.json deleted file mode 100644 index 98974db50..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-20/96a572e5-4751-46ce-9202-deb223ef4dfe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-20/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-20", - "id": "jaspionjader/test-20", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5327 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jayasuryajsk/Qwen2.5-3B-reasoner/f4320b1e-ea4f-4aea-8dab-cdb221ce53e5.json b/data/hfopenllm_v2/jayasuryajsk/Qwen2.5-3B-reasoner/f4320b1e-ea4f-4aea-8dab-cdb221ce53e5.json deleted file mode 100644 index b9587c6d6..000000000 --- a/data/hfopenllm_v2/jayasuryajsk/Qwen2.5-3B-reasoner/f4320b1e-ea4f-4aea-8dab-cdb221ce53e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jayasuryajsk_Qwen2.5-3B-reasoner/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-reasoner", - "id": "jayasuryajsk/Qwen2.5-3B-reasoner", - "developer": "jayasuryajsk", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4651 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2085 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - 
} - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeanmichela/o-distil-qwen/8376c0bf-f9c3-4529-b13c-c57106182d15.json b/data/hfopenllm_v2/jeanmichela/o-distil-qwen/8376c0bf-f9c3-4529-b13c-c57106182d15.json deleted file mode 100644 index 214d189b4..000000000 --- a/data/hfopenllm_v2/jeanmichela/o-distil-qwen/8376c0bf-f9c3-4529-b13c-c57106182d15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeanmichela_o-distil-qwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "o-distil-qwen", - "id": "jeanmichela/o-distil-qwen", - "developer": "jeanmichela", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebcarter/psyonic-cetacean-20B/97a80145-e621-4603-8ff8-2cc4bd74190a.json b/data/hfopenllm_v2/jebcarter/psyonic-cetacean-20B/97a80145-e621-4603-8ff8-2cc4bd74190a.json deleted file mode 100644 index bb13bcad2..000000000 --- a/data/hfopenllm_v2/jebcarter/psyonic-cetacean-20B/97a80145-e621-4603-8ff8-2cc4bd74190a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebcarter_psyonic-cetacean-20B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "psyonic-cetacean-20B", - "id": "jebcarter/psyonic-cetacean-20B", - "developer": "jebcarter", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 19.994 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2544 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4907 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/jebish7/Llama-3-Nanda-10B-Chat/99a7881c-cca0-43d6-96f5-ce5292ed60a0.json b/data/hfopenllm_v2/jebish7/Llama-3-Nanda-10B-Chat/99a7881c-cca0-43d6-96f5-ce5292ed60a0.json deleted file mode 100644 index f63ea2032..000000000 --- a/data/hfopenllm_v2/jebish7/Llama-3-Nanda-10B-Chat/99a7881c-cca0-43d6-96f5-ce5292ed60a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_Llama-3-Nanda-10B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Nanda-10B-Chat", - "id": "jebish7/Llama-3-Nanda-10B-Chat", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 9.985 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4959 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/Llama-3.1-8B-Instruct/60ca8f7e-1c20-4adb-bb84-892bad3c0d63.json b/data/hfopenllm_v2/jebish7/Llama-3.1-8B-Instruct/60ca8f7e-1c20-4adb-bb84-892bad3c0d63.json deleted file mode 100644 index d88b01af6..000000000 --- 
a/data/hfopenllm_v2/jebish7/Llama-3.1-8B-Instruct/60ca8f7e-1c20-4adb-bb84-892bad3c0d63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_Llama-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Instruct", - "id": "jebish7/Llama-3.1-8B-Instruct", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5058 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5088 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Base/4a0f8dc7-9446-4dda-bf49-8cca4851746c.json b/data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Base/4a0f8dc7-9446-4dda-bf49-8cca4851746c.json deleted file mode 100644 index 9ffd03cf9..000000000 --- a/data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Base/4a0f8dc7-9446-4dda-bf49-8cca4851746c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/jebish7_Nemotron-4-Mini-Hindi-4B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemotron-4-Mini-Hindi-4B-Base", - "id": "jebish7/Nemotron-4-Mini-Hindi-4B-Base", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "NemotronForCausalLM", - "params_billions": 4.191 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2285 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2503 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Instruct/6eb3a040-8234-4d31-8274-6987b0e4e3b4.json b/data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Instruct/6eb3a040-8234-4d31-8274-6987b0e4e3b4.json deleted file mode 100644 index bd15b91c4..000000000 --- a/data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Instruct/6eb3a040-8234-4d31-8274-6987b0e4e3b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_Nemotron-4-Mini-Hindi-4B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemotron-4-Mini-Hindi-4B-Instruct", - "id": "jebish7/Nemotron-4-Mini-Hindi-4B-Instruct", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "NemotronForCausalLM", - "params_billions": 4.191 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3345 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4041 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4153 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2595 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/Nemotron-Mini-4B-Instruct/16053077-38fd-4136-81a5-fea0d4cd927a.json b/data/hfopenllm_v2/jebish7/Nemotron-Mini-4B-Instruct/16053077-38fd-4136-81a5-fea0d4cd927a.json deleted file mode 100644 index b0a9fe425..000000000 --- a/data/hfopenllm_v2/jebish7/Nemotron-Mini-4B-Instruct/16053077-38fd-4136-81a5-fea0d4cd927a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_Nemotron-Mini-4B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemotron-Mini-4B-Instruct", - "id": 
"jebish7/Nemotron-Mini-4B-Instruct", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "NemotronForCausalLM", - "params_billions": 4.191 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3709 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2783 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/aya-expanse-8b/25abb99f-536e-4638-8611-a1db5dee931d.json b/data/hfopenllm_v2/jebish7/aya-expanse-8b/25abb99f-536e-4638-8611-a1db5dee931d.json deleted file mode 100644 index d66b00bb7..000000000 --- a/data/hfopenllm_v2/jebish7/aya-expanse-8b/25abb99f-536e-4638-8611-a1db5dee931d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_aya-expanse-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "aya-expanse-8b", - "id": "jebish7/aya-expanse-8b", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "CohereForCausalLM", - "params_billions": 8.028 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", 
- "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/gemma-2-2b-it/aaf0e5bd-b033-455e-bb23-b12b6f7c4520.json b/data/hfopenllm_v2/jebish7/gemma-2-2b-it/aaf0e5bd-b033-455e-bb23-b12b6f7c4520.json deleted file mode 100644 index d43d0aaca..000000000 --- a/data/hfopenllm_v2/jebish7/gemma-2-2b-it/aaf0e5bd-b033-455e-bb23-b12b6f7c4520.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_gemma-2-2b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-it", - "id": "jebish7/gemma-2-2b-it", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1272 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2715 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/gemma-2-9b-it/b3a46478-c5f4-4c74-9bf0-d1ba616ae24c.json b/data/hfopenllm_v2/jebish7/gemma-2-9b-it/b3a46478-c5f4-4c74-9bf0-d1ba616ae24c.json deleted file mode 100644 index 2343609a8..000000000 --- a/data/hfopenllm_v2/jebish7/gemma-2-9b-it/b3a46478-c5f4-4c74-9bf0-d1ba616ae24c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_gemma-2-9b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it", - "id": "jebish7/gemma-2-9b-it", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1557 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5949 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/qwen2.5-0.5B-IHA-Hin/169fb05f-5201-47b8-a06e-7d01e574c689.json b/data/hfopenllm_v2/jebish7/qwen2.5-0.5B-IHA-Hin/169fb05f-5201-47b8-a06e-7d01e574c689.json deleted file mode 100644 index 8d24c633c..000000000 --- a/data/hfopenllm_v2/jebish7/qwen2.5-0.5B-IHA-Hin/169fb05f-5201-47b8-a06e-7d01e574c689.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_qwen2.5-0.5B-IHA-Hin/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-0.5B-IHA-Hin", - "id": "jebish7/qwen2.5-0.5B-IHA-Hin", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2989 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen-7B-nerd-uncensored-v1.0/db076309-32e5-4d46-9786-ff14f8daf5d2.json b/data/hfopenllm_v2/jeffmeloy/Qwen-7B-nerd-uncensored-v1.0/db076309-32e5-4d46-9786-ff14f8daf5d2.json deleted file mode 100644 index 88151c7d8..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen-7B-nerd-uncensored-v1.0/db076309-32e5-4d46-9786-ff14f8daf5d2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen-7B-nerd-uncensored-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-7B-nerd-uncensored-v1.0", - "id": "jeffmeloy/Qwen-7B-nerd-uncensored-v1.0", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-minperplexity-2/cde914dc-7d57-425f-9787-e4b8d36d61cf.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-minperplexity-2/cde914dc-7d57-425f-9787-e4b8d36d61cf.json deleted file mode 100644 index 470df5e03..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-minperplexity-2/cde914dc-7d57-425f-9787-e4b8d36d61cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-minperplexity-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-minperplexity-2", - "id": "jeffmeloy/Qwen2.5-7B-minperplexity-2", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4625 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v0.9/5d793ce3-a7fd-4ee3-b32c-c9da63ec0566.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v0.9/5d793ce3-a7fd-4ee3-b32c-c9da63ec0566.json deleted file mode 100644 index b70182eb1..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v0.9/5d793ce3-a7fd-4ee3-b32c-c9da63ec0566.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v0.9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v0.9", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v0.9", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6048 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2946 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.0/8c645c9f-02f6-44a5-b295-d6364ed49464.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.0/8c645c9f-02f6-44a5-b295-d6364ed49464.json deleted file mode 100644 index 5f5c44741..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.0/8c645c9f-02f6-44a5-b295-d6364ed49464.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.0", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.0", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5418 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4713 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4551 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.1/97bb5519-e2d3-44d5-abf4-b5263c2b3245.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.1/97bb5519-e2d3-44d5-abf4-b5263c2b3245.json deleted file mode 100644 index 87afc83a8..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.1/97bb5519-e2d3-44d5-abf4-b5263c2b3245.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.1", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.1", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6626 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.385 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.2/bd3d78d3-3ff1-4a92-a316-e4e30787a331.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.2/bd3d78d3-3ff1-4a92-a316-e4e30787a331.json deleted file mode 100644 index edab0f5d7..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.2/bd3d78d3-3ff1-4a92-a316-e4e30787a331.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.2", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.2", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4965 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4946 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3969 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.3/d8951ed7-f4ef-49ce-891e-8d8509e9cf93.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.3/d8951ed7-f4ef-49ce-891e-8d8509e9cf93.json deleted file mode 100644 index ae6f30f79..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.3/d8951ed7-f4ef-49ce-891e-8d8509e9cf93.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.3", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.3", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4995 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.4/e1772d6c-fd26-43a7-82b3-7997d8a6809f.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.4/e1772d6c-fd26-43a7-82b3-7997d8a6809f.json deleted file mode 100644 index 130b5f1a1..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.4/e1772d6c-fd26-43a7-82b3-7997d8a6809f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.4", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.4", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6079 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5467 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4419 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.5/febaf893-6aaf-4c87-89fc-cc865ebf2859.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.5/febaf893-6aaf-4c87-89fc-cc865ebf2859.json deleted file mode 100644 index 41e373c25..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.5/febaf893-6aaf-4c87-89fc-cc865ebf2859.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.5", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.5", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2757 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4982 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4448 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.7/0ad591f4-c846-4fd1-8536-a169e0a7e4ab.json 
b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.7/0ad591f4-c846-4fd1-8536-a169e0a7e4ab.json deleted file mode 100644 index adea07aff..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.7/0ad591f4-c846-4fd1-8536-a169e0a7e4ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.7", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.7", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2915 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4848 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.8/0a318ebd-7bbb-456b-a6e4-9b480a858b5e.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.8/0a318ebd-7bbb-456b-a6e4-9b480a858b5e.json deleted file mode 100644 index b0715c0fe..000000000 --- 
a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.8/0a318ebd-7bbb-456b-a6e4-9b480a858b5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.8", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.8", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6256 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5447 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2704 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.0/e1cfdc32-3c5e-4f4b-a205-f416c96cf5e6.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.0/e1cfdc32-3c5e-4f4b-a205-f416c96cf5e6.json deleted file mode 100644 index ff3474fac..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.0/e1cfdc32-3c5e-4f4b-a205-f416c96cf5e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/jeffmeloy_Qwen2.5-7B-olm-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-olm-v1.0", - "id": "jeffmeloy/Qwen2.5-7B-olm-v1.0", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2863 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4278 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4566 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.1/85426280-8138-46d0-a111-b59b0d7c86c8.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.1/85426280-8138-46d0-a111-b59b0d7c86c8.json deleted file mode 100644 index 9cbcd9f74..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.1/85426280-8138-46d0-a111-b59b0d7c86c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-olm-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", 
- "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-olm-v1.1", - "id": "jeffmeloy/Qwen2.5-7B-olm-v1.1", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4329 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.2/32bbd26e-05e7-4a0f-a491-8f54cea9f3d3.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.2/32bbd26e-05e7-4a0f-a491-8f54cea9f3d3.json deleted file mode 100644 index ed920c47e..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.2/32bbd26e-05e7-4a0f-a491-8f54cea9f3d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-olm-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-olm-v1.2", - "id": "jeffmeloy/Qwen2.5-7B-olm-v1.2", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5533 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2847 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.3/86ed6833-ae85-4a8e-b840-b0c9540083ce.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.3/86ed6833-ae85-4a8e-b840-b0c9540083ce.json deleted file mode 100644 index c0cbb0d3b..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.3/86ed6833-ae85-4a8e-b840-b0c9540083ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-olm-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-olm-v1.3", - "id": "jeffmeloy/Qwen2.5-7B-olm-v1.3", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4219 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5532 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.447 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.4/2f751ac3-5ca5-4d0d-9ad4-48155e51468a.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.4/2f751ac3-5ca5-4d0d-9ad4-48155e51468a.json deleted file mode 100644 index 120c4b77c..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.4/2f751ac3-5ca5-4d0d-9ad4-48155e51468a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-olm-v1.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-olm-v1.4", - "id": "jeffmeloy/Qwen2.5-7B-olm-v1.4", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5582 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4457 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.5/9677e68d-afda-4917-825c-83318219ff59.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.5/9677e68d-afda-4917-825c-83318219ff59.json deleted file mode 100644 index c40b034ac..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.5/9677e68d-afda-4917-825c-83318219ff59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-olm-v1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-olm-v1.5", - "id": "jeffmeloy/Qwen2.5-7B-olm-v1.5", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4547 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/jeffmeloy_Qwen2.5-7B-minperplexity-1/23cd57c2-bf7f-440a-ab3e-edfdede5e8cd.json b/data/hfopenllm_v2/jeffmeloy/jeffmeloy_Qwen2.5-7B-minperplexity-1/23cd57c2-bf7f-440a-ab3e-edfdede5e8cd.json deleted file mode 100644 index a05b1551c..000000000 --- a/data/hfopenllm_v2/jeffmeloy/jeffmeloy_Qwen2.5-7B-minperplexity-1/23cd57c2-bf7f-440a-ab3e-edfdede5e8cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_jeffmeloy_Qwen2.5-7B-minperplexity-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jeffmeloy_Qwen2.5-7B-minperplexity-1", - "id": "jeffmeloy/jeffmeloy_Qwen2.5-7B-minperplexity-1", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5582 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2915 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeonsworld/CarbonVillain-en-10.7B-v4/bec23315-f98a-4211-81a0-c49f395e66c9.json b/data/hfopenllm_v2/jeonsworld/CarbonVillain-en-10.7B-v4/bec23315-f98a-4211-81a0-c49f395e66c9.json deleted file mode 100644 index 186c8ce3d..000000000 --- a/data/hfopenllm_v2/jeonsworld/CarbonVillain-en-10.7B-v4/bec23315-f98a-4211-81a0-c49f395e66c9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeonsworld_CarbonVillain-en-10.7B-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CarbonVillain-en-10.7B-v4", - "id": "jeonsworld/CarbonVillain-en-10.7B-v4", - "developer": "jeonsworld", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4579 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5168 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jiangxinyang-shanda/Homer-LLama3-8B/1ac5faef-7fa0-4b58-a6ba-0c444a2023a8.json b/data/hfopenllm_v2/jiangxinyang-shanda/Homer-LLama3-8B/1ac5faef-7fa0-4b58-a6ba-0c444a2023a8.json deleted file mode 100644 index 122a2bdfb..000000000 --- a/data/hfopenllm_v2/jiangxinyang-shanda/Homer-LLama3-8B/1ac5faef-7fa0-4b58-a6ba-0c444a2023a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jiangxinyang-shanda_Homer-LLama3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-LLama3-8B", - "id": "jiangxinyang-shanda/Homer-LLama3-8B", - "developer": "jiangxinyang-shanda", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3992 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5173 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4056 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jieliu/Storm-7B/39327803-11e7-4b28-8750-81feb027e8f3.json b/data/hfopenllm_v2/jieliu/Storm-7B/39327803-11e7-4b28-8750-81feb027e8f3.json deleted file mode 100644 index eaad1d2e4..000000000 --- a/data/hfopenllm_v2/jieliu/Storm-7B/39327803-11e7-4b28-8750-81feb027e8f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jieliu_Storm-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Storm-7B", - "id": "jieliu/Storm-7B", - "developer": "jieliu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5187 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 
- } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3119 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jiviai/medX_v2/ce2b6874-0fc8-4364-a526-7b25b101e1e3.json b/data/hfopenllm_v2/jiviai/medX_v2/ce2b6874-0fc8-4364-a526-7b25b101e1e3.json deleted file mode 100644 index 57f4909ab..000000000 --- a/data/hfopenllm_v2/jiviai/medX_v2/ce2b6874-0fc8-4364-a526-7b25b101e1e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jiviai_medX_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "medX_v2", - "id": "jiviai/medX_v2", - "developer": "jiviai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3743 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3428 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jlzhou/Qwen2.5-3B-Infinity-Instruct-0625/9f9ebc90-31f9-45c1-b9c2-07b727b12f3d.json b/data/hfopenllm_v2/jlzhou/Qwen2.5-3B-Infinity-Instruct-0625/9f9ebc90-31f9-45c1-b9c2-07b727b12f3d.json deleted file mode 100644 index bacdbbca7..000000000 --- a/data/hfopenllm_v2/jlzhou/Qwen2.5-3B-Infinity-Instruct-0625/9f9ebc90-31f9-45c1-b9c2-07b727b12f3d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jlzhou_Qwen2.5-3B-Infinity-Instruct-0625/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-Infinity-Instruct-0625", - "id": "jlzhou/Qwen2.5-3B-Infinity-Instruct-0625", - "developer": "jlzhou", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3558 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4774 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01/d189a2fc-71f5-4bc9-a0b1-7e744a19921f.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01/d189a2fc-71f5-4bc9-a0b1-7e744a19921f.json deleted file mode 100644 index c7139a751..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01/d189a2fc-71f5-4bc9-a0b1-7e744a19921f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4271 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5036 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4638 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1/1eb697fe-9dd4-4a41-aa47-33456df39e2d.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1/1eb697fe-9dd4-4a41-aa47-33456df39e2d.json deleted file mode 100644 index a10daf5d6..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1/1eb697fe-9dd4-4a41-aa47-33456df39e2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5019 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01/5f10df7b-cd2c-44ca-b13a-2852483c71f8.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01/5f10df7b-cd2c-44ca-b13a-2852483c71f8.json deleted file mode 100644 index c3e92c793..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01/5f10df7b-cd2c-44ca-b13a-2852483c71f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3377 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4917 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5018 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3533 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1/3abbb4b6-8050-44fd-b066-0f061ce2f4d7.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1/3abbb4b6-8050-44fd-b066-0f061ce2f4d7.json deleted file mode 100644 index 8e8a04411..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1/3abbb4b6-8050-44fd-b066-0f061ce2f4d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4274 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5126 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01/5f47e65d-293f-469e-a18f-5627ca1adf44.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01/5f47e65d-293f-469e-a18f-5627ca1adf44.json deleted file mode 100644 index f7941ba70..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01/5f47e65d-293f-469e-a18f-5627ca1adf44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3204 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4884 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3344 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1/b753c1aa-8a0c-4600-99ec-8eb51ab50da7.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1/b753c1aa-8a0c-4600-99ec-8eb51ab50da7.json deleted file mode 100644 index 9fd6df7ed..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1/b753c1aa-8a0c-4600-99ec-8eb51ab50da7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3696 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01/15c21655-9af8-4bee-9884-b047683e9adf.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01/15c21655-9af8-4bee-9884-b047683e9adf.json deleted file mode 100644 index 2d9d72344..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01/15c21655-9af8-4bee-9884-b047683e9adf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4854 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5163 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3295 - 
} - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1/f642de95-218a-4db0-807f-1bb97618b4f6.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1/f642de95-218a-4db0-807f-1bb97618b4f6.json deleted file mode 100644 index 0a6e5d576..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1/f642de95-218a-4db0-807f-1bb97618b4f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5157 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - } - ] -} \ No newline at end of file 
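Every record deleted in this patch shares the schema-0.2.0 shape visible in the hunks above: top-level source_metadata and model_info blocks, plus an evaluation_results array in which each entry pairs a metric_config with a score_details.score. As a minimal sketch of how one of these files could be read back from a checkout predating this commit (Python, standard library only; load_scores is a hypothetical helper, not repository tooling, and the unweighted mean is purely illustrative — the leaderboard's own aggregate may be computed differently):

    import json
    from pathlib import Path
    from statistics import mean

    def load_scores(path: Path) -> dict[str, float]:
        """Map evaluation_name -> score for one schema-0.2.0 record."""
        record = json.loads(path.read_text())
        assert record["schema_version"] == "0.2.0"
        return {
            result["evaluation_name"]: result["score_details"]["score"]
            for result in record["evaluation_results"]
        }

    # Using the f642de95 record removed just above as an example:
    path = Path(
        "data/hfopenllm_v2/johnsutor/"
        "Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1/"
        "f642de95-218a-4db0-807f-1bb97618b4f6.json"
    )
    scores = load_scores(path)
    print(scores["MMLU-PRO"])               # 0.3663 in this record
    print(round(mean(scores.values()), 4))  # 0.3527, plain mean of the six benchmarks

Keying the result by evaluation_name makes the lookup independent of array order. For that record the six scores are IFEval 0.4302, BBH 0.5157, MATH Level 5 0.0627, GPQA 0.3079, MUSR 0.4332, and MMLU-PRO 0.3663, so the unweighted mean comes out to 0.3527.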
diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01/01443b06-9ad3-41f5-ae0d-bc84086e0a0d.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01/01443b06-9ad3-41f5-ae0d-bc84086e0a0d.json deleted file mode 100644 index 960cb299c..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01/01443b06-9ad3-41f5-ae0d-bc84086e0a0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.279 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4861 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1/1ee8c377-2236-4225-942f-ef8ce5770741.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1/1ee8c377-2236-4225-942f-ef8ce5770741.json deleted file mode 100644 index 25f0a4caa..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1/1ee8c377-2236-4225-942f-ef8ce5770741.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4223 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4384 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01/4ee9aa78-d9eb-4a1c-91c4-f29f093b95d3.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01/4ee9aa78-d9eb-4a1c-91c4-f29f093b95d3.json deleted file mode 100644 index 9cbebea51..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01/4ee9aa78-d9eb-4a1c-91c4-f29f093b95d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4359 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5041 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1/419c6631-805f-43ba-9db8-5296f8d221ec.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1/419c6631-805f-43ba-9db8-5296f8d221ec.json deleted file mode 100644 index bd49fe5af..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1/419c6631-805f-43ba-9db8-5296f8d221ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5011 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01/3fc1822f-4a43-4a3b-90d7-fc163491c90a.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01/3fc1822f-4a43-4a3b-90d7-fc163491c90a.json deleted file mode 100644 index e230b5938..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01/3fc1822f-4a43-4a3b-90d7-fc163491c90a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4999 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3611 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1/76b4037b-c5d0-435f-966a-bd88b1665dad.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1/76b4037b-c5d0-435f-966a-bd88b1665dad.json deleted file mode 100644 index 449e51cc0..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1/76b4037b-c5d0-435f-966a-bd88b1665dad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4204 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.371 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01/757b85e7-84c8-429f-aeb4-870852fa8959.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01/757b85e7-84c8-429f-aeb4-870852fa8959.json deleted file mode 100644 index 3c0a95ced..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01/757b85e7-84c8-429f-aeb4-870852fa8959.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3531 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1/acab4982-1205-4362-803e-306b1e2371bf.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1/acab4982-1205-4362-803e-306b1e2371bf.json deleted file mode 100644 index 07aaad644..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1/acab4982-1205-4362-803e-306b1e2371bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5137 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4357 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01/0e549b5d-c1d9-443d-9a80-8dd34dadd22e.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01/0e549b5d-c1d9-443d-9a80-8dd34dadd22e.json deleted file mode 100644 index a9858ea11..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01/0e549b5d-c1d9-443d-9a80-8dd34dadd22e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2904 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4967 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1/d3d4eccc-8792-40e5-91cf-22885f4cbaf5.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1/d3d4eccc-8792-40e5-91cf-22885f4cbaf5.json deleted file mode 100644 index 0de836a5f..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1/d3d4eccc-8792-40e5-91cf-22885f4cbaf5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5147 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01/708aded5-6252-44e3-bf0d-08bf3e7f32e0.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01/708aded5-6252-44e3-bf0d-08bf3e7f32e0.json deleted file mode 100644 index 987fd3b28..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01/708aded5-6252-44e3-bf0d-08bf3e7f32e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2913 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4918 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3454 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1/ce6d31f2-f38e-4af3-85a3-d2f6c80f71f1.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1/ce6d31f2-f38e-4af3-85a3-d2f6c80f71f1.json deleted file mode 100644 index 50c1000ff..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1/ce6d31f2-f38e-4af3-85a3-d2f6c80f71f1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5139 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_linear/5efcc291-ca9a-4ca9-b2ed-dab37dce5f5a.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_linear/5efcc291-ca9a-4ca9-b2ed-dab37dce5f5a.json deleted file mode 100644 index 4cb808ae2..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_linear/5efcc291-ca9a-4ca9-b2ed-dab37dce5f5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_dare_linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_dare_linear", - "id": "johnsutor/Llama-3-8B-Instruct_dare_linear", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4283 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2414 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.1/47320824-8064-40d4-a08c-810faafbba77.json 
b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.1/47320824-8064-40d4-a08c-810faafbba77.json deleted file mode 100644 index 2aa16cb5b..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.1/47320824-8064-40d4-a08c-810faafbba77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_dare_ties-density-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1891 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2265 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.3/8baeef58-0ba6-4723-8f23-7a4c386f2cad.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.3/8baeef58-0ba6-4723-8f23-7a4c386f2cad.json 
deleted file mode 100644 index 0f28a208f..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.3/8baeef58-0ba6-4723-8f23-7a4c386f2cad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_dare_ties-density-0.3", - "id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.3", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5069 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.304 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.7/0387ca63-1e31-4eaa-ac7c-35d417548c54.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.7/0387ca63-1e31-4eaa-ac7c-35d417548c54.json deleted file mode 100644 index d8989fbe5..000000000 --- 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.7/0387ca63-1e31-4eaa-ac7c-35d417548c54.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_dare_ties-density-0.7", - "id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.7", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2034 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4723 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3148 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.9/733983fe-4b9c-47e6-963d-c57829b6f1af.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.9/733983fe-4b9c-47e6-963d-c57829b6f1af.json deleted file mode 100644 index ad8724889..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.9/733983fe-4b9c-47e6-963d-c57829b6f1af.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_dare_ties-density-0.9", - "id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.9", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2161 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4664 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_linear/80c4859d-8016-4650-939f-100ba2e6d808.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_linear/80c4859d-8016-4650-939f-100ba2e6d808.json deleted file mode 100644 index 011100dc4..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_linear/80c4859d-8016-4650-939f-100ba2e6d808.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_linear/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_linear", - "id": "johnsutor/Llama-3-8B-Instruct_linear", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4308 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5031 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1005 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4097 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3712 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.1/21724d3a-cc6c-43eb-9d69-46d8d91c97f8.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.1/21724d3a-cc6c-43eb-9d69-46d8d91c97f8.json deleted file mode 100644 index 9778173ee..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.1/21724d3a-cc6c-43eb-9d69-46d8d91c97f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_ties-density-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_ties-density-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4116 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.3/d781945e-e9df-4136-90cd-632f0bed6246.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.3/d781945e-e9df-4136-90cd-632f0bed6246.json deleted file mode 100644 index 386af3f39..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.3/d781945e-e9df-4136-90cd-632f0bed6246.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_ties-density-0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_ties-density-0.3", - "id": 
"johnsutor/Llama-3-8B-Instruct_ties-density-0.3", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3626 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4906 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4025 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3321 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.5/8f146bb5-dd4d-49ce-ac60-76f66321feb8.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.5/8f146bb5-dd4d-49ce-ac60-76f66321feb8.json deleted file mode 100644 index dfb672445..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.5/8f146bb5-dd4d-49ce-ac60-76f66321feb8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_ties-density-0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_ties-density-0.5", - "id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.5", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3797 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4793 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3175 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.7/89bfba6d-c622-445e-b0b9-512aadcea7cf.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.7/89bfba6d-c622-445e-b0b9-512aadcea7cf.json deleted file mode 100644 index 5246939f3..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.7/89bfba6d-c622-445e-b0b9-512aadcea7cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_ties-density-0.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_ties-density-0.7", - "id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.7", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3681 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4738 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.9/9c27f2e6-ebbe-4fac-bc51-74455d3a6512.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.9/9c27f2e6-ebbe-4fac-bc51-74455d3a6512.json deleted file mode 100644 index 248669cbb..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.9/9c27f2e6-ebbe-4fac-bc51-74455d3a6512.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_ties-density-0.9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_ties-density-0.9", - "id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.9", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3858 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4735 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-4k-DPO/455ef1e0-bdf2-49bf-a53d-2c9e3d00d5f3.json b/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-4k-DPO/455ef1e0-bdf2-49bf-a53d-2c9e3d00d5f3.json deleted file mode 100644 index 300e25443..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-4k-DPO/455ef1e0-bdf2-49bf-a53d-2c9e3d00d5f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-14B-Instruct-4k-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-14B-Instruct-4k-DPO", - "id": "jpacifico/Chocolatine-14B-Instruct-4k-DPO", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4689 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4439 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.2/e04a76a6-ac22-43b2-bbf9-196a08de2949.json b/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.2/e04a76a6-ac22-43b2-bbf9-196a08de2949.json deleted file mode 100644 index ec8836b29..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.2/e04a76a6-ac22-43b2-bbf9-196a08de2949.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-14B-Instruct-DPO-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-14B-Instruct-DPO-v1.2", - "id": "jpacifico/Chocolatine-14B-Instruct-DPO-v1.2", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6852 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6438 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2092 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.3/2fcb74f0-add1-4d46-8a0f-8578a616dbed.json b/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.3/2fcb74f0-add1-4d46-8a0f-8578a616dbed.json deleted file mode 100644 index e3fa490e4..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.3/2fcb74f0-add1-4d46-8a0f-8578a616dbed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-14B-Instruct-DPO-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-14B-Instruct-DPO-v1.3", - "id": "jpacifico/Chocolatine-14B-Instruct-DPO-v1.3", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6846 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4234 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-DPO-v2.0b1/51530638-ef76-43ce-9396-8a0d07988712.json b/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-DPO-v2.0b1/51530638-ef76-43ce-9396-8a0d07988712.json deleted file mode 100644 index a7ff62b04..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-DPO-v2.0b1/51530638-ef76-43ce-9396-8a0d07988712.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-2-14B-Instruct-DPO-v2.0b1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-2-14B-Instruct-DPO-v2.0b1", - "id": "jpacifico/Chocolatine-2-14B-Instruct-DPO-v2.0b1", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6696 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2757 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4467 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.1/74d99e4d-0e6f-4804-aa52-0dc76d37fac3.json b/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.1/74d99e4d-0e6f-4804-aa52-0dc76d37fac3.json deleted file mode 100644 index 1b5b639c2..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.1/74d99e4d-0e6f-4804-aa52-0dc76d37fac3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-2-14B-Instruct-v2.0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-2-14B-Instruct-v2.0.1", - "id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0.1", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0742 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6736 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4796 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.3/80e8b9f0-b507-4927-9d24-1c793e3783cc.json b/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.3/80e8b9f0-b507-4927-9d24-1c793e3783cc.json deleted file mode 100644 index a34d92883..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.3/80e8b9f0-b507-4927-9d24-1c793e3783cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-2-14B-Instruct-v2.0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-2-14B-Instruct-v2.0.3", - "id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0.3", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7037 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0/7b037520-a5e9-4b58-80f3-f0ecc5957c67.json b/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0/7b037520-a5e9-4b58-80f3-f0ecc5957c67.json deleted file mode 100644 index 157f2eb32..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0/7b037520-a5e9-4b58-80f3-f0ecc5957c67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-2-14B-Instruct-v2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-2-14B-Instruct-v2.0", - "id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0885 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4804 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b2/10b88d05-62d2-4603-9d04-b0854e39ed40.json b/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b2/10b88d05-62d2-4603-9d04-b0854e39ed40.json deleted file mode 100644 index eb93d587d..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b2/10b88d05-62d2-4603-9d04-b0854e39ed40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-2-14B-Instruct-v2.0b2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-2-14B-Instruct-v2.0b2", - "id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0b2", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7241 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6476 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b3/4b693f41-d811-4b64-892c-d840eee5ace4.json b/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b3/4b693f41-d811-4b64-892c-d840eee5ace4.json deleted file mode 100644 index d9d2d02b9..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b3/4b693f41-d811-4b64-892c-d840eee5ace4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-2-14B-Instruct-v2.0b3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-2-14B-Instruct-v2.0b3", - "id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0b3", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7323 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6469 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4109 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5337 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-Revised/90d86c8c-3aa6-42ba-a94f-75c961e65c41.json b/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-Revised/90d86c8c-3aa6-42ba-a94f-75c961e65c41.json deleted file mode 100644 index 5f96bd801..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-Revised/90d86c8c-3aa6-42ba-a94f-75c961e65c41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-3B-Instruct-DPO-Revised/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-3B-Instruct-DPO-Revised", - "id": "jpacifico/Chocolatine-3B-Instruct-DPO-Revised", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5623 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4453 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3989 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.0/8318ae52-6ae3-45ce-82db-73f8cb5ad7c7.json b/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.0/8318ae52-6ae3-45ce-82db-73f8cb5ad7c7.json deleted file mode 100644 index 8e9903c3a..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.0/8318ae52-6ae3-45ce-82db-73f8cb5ad7c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-3B-Instruct-DPO-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-3B-Instruct-DPO-v1.0", - "id": "jpacifico/Chocolatine-3B-Instruct-DPO-v1.0", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5471 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4755 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.2/b20a1d13-2f14-42e4-bdde-49f053cef325.json b/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.2/b20a1d13-2f14-42e4-bdde-49f053cef325.json deleted file mode 100644 index ae3e1be3f..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.2/b20a1d13-2f14-42e4-bdde-49f053cef325.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-3B-Instruct-DPO-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-3B-Instruct-DPO-v1.2", - "id": "jpacifico/Chocolatine-3B-Instruct-DPO-v1.2", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5455 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5487 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2047 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3877 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Distilucie-7B-Math-Instruct-DPO-v0.1/51521dfb-d4b5-45df-ac2a-54190aed0b9f.json b/data/hfopenllm_v2/jpacifico/Distilucie-7B-Math-Instruct-DPO-v0.1/51521dfb-d4b5-45df-ac2a-54190aed0b9f.json deleted file mode 100644 index e3a74e372..000000000 --- a/data/hfopenllm_v2/jpacifico/Distilucie-7B-Math-Instruct-DPO-v0.1/51521dfb-d4b5-45df-ac2a-54190aed0b9f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Distilucie-7B-Math-Instruct-DPO-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Distilucie-7B-Math-Instruct-DPO-v0.1", - "id": "jpacifico/Distilucie-7B-Math-Instruct-DPO-v0.1", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3048 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3835 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.1809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1.3/997a1ceb-185a-4e6c-8383-eb5a6f976771.json b/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1.3/997a1ceb-185a-4e6c-8383-eb5a6f976771.json deleted file mode 100644 index 524704add..000000000 --- a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1.3/997a1ceb-185a-4e6c-8383-eb5a6f976771.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Lucie-7B-Instruct-DPO-v1.1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B-Instruct-DPO-v1.1.3", - "id": "jpacifico/Lucie-7B-Instruct-DPO-v1.1.3", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1/22101998-c3d3-414f-9ed1-99330cdbe3b2.json 
b/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1/22101998-c3d3-414f-9ed1-99330cdbe3b2.json deleted file mode 100644 index da8506358..000000000 --- a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1/22101998-c3d3-414f-9ed1-99330cdbe3b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Lucie-7B-Instruct-DPO-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B-Instruct-DPO-v1.1", - "id": "jpacifico/Lucie-7B-Instruct-DPO-v1.1", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1838 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.0/a2408953-a7eb-449c-b80c-3620915d44d0.json b/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.0/a2408953-a7eb-449c-b80c-3620915d44d0.json deleted file mode 100644 index 95eaddbfc..000000000 --- 
a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.0/a2408953-a7eb-449c-b80c-3620915d44d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Lucie-7B-Instruct-Merged-Model_Stock-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B-Instruct-Merged-Model_Stock-v1.0", - "id": "jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.0", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3234 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3802 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1871 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.1/d65e5b08-7d3c-4c0d-85fa-496db65a235c.json b/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.1/d65e5b08-7d3c-4c0d-85fa-496db65a235c.json deleted file mode 100644 index 6a31392cf..000000000 --- a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.1/d65e5b08-7d3c-4c0d-85fa-496db65a235c.json 
+++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Lucie-7B-Instruct-Merged-Model_Stock-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B-Instruct-Merged-Model_Stock-v1.1", - "id": "jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.1", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3808 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1862 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Lucie-Boosted-7B-Instruct/ce2c9614-46d2-481d-ac25-3cc71a93bd5e.json b/data/hfopenllm_v2/jpacifico/Lucie-Boosted-7B-Instruct/ce2c9614-46d2-481d-ac25-3cc71a93bd5e.json deleted file mode 100644 index b3296bdfe..000000000 --- a/data/hfopenllm_v2/jpacifico/Lucie-Boosted-7B-Instruct/ce2c9614-46d2-481d-ac25-3cc71a93bd5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Lucie-Boosted-7B-Instruct/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-Boosted-7B-Instruct", - "id": "jpacifico/Lucie-Boosted-7B-Instruct", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2566 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.163 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jsfs11/L3-8B-Stheno-slerp/e9ba998d-8147-4046-afae-9ee7d544e98d.json b/data/hfopenllm_v2/jsfs11/L3-8B-Stheno-slerp/e9ba998d-8147-4046-afae-9ee7d544e98d.json deleted file mode 100644 index 09c46c3d4..000000000 --- a/data/hfopenllm_v2/jsfs11/L3-8B-Stheno-slerp/e9ba998d-8147-4046-afae-9ee7d544e98d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jsfs11_L3-8B-Stheno-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "L3-8B-Stheno-slerp", - "id": "jsfs11/L3-8B-Stheno-slerp", - "developer": "jsfs11", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6752 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5326 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v4/c44f1012-1123-42c8-b110-5735dc756fd5.json b/data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v4/c44f1012-1123-42c8-b110-5735dc756fd5.json deleted file mode 100644 index b881157bf..000000000 --- a/data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v4/c44f1012-1123-42c8-b110-5735dc756fd5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jsfs11_MixtureofMerges-MoE-4x7b-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MixtureofMerges-MoE-4x7b-v4", - "id": "jsfs11/MixtureofMerges-MoE-4x7b-v4", - "developer": "jsfs11", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", 
- "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5169 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3032 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v5/5088f6a6-2acf-4d10-8b78-0d5bd4126ab5.json b/data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v5/5088f6a6-2acf-4d10-8b78-0d5bd4126ab5.json deleted file mode 100644 index fe751bdd0..000000000 --- a/data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v5/5088f6a6-2acf-4d10-8b78-0d5bd4126ab5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jsfs11_MixtureofMerges-MoE-4x7b-v5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MixtureofMerges-MoE-4x7b-v5", - "id": "jsfs11/MixtureofMerges-MoE-4x7b-v5", - "developer": "jsfs11", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4305 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kaist-ai/janus-7b/b4d96088-5cc0-4ebc-8b8b-8c7e9f90420b.json b/data/hfopenllm_v2/kaist-ai/janus-7b/b4d96088-5cc0-4ebc-8b8b-8c7e9f90420b.json deleted file mode 100644 index 7009871f9..000000000 --- a/data/hfopenllm_v2/kaist-ai/janus-7b/b4d96088-5cc0-4ebc-8b8b-8c7e9f90420b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kaist-ai_janus-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "janus-7b", - "id": "kaist-ai/janus-7b", - "developer": "kaist-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4694 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2874 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kaist-ai/janus-dpo-7b/529dba11-53af-4045-ae46-04e1b9838d4a.json b/data/hfopenllm_v2/kaist-ai/janus-dpo-7b/529dba11-53af-4045-ae46-04e1b9838d4a.json deleted file mode 100644 index c70f9f282..000000000 --- a/data/hfopenllm_v2/kaist-ai/janus-dpo-7b/529dba11-53af-4045-ae46-04e1b9838d4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kaist-ai_janus-dpo-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "janus-dpo-7b", - "id": "kaist-ai/janus-dpo-7b", - "developer": "kaist-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4773 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2976 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kaist-ai/janus-rm-7b/391f6d6c-418f-44be-910a-fb90b5712649.json b/data/hfopenllm_v2/kaist-ai/janus-rm-7b/391f6d6c-418f-44be-910a-fb90b5712649.json deleted file mode 100644 index 979e94c2a..000000000 --- a/data/hfopenllm_v2/kaist-ai/janus-rm-7b/391f6d6c-418f-44be-910a-fb90b5712649.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kaist-ai_janus-rm-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "janus-rm-7b", - "id": "kaist-ai/janus-rm-7b", - "developer": "kaist-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LLMForSequenceRegression", - "params_billions": 7.111 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3056 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kaist-ai/mistral-orpo-capybara-7k/2ccccb4b-7260-4a1a-9426-117e359c7c5c.json b/data/hfopenllm_v2/kaist-ai/mistral-orpo-capybara-7k/2ccccb4b-7260-4a1a-9426-117e359c7c5c.json deleted file mode 100644 index 41fa69358..000000000 --- a/data/hfopenllm_v2/kaist-ai/mistral-orpo-capybara-7k/2ccccb4b-7260-4a1a-9426-117e359c7c5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kaist-ai_mistral-orpo-capybara-7k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-orpo-capybara-7k", - "id": "kaist-ai/mistral-orpo-capybara-7k", - "developer": "kaist-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2971 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kavonalds/BunderMaxx-0710/84afecec-453d-491c-9f5a-de31d8fba43e.json b/data/hfopenllm_v2/kavonalds/BunderMaxx-0710/84afecec-453d-491c-9f5a-de31d8fba43e.json deleted file mode 100644 index 7d5fbcff2..000000000 --- a/data/hfopenllm_v2/kavonalds/BunderMaxx-0710/84afecec-453d-491c-9f5a-de31d8fba43e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kavonalds_BunderMaxx-0710/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BunderMaxx-0710", - "id": "kavonalds/BunderMaxx-0710", - "developer": "kavonalds", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3283 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6651 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": 
"MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kavonalds/BunderMaxx-0710/dba3a3a4-cd23-44c9-823f-0bd88cf6465b.json b/data/hfopenllm_v2/kavonalds/BunderMaxx-0710/dba3a3a4-cd23-44c9-823f-0bd88cf6465b.json deleted file mode 100644 index a38c701e1..000000000 --- a/data/hfopenllm_v2/kavonalds/BunderMaxx-0710/dba3a3a4-cd23-44c9-823f-0bd88cf6465b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kavonalds_BunderMaxx-0710/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BunderMaxx-0710", - "id": "kavonalds/BunderMaxx-0710", - "developer": "kavonalds", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5566 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1449 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kavonalds/BunderMaxx-1010/1179bcce-558e-40ad-8537-c74c59557975.json b/data/hfopenllm_v2/kavonalds/BunderMaxx-1010/1179bcce-558e-40ad-8537-c74c59557975.json deleted file mode 100644 index a63fbd6f4..000000000 --- a/data/hfopenllm_v2/kavonalds/BunderMaxx-1010/1179bcce-558e-40ad-8537-c74c59557975.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kavonalds_BunderMaxx-1010/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BunderMaxx-1010", - "id": "kavonalds/BunderMaxx-1010", - "developer": "kavonalds", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2981 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3484 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kavonalds/Lancer-1-1b-Instruct/fe0a5c17-6c8d-4f06-a58e-47648ef9ecec.json b/data/hfopenllm_v2/kavonalds/Lancer-1-1b-Instruct/fe0a5c17-6c8d-4f06-a58e-47648ef9ecec.json deleted file mode 100644 index 13c6651be..000000000 --- a/data/hfopenllm_v2/kavonalds/Lancer-1-1b-Instruct/fe0a5c17-6c8d-4f06-a58e-47648ef9ecec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kavonalds_Lancer-1-1b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lancer-1-1b-Instruct", - "id": "kavonalds/Lancer-1-1b-Instruct", - "developer": "kavonalds", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5546 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3253 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1568 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/kayfour/T3Q-Qwen2.5-7B-it-KOR-Safe/81cf8cbd-33bc-44ab-930a-65242e1ae7b2.json b/data/hfopenllm_v2/kayfour/T3Q-Qwen2.5-7B-it-KOR-Safe/81cf8cbd-33bc-44ab-930a-65242e1ae7b2.json deleted file mode 100644 index 773475a94..000000000 --- a/data/hfopenllm_v2/kayfour/T3Q-Qwen2.5-7B-it-KOR-Safe/81cf8cbd-33bc-44ab-930a-65242e1ae7b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kayfour_T3Q-Qwen2.5-7B-it-KOR-Safe/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T3Q-Qwen2.5-7B-it-KOR-Safe", - "id": "kayfour/T3Q-Qwen2.5-7B-it-KOR-Safe", - "developer": "kayfour", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6081 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4464 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/keeeeenw/MicroLlama/173bb053-e817-4551-b169-c3f71163650a.json b/data/hfopenllm_v2/keeeeenw/MicroLlama/173bb053-e817-4551-b169-c3f71163650a.json deleted file mode 100644 index 
50d40f2d1..000000000 --- a/data/hfopenllm_v2/keeeeenw/MicroLlama/173bb053-e817-4551-b169-c3f71163650a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/keeeeenw_MicroLlama/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MicroLlama", - "id": "keeeeenw/MicroLlama", - "developer": "keeeeenw", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.305 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1985 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1138 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kekmodel/StopCarbon-10.7B-v5/b7e6a86f-340c-48ed-a828-2e80a13aa515.json b/data/hfopenllm_v2/kekmodel/StopCarbon-10.7B-v5/b7e6a86f-340c-48ed-a828-2e80a13aa515.json deleted file mode 100644 index 087787728..000000000 --- a/data/hfopenllm_v2/kekmodel/StopCarbon-10.7B-v5/b7e6a86f-340c-48ed-a828-2e80a13aa515.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kekmodel_StopCarbon-10.7B-v5/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StopCarbon-10.7B-v5", - "id": "kekmodel/StopCarbon-10.7B-v5", - "developer": "kekmodel", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kevin009/llamaRAGdrama/bd221eee-7aa8-4d6f-a6be-89ee5568e729.json b/data/hfopenllm_v2/kevin009/llamaRAGdrama/bd221eee-7aa8-4d6f-a6be-89ee5568e729.json deleted file mode 100644 index 9b012321f..000000000 --- a/data/hfopenllm_v2/kevin009/llamaRAGdrama/bd221eee-7aa8-4d6f-a6be-89ee5568e729.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kevin009_llamaRAGdrama/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llamaRAGdrama", - "id": 
"kevin009/llamaRAGdrama", - "developer": "kevin009", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2598 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2724 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/cheap-moe-merge/8727a325-a515-4456-ba34-65c30f84644a.json b/data/hfopenllm_v2/khoantap/cheap-moe-merge/8727a325-a515-4456-ba34-65c30f84644a.json deleted file mode 100644 index 20cf2da74..000000000 --- a/data/hfopenllm_v2/khoantap/cheap-moe-merge/8727a325-a515-4456-ba34-65c30f84644a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_cheap-moe-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cheap-moe-merge", - "id": "khoantap/cheap-moe-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 19.305 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4557 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5131 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/llama-3-8b-stock-merge/3e4011fa-d480-4c16-9371-2025bc834358.json b/data/hfopenllm_v2/khoantap/llama-3-8b-stock-merge/3e4011fa-d480-4c16-9371-2025bc834358.json deleted file mode 100644 index df818dab9..000000000 --- a/data/hfopenllm_v2/khoantap/llama-3-8b-stock-merge/3e4011fa-d480-4c16-9371-2025bc834358.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_llama-3-8b-stock-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-stock-merge", - "id": "khoantap/llama-3-8b-stock-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5162 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/llama-breadcrumbs-ties-merge/867499a7-589b-4564-b04d-a004b7c0abb4.json b/data/hfopenllm_v2/khoantap/llama-breadcrumbs-ties-merge/867499a7-589b-4564-b04d-a004b7c0abb4.json deleted file mode 100644 index ce7273c9a..000000000 --- a/data/hfopenllm_v2/khoantap/llama-breadcrumbs-ties-merge/867499a7-589b-4564-b04d-a004b7c0abb4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_llama-breadcrumbs-ties-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-breadcrumbs-ties-merge", - "id": "khoantap/llama-breadcrumbs-ties-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/llama-evolve-ties-best-merge/52f1fb51-fc7e-4cc2-918a-7c7226ae2ce5.json b/data/hfopenllm_v2/khoantap/llama-evolve-ties-best-merge/52f1fb51-fc7e-4cc2-918a-7c7226ae2ce5.json deleted file mode 100644 index 8af7559fa..000000000 --- a/data/hfopenllm_v2/khoantap/llama-evolve-ties-best-merge/52f1fb51-fc7e-4cc2-918a-7c7226ae2ce5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_llama-evolve-ties-best-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-evolve-ties-best-merge", - "id": "khoantap/llama-evolve-ties-best-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6744 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5414 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/llama-linear-0.5-0.5-1-merge/5f4a8fb6-b22d-4eb2-aaef-da05ca45fbeb.json b/data/hfopenllm_v2/khoantap/llama-linear-0.5-0.5-1-merge/5f4a8fb6-b22d-4eb2-aaef-da05ca45fbeb.json deleted file mode 100644 index 75bc7bcd6..000000000 --- a/data/hfopenllm_v2/khoantap/llama-linear-0.5-0.5-1-merge/5f4a8fb6-b22d-4eb2-aaef-da05ca45fbeb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_llama-linear-0.5-0.5-1-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-linear-0.5-0.5-1-merge", - "id": "khoantap/llama-linear-0.5-0.5-1-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3833 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/llama-linear-0.5-1-0.5-merge/3278855d-7bd1-4e7e-b27b-b1393006e7e7.json b/data/hfopenllm_v2/khoantap/llama-linear-0.5-1-0.5-merge/3278855d-7bd1-4e7e-b27b-b1393006e7e7.json deleted file mode 100644 index 28ab86f9c..000000000 --- a/data/hfopenllm_v2/khoantap/llama-linear-0.5-1-0.5-merge/3278855d-7bd1-4e7e-b27b-b1393006e7e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_llama-linear-0.5-1-0.5-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-linear-0.5-1-0.5-merge", - "id": "khoantap/llama-linear-0.5-1-0.5-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5032 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5951 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/llama-linear-1-0.5-0.5-merge/5193ab4d-1627-43b5-bfb7-89e08ea1f810.json b/data/hfopenllm_v2/khoantap/llama-linear-1-0.5-0.5-merge/5193ab4d-1627-43b5-bfb7-89e08ea1f810.json deleted file mode 100644 index e1f8e572b..000000000 --- a/data/hfopenllm_v2/khoantap/llama-linear-1-0.5-0.5-merge/5193ab4d-1627-43b5-bfb7-89e08ea1f810.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_llama-linear-1-0.5-0.5-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-linear-1-0.5-0.5-merge", - "id": "khoantap/llama-linear-1-0.5-0.5-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3635 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/llama-slerp-merge/598faeda-48fb-43a8-aaa9-849d5dfcea79.json b/data/hfopenllm_v2/khoantap/llama-slerp-merge/598faeda-48fb-43a8-aaa9-849d5dfcea79.json deleted file mode 100644 index b429d2b76..000000000 --- a/data/hfopenllm_v2/khoantap/llama-slerp-merge/598faeda-48fb-43a8-aaa9-849d5dfcea79.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_llama-slerp-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-slerp-merge", - "id": "khoantap/llama-slerp-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.498 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5783 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4053 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3678 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/moe-out-merge/d1afa2fb-1256-4dd3-b13b-802917bf481b.json b/data/hfopenllm_v2/khoantap/moe-out-merge/d1afa2fb-1256-4dd3-b13b-802917bf481b.json deleted file mode 100644 index 7471ab85d..000000000 --- a/data/hfopenllm_v2/khoantap/moe-out-merge/d1afa2fb-1256-4dd3-b13b-802917bf481b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_moe-out-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "moe-out-merge", - "id": "khoantap/moe-out-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 19.305 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5151 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4063 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3348 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khulaifi95/Llama-3.1-8B-Reason-Blend-888k/397c9bc3-0af5-453c-9b68-5360783dfbf7.json b/data/hfopenllm_v2/khulaifi95/Llama-3.1-8B-Reason-Blend-888k/397c9bc3-0af5-453c-9b68-5360783dfbf7.json deleted file mode 100644 index b28191893..000000000 --- a/data/hfopenllm_v2/khulaifi95/Llama-3.1-8B-Reason-Blend-888k/397c9bc3-0af5-453c-9b68-5360783dfbf7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khulaifi95_Llama-3.1-8B-Reason-Blend-888k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Reason-Blend-888k", - "id": "khulaifi95/Llama-3.1-8B-Reason-Blend-888k", - "developer": "khulaifi95", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5832 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.479 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kms7530/chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1/9bb39652-c79a-42bf-b6d8-c4ed6174a4c7.json b/data/hfopenllm_v2/kms7530/chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1/9bb39652-c79a-42bf-b6d8-c4ed6174a4c7.json deleted file mode 100644 index 06712c0c5..000000000 --- a/data/hfopenllm_v2/kms7530/chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1/9bb39652-c79a-42bf-b6d8-c4ed6174a4c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kms7530_chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1", - "id": "kms7530/chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1", - "developer": "kms7530", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 9.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5455 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3821 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kms7530/chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath/7e793244-b746-4aa4-a401-dcf5884f61a4.json b/data/hfopenllm_v2/kms7530/chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath/7e793244-b746-4aa4-a401-dcf5884f61a4.json deleted file mode 100644 index a2ee08c81..000000000 --- a/data/hfopenllm_v2/kms7530/chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath/7e793244-b746-4aa4-a401-dcf5884f61a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kms7530_chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath", - "id": "kms7530/chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath", - "developer": "kms7530", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 4.132 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4863 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4987 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3983 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3481 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1/26a8da03-debd-41e3-8ee1-2827d76b26ca.json b/data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1/26a8da03-debd-41e3-8ee1-2827d76b26ca.json deleted file mode 100644 index 969ace9a0..000000000 --- a/data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1/26a8da03-debd-41e3-8ee1-2827d76b26ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kms7530_chemeng_qwen-math-7b_24_1_100_1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "chemeng_qwen-math-7b_24_1_100_1", - "id": "kms7530/chemeng_qwen-math-7b_24_1_100_1", - "developer": "kms7530", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 8.911 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2111 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2158 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1_nonmath/e214c326-dd84-4915-bba1-faaafbb026b2.json b/data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1_nonmath/e214c326-dd84-4915-bba1-faaafbb026b2.json deleted file mode 100644 index ce14e36aa..000000000 --- a/data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1_nonmath/e214c326-dd84-4915-bba1-faaafbb026b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kms7530_chemeng_qwen-math-7b_24_1_100_1_nonmath/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "chemeng_qwen-math-7b_24_1_100_1_nonmath", - "id": "kms7530/chemeng_qwen-math-7b_24_1_100_1_nonmath", - "developer": "kms7530", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 15.231 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3097 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2452 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kno10/ende-chat-0.0.5/98a5ea0a-6e45-48f8-8219-32099b9fa9d0.json 
b/data/hfopenllm_v2/kno10/ende-chat-0.0.5/98a5ea0a-6e45-48f8-8219-32099b9fa9d0.json deleted file mode 100644 index dfb3a0c4e..000000000 --- a/data/hfopenllm_v2/kno10/ende-chat-0.0.5/98a5ea0a-6e45-48f8-8219-32099b9fa9d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kno10_ende-chat-0.0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ende-chat-0.0.5", - "id": "kno10/ende-chat-0.0.5", - "developer": "kno10", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.891 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kno10/ende-chat-0.0.7/40d7d17d-2d41-4d23-83c1-ab5f3320e36e.json b/data/hfopenllm_v2/kno10/ende-chat-0.0.7/40d7d17d-2d41-4d23-83c1-ab5f3320e36e.json deleted file mode 100644 index eb7bd3e9e..000000000 --- a/data/hfopenllm_v2/kno10/ende-chat-0.0.7/40d7d17d-2d41-4d23-83c1-ab5f3320e36e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/kno10_ende-chat-0.0.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ende-chat-0.0.7", - "id": "kno10/ende-chat-0.0.7", - "developer": "kno10", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.891 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3861 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1966 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kyutai/helium-1-preview-2b/d881a83a-9ba8-4919-8b89-45f5a7220621.json b/data/hfopenllm_v2/kyutai/helium-1-preview-2b/d881a83a-9ba8-4919-8b89-45f5a7220621.json deleted file mode 100644 index 9db17dbd8..000000000 --- a/data/hfopenllm_v2/kyutai/helium-1-preview-2b/d881a83a-9ba8-4919-8b89-45f5a7220621.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kyutai_helium-1-preview-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "helium-1-preview-2b", - "id": "kyutai/helium-1-preview-2b", - "developer": "kyutai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "HeliumForCausalLM", - "params_billions": 2.173 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2614 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3638 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kz919/QwQ-0.5B-Distilled-SFT/d6c966a1-7927-424a-9886-b98688d27e6f.json b/data/hfopenllm_v2/kz919/QwQ-0.5B-Distilled-SFT/d6c966a1-7927-424a-9886-b98688d27e6f.json deleted file mode 100644 index 34900ada9..000000000 --- a/data/hfopenllm_v2/kz919/QwQ-0.5B-Distilled-SFT/d6c966a1-7927-424a-9886-b98688d27e6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kz919_QwQ-0.5B-Distilled-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-0.5B-Distilled-SFT", - "id": "kz919/QwQ-0.5B-Distilled-SFT", - "developer": "kz919", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ladydaina/ECE-FDF/c09fe163-a7f7-4b6b-b407-ee8d698b2ee8.json b/data/hfopenllm_v2/ladydaina/ECE-FDF/c09fe163-a7f7-4b6b-b407-ee8d698b2ee8.json deleted file mode 100644 index 298900d6a..000000000 --- a/data/hfopenllm_v2/ladydaina/ECE-FDF/c09fe163-a7f7-4b6b-b407-ee8d698b2ee8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ladydaina_ECE-FDF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-FDF", - "id": "ladydaina/ECE-FDF", - "developer": "ladydaina", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4504 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3007 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/laislemke/LLaMA-2-vicuna-7b-slerp/b3979c7f-0596-4a24-b264-73a17ba19821.json b/data/hfopenllm_v2/laislemke/LLaMA-2-vicuna-7b-slerp/b3979c7f-0596-4a24-b264-73a17ba19821.json deleted file mode 100644 index d0f53a22e..000000000 --- a/data/hfopenllm_v2/laislemke/LLaMA-2-vicuna-7b-slerp/b3979c7f-0596-4a24-b264-73a17ba19821.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/laislemke_LLaMA-2-vicuna-7b-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA-2-vicuna-7b-slerp", - "id": "laislemke/LLaMA-2-vicuna-7b-slerp", - "developer": "laislemke", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2932 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", 
- "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2986 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3833 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1342 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-FT-V5-MUSR/f6156893-92e7-4c4f-bff4-8b6d774ecbd8.json b/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-FT-V5-MUSR/f6156893-92e7-4c4f-bff4-8b6d774ecbd8.json deleted file mode 100644 index d16b8f1ef..000000000 --- a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-FT-V5-MUSR/f6156893-92e7-4c4f-bff4-8b6d774ecbd8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lalainy_ECE-PRYMMAL-0.5B-FT-V5-MUSR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V5-MUSR", - "id": "lalainy/ECE-PRYMMAL-0.5B-FT-V5-MUSR", - "developer": "lalainy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2138 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3269 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-SLERP-V4/8b1c19e0-8b47-46ae-8bf3-f84c7d3a9c0e.json b/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-SLERP-V4/8b1c19e0-8b47-46ae-8bf3-f84c7d3a9c0e.json deleted file mode 100644 index bf518c502..000000000 --- a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-SLERP-V4/8b1c19e0-8b47-46ae-8bf3-f84c7d3a9c0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lalainy_ECE-PRYMMAL-0.5B-SLERP-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-SLERP-V4", - "id": "lalainy/ECE-PRYMMAL-0.5B-SLERP-V4", - "developer": "lalainy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1/6221102e-4e8c-46dd-8c03-fa9e92b7e4ea.json b/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1/6221102e-4e8c-46dd-8c03-fa9e92b7e4ea.json deleted file mode 100644 index e44f2c10f..000000000 --- a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1/6221102e-4e8c-46dd-8c03-fa9e92b7e4ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lalainy_ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1", - "id": "lalainy/ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1", - "developer": "lalainy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1437 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3646 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V3/329e5e91-10ba-4795-ae86-dda95e698b4f.json b/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V3/329e5e91-10ba-4795-ae86-dda95e698b4f.json deleted file mode 100644 index 97784607c..000000000 --- a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V3/329e5e91-10ba-4795-ae86-dda95e698b4f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lalainy_ECE-PRYMMAL-YL-1B-SLERP-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-1B-SLERP-V3", - "id": "lalainy/ECE-PRYMMAL-YL-1B-SLERP-V3", - "developer": "lalainy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4213 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2931 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V4/3fe89b13-135d-4790-871d-74e7a28ea2e9.json b/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V4/3fe89b13-135d-4790-871d-74e7a28ea2e9.json deleted file mode 100644 index e1cc670e9..000000000 --- a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V4/3fe89b13-135d-4790-871d-74e7a28ea2e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lalainy_ECE-PRYMMAL-YL-1B-SLERP-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-1B-SLERP-V4", - "id": "lalainy/ECE-PRYMMAL-YL-1B-SLERP-V4", - "developer": "lalainy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3324 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1005 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4306 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V1/4b807741-f1b9-4964-9bc9-bb93f9b34217.json b/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V1/4b807741-f1b9-4964-9bc9-bb93f9b34217.json deleted file mode 100644 index fb7527528..000000000 --- a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V1/4b807741-f1b9-4964-9bc9-bb93f9b34217.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lalainy_ECE-PRYMMAL-YL-6B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-6B-SLERP-V1", - "id": "lalainy/ECE-PRYMMAL-YL-6B-SLERP-V1", - "developer": "lalainy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3264 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4629 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3214 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V2/c52a8a4d-be91-4a0d-8cd5-8473a42f0978.json b/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V2/c52a8a4d-be91-4a0d-8cd5-8473a42f0978.json deleted file mode 100644 index c4e7de98b..000000000 --- a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V2/c52a8a4d-be91-4a0d-8cd5-8473a42f0978.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lalainy_ECE-PRYMMAL-YL-6B-SLERP-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-6B-SLERP-V2", - "id": "lalainy/ECE-PRYMMAL-YL-6B-SLERP-V2", - "developer": "lalainy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4629 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3214 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/langgptai/Qwen-las-v0.1/f6e157c4-0ce9-41c9-b885-9222d894ff0c.json b/data/hfopenllm_v2/langgptai/Qwen-las-v0.1/f6e157c4-0ce9-41c9-b885-9222d894ff0c.json deleted file mode 100644 index 0046e64f5..000000000 --- a/data/hfopenllm_v2/langgptai/Qwen-las-v0.1/f6e157c4-0ce9-41c9-b885-9222d894ff0c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/langgptai_Qwen-las-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-las-v0.1", - "id": "langgptai/Qwen-las-v0.1", - "developer": "langgptai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 7.901 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2325 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/langgptai/qwen1.5-7b-chat-sa-v0.1/fe52a94a-5324-4b59-accc-dfd1f9d4aead.json b/data/hfopenllm_v2/langgptai/qwen1.5-7b-chat-sa-v0.1/fe52a94a-5324-4b59-accc-dfd1f9d4aead.json deleted file mode 100644 index 17240757f..000000000 --- a/data/hfopenllm_v2/langgptai/qwen1.5-7b-chat-sa-v0.1/fe52a94a-5324-4b59-accc-dfd1f9d4aead.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/langgptai_qwen1.5-7b-chat-sa-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen1.5-7b-chat-sa-v0.1", - "id": "langgptai/qwen1.5-7b-chat-sa-v0.1", - "developer": "langgptai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 15.443 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4325 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2993 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lars1234/Mistral-Small-24B-Instruct-2501-writer/1241f5e3-54eb-429e-b109-a5e163e39eda.json 
b/data/hfopenllm_v2/lars1234/Mistral-Small-24B-Instruct-2501-writer/1241f5e3-54eb-429e-b109-a5e163e39eda.json deleted file mode 100644 index 92337f309..000000000 --- a/data/hfopenllm_v2/lars1234/Mistral-Small-24B-Instruct-2501-writer/1241f5e3-54eb-429e-b109-a5e163e39eda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lars1234_Mistral-Small-24B-Instruct-2501-writer/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-24B-Instruct-2501-writer", - "id": "lars1234/Mistral-Small-24B-Instruct-2501-writer", - "developer": "lars1234", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6565 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6733 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5448 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/leafspark/Llama-3.1-8B-MultiReflection-Instruct/8ccc7c8c-1d14-45bb-9a6b-f8f69e506139.json b/data/hfopenllm_v2/leafspark/Llama-3.1-8B-MultiReflection-Instruct/8ccc7c8c-1d14-45bb-9a6b-f8f69e506139.json deleted file mode 100644 
index b9d1ba6c5..000000000 --- a/data/hfopenllm_v2/leafspark/Llama-3.1-8B-MultiReflection-Instruct/8ccc7c8c-1d14-45bb-9a6b-f8f69e506139.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/leafspark_Llama-3.1-8B-MultiReflection-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-MultiReflection-Instruct", - "id": "leafspark/Llama-3.1-8B-MultiReflection-Instruct", - "developer": "leafspark", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7125 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5009 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-9B/5531b59e-24c0-41af-ab6b-d6a5e38b0a98.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-9B/5531b59e-24c0-41af-ab6b-d6a5e38b0a98.json deleted file mode 100644 index 1a22b28f0..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-9B/5531b59e-24c0-41af-ab6b-d6a5e38b0a98.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-9B", - "id": "lemon07r/Gemma-2-Ataraxy-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3009 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5931 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Advanced-9B/63e82cb3-2f6f-4617-abb7-ae093bc27830.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Advanced-9B/63e82cb3-2f6f-4617-abb7-ae093bc27830.json deleted file mode 100644 index d0d68741f..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Advanced-9B/63e82cb3-2f6f-4617-abb7-ae093bc27830.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-Advanced-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-Advanced-9B", - "id": "lemon07r/Gemma-2-Ataraxy-Advanced-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5516 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5889 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Remix-9B/0feb74e6-40d4-472d-9233-27faa2d3f802.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Remix-9B/0feb74e6-40d4-472d-9233-27faa2d3f802.json deleted file mode 100644 index 3e6258067..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Remix-9B/0feb74e6-40d4-472d-9233-27faa2d3f802.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-Remix-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-Remix-9B", - "id": "lemon07r/Gemma-2-Ataraxy-Remix-9B", - 
"developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7083 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5892 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2-9B/e74dd005-c9b5-45c9-b7f5-455c3110e09b.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2-9B/e74dd005-c9b5-45c9-b7f5-455c3110e09b.json deleted file mode 100644 index 48d6b6abd..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2-9B/e74dd005-c9b5-45c9-b7f5-455c3110e09b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v2-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v2-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5766 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3484 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4221 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2a-9B/d094bf6f-9952-45c7-995e-d7eda07f4668.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2a-9B/d094bf6f-9952-45c7-995e-d7eda07f4668.json deleted file mode 100644 index 9dfdf8d74..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2a-9B/d094bf6f-9952-45c7-995e-d7eda07f4668.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v2a-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v2a-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v2a-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1595 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3165 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2f-9B/0e5f3393-8a6a-4f2f-948a-a37ae4d8fdeb.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2f-9B/0e5f3393-8a6a-4f2f-948a-a37ae4d8fdeb.json deleted file mode 100644 index a5ae93322..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2f-9B/0e5f3393-8a6a-4f2f-948a-a37ae4d8fdeb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v2f-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v2f-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v2f-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5193 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3503 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3-Advanced-9B/f91982ac-0cab-415a-8503-e090d195bd05.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3-Advanced-9B/f91982ac-0cab-415a-8503-e090d195bd05.json deleted file mode 100644 index c793e8539..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3-Advanced-9B/f91982ac-0cab-415a-8503-e090d195bd05.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v3-Advanced-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v3-Advanced-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v3-Advanced-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6602 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4196 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3b-9B/fb1af66e-7828-495b-8277-5cff77c3070e.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3b-9B/fb1af66e-7828-495b-8277-5cff77c3070e.json deleted file mode 100644 index 5ef333acb..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3b-9B/fb1af66e-7828-495b-8277-5cff77c3070e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v3b-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v3b-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v3b-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6809 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3i-9B/ac84c157-4d11-43c1-8731-b1e5cfa91668.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3i-9B/ac84c157-4d11-43c1-8731-b1e5cfa91668.json deleted file mode 100644 index fb3af6e05..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3i-9B/ac84c157-4d11-43c1-8731-b1e5cfa91668.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v3i-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v3i-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v3i-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3j-9B/bbc812dd-9a9c-4f99-b813-50361025eea3.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3j-9B/bbc812dd-9a9c-4f99-b813-50361025eea3.json deleted file mode 100644 index 627159167..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3j-9B/bbc812dd-9a9c-4f99-b813-50361025eea3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v3j-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v3j-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v3j-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5632 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1692 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4-Advanced-9B/fc818799-49d5-4fca-b131-ebe8d5d831f1.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4-Advanced-9B/fc818799-49d5-4fca-b131-ebe8d5d831f1.json deleted file mode 100644 index 5762b4771..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4-Advanced-9B/fc818799-49d5-4fca-b131-ebe8d5d831f1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v4-Advanced-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v4-Advanced-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v4-Advanced-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7015 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6024 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 
- } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4a-Advanced-9B/33349989-8573-4d71-ae0f-99691fdaffc3.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4a-Advanced-9B/33349989-8573-4d71-ae0f-99691fdaffc3.json deleted file mode 100644 index f1c4a9342..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4a-Advanced-9B/33349989-8573-4d71-ae0f-99691fdaffc3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v4a-Advanced-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v4a-Advanced-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v4a-Advanced-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7135 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5988 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4309 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4b-9B/91551de5-d8ac-4c0d-b9b4-3627db947f0e.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4b-9B/91551de5-d8ac-4c0d-b9b4-3627db947f0e.json deleted file mode 100644 index de6307f19..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4b-9B/91551de5-d8ac-4c0d-b9b4-3627db947f0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v4b-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v4b-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v4b-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6878 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6039 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2334 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - }, - { - "evaluation_name": "MMLU-PRO", 
- "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4c-9B/c2d2c1f4-aaab-45f1-b3f6-5b4ea56b696e.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4c-9B/c2d2c1f4-aaab-45f1-b3f6-5b4ea56b696e.json deleted file mode 100644 index ddb47c621..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4c-9B/c2d2c1f4-aaab-45f1-b3f6-5b4ea56b696e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v4c-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v4c-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v4c-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6945 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4395 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4d-9B/36821a8b-af18-4631-b4b0-7e4b37bb194b.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4d-9B/36821a8b-af18-4631-b4b0-7e4b37bb194b.json deleted file mode 100644 index 911a9ca74..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4d-9B/36821a8b-af18-4631-b4b0-7e4b37bb194b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v4d-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v4d-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v4d-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2334 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/lemon07r/Llama-3-RedMagic4-8B/e402d129-f4f1-4b95-b079-4f30936119aa.json b/data/hfopenllm_v2/lemon07r/Llama-3-RedMagic4-8B/e402d129-f4f1-4b95-b079-4f30936119aa.json deleted file mode 100644 index 4d4ff0851..000000000 --- a/data/hfopenllm_v2/lemon07r/Llama-3-RedMagic4-8B/e402d129-f4f1-4b95-b079-4f30936119aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Llama-3-RedMagic4-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-RedMagic4-8B", - "id": "lemon07r/Llama-3-RedMagic4-8B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3766 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/llama-3-NeuralMahou-8b/814e1ea7-a639-4b05-9208-0bf537ea5479.json b/data/hfopenllm_v2/lemon07r/llama-3-NeuralMahou-8b/814e1ea7-a639-4b05-9208-0bf537ea5479.json deleted file mode 100644 index 27784d919..000000000 --- 
a/data/hfopenllm_v2/lemon07r/llama-3-NeuralMahou-8b/814e1ea7-a639-4b05-9208-0bf537ea5479.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_llama-3-NeuralMahou-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-NeuralMahou-8b", - "id": "lemon07r/llama-3-NeuralMahou-8b", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4901 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3B/35a50d36-31d0-454b-a13c-80ca26945f94.json b/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3B/35a50d36-31d0-454b-a13c-80ca26945f94.json deleted file mode 100644 index 83e427178..000000000 --- a/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3B/35a50d36-31d0-454b-a13c-80ca26945f94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_ECE-EIFFEL-3B/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-EIFFEL-3B", - "id": "lesubra/ECE-EIFFEL-3B", - "developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3469 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5102 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4362 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3821 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv2/87347017-4ff1-4bd3-a1d7-8f3999061209.json b/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv2/87347017-4ff1-4bd3-a1d7-8f3999061209.json deleted file mode 100644 index 3d51d39fa..000000000 --- a/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv2/87347017-4ff1-4bd3-a1d7-8f3999061209.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_ECE-EIFFEL-3Bv2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-EIFFEL-3Bv2", - "id": "lesubra/ECE-EIFFEL-3Bv2", - 
"developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3013 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5424 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4443 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3999 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv3/976184ed-c4ed-4898-83c7-521a8a8309ac.json b/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv3/976184ed-c4ed-4898-83c7-521a8a8309ac.json deleted file mode 100644 index 33d7b6db8..000000000 --- a/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv3/976184ed-c4ed-4898-83c7-521a8a8309ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_ECE-EIFFEL-3Bv3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-EIFFEL-3Bv3", - "id": "lesubra/ECE-EIFFEL-3Bv3", - "developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5469 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4675 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V1/fa52f072-7725-4a4e-b728-042e5897a1bd.json b/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V1/fa52f072-7725-4a4e-b728-042e5897a1bd.json deleted file mode 100644 index 692fe6956..000000000 --- a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V1/fa52f072-7725-4a4e-b728-042e5897a1bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_ECE-PRYMMAL-3B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-3B-SLERP-V1", - "id": "lesubra/ECE-PRYMMAL-3B-SLERP-V1", - "developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2933 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5341 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1662 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V2/6374dcee-301c-4f28-9316-82ed8e693089.json b/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V2/6374dcee-301c-4f28-9316-82ed8e693089.json deleted file mode 100644 index 91ea20f99..000000000 --- a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V2/6374dcee-301c-4f28-9316-82ed8e693089.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_ECE-PRYMMAL-3B-SLERP-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-3B-SLERP-V2", - "id": "lesubra/ECE-PRYMMAL-3B-SLERP-V2", - "developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2933 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5341 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1662 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V1/b7c95cb4-f32f-466e-a28c-32afd9ec5578.json b/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V1/b7c95cb4-f32f-466e-a28c-32afd9ec5578.json deleted file mode 100644 index 7c02e49b7..000000000 --- a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V1/b7c95cb4-f32f-466e-a28c-32afd9ec5578.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_ECE-PRYMMAL-3B-SLERP_2-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-3B-SLERP_2-V1", - "id": "lesubra/ECE-PRYMMAL-3B-SLERP_2-V1", - "developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - }, - { - "evaluation_name": "MATH 
Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1677 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V2/bddd742b-f7c9-44aa-ad2f-83f51a4625be.json b/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V2/bddd742b-f7c9-44aa-ad2f-83f51a4625be.json deleted file mode 100644 index 7ac9d72fd..000000000 --- a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V2/bddd742b-f7c9-44aa-ad2f-83f51a4625be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_ECE-PRYMMAL-3B-SLERP_2-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-3B-SLERP_2-V2", - "id": "lesubra/ECE-PRYMMAL-3B-SLERP_2-V2", - "developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 
5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1677 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/merge-test/099af0ee-c06b-4435-8f97-27681f3eddff.json b/data/hfopenllm_v2/lesubra/merge-test/099af0ee-c06b-4435-8f97-27681f3eddff.json deleted file mode 100644 index 0105f3bb4..000000000 --- a/data/hfopenllm_v2/lesubra/merge-test/099af0ee-c06b-4435-8f97-27681f3eddff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_merge-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merge-test", - "id": "lesubra/merge-test", - "developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4419 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-full/fa826f3a-8688-4518-8d44-68189abb47ba.json b/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-full/fa826f3a-8688-4518-8d44-68189abb47ba.json deleted file mode 100644 index 122789813..000000000 --- a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-full/fa826f3a-8688-4518-8d44-68189abb47ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-full/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "suzume-llama-3-8B-multilingual-orpo-borda-full", - "id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-full", - "developer": "lightblue", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5817 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-half/10d29dc0-3486-40df-9933-1ce8f0fabaa2.json b/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-half/10d29dc0-3486-40df-9933-1ce8f0fabaa2.json deleted file mode 100644 index b4ab0a93e..000000000 --- a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-half/10d29dc0-3486-40df-9933-1ce8f0fabaa2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-half/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "suzume-llama-3-8B-multilingual-orpo-borda-half", - "id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-half", - "developer": "lightblue", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4707 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3614 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top25/741ff375-3392-461e-a9b0-e0dab4e6e9f8.json b/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top25/741ff375-3392-461e-a9b0-e0dab4e6e9f8.json deleted file mode 100644 index 9a8eccba8..000000000 --- a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top25/741ff375-3392-461e-a9b0-e0dab4e6e9f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-top25/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "suzume-llama-3-8B-multilingual-orpo-borda-top25", - "id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top25", - "developer": "lightblue", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6637 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top75/c3d709de-118d-40c2-ab89-040efedd7fdb.json b/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top75/c3d709de-118d-40c2-ab89-040efedd7fdb.json deleted file mode 100644 index 669b89390..000000000 --- a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top75/c3d709de-118d-40c2-ab89-040efedd7fdb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-top75/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "suzume-llama-3-8B-multilingual-orpo-borda-top75", - "id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top75", - "developer": "lightblue", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6687 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4833 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual/9be3dd27-93fa-49e9-a628-5a77a8a3bb9a.json b/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual/9be3dd27-93fa-49e9-a628-5a77a8a3bb9a.json deleted file mode 100644 index 919b41cbc..000000000 --- a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual/9be3dd27-93fa-49e9-a628-5a77a8a3bb9a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lightblue_suzume-llama-3-8B-multilingual/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "suzume-llama-3-8B-multilingual", - "id": "lightblue/suzume-llama-3-8B-multilingual", - "developer": "lightblue", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6678 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_145_/be850d1b-bf75-4c34-830f-8881792ac842.json b/data/hfopenllm_v2/lkoenig/BBAI_145_/be850d1b-bf75-4c34-830f-8881792ac842.json deleted file mode 100644 index 302154c6b..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_145_/be850d1b-bf75-4c34-830f-8881792ac842.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_145_/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_145_", - "id": "lkoenig/BBAI_145_", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5567 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_200_Gemma/6b644b97-4fc3-4826-9ea9-68be1dc8e947.json b/data/hfopenllm_v2/lkoenig/BBAI_200_Gemma/6b644b97-4fc3-4826-9ea9-68be1dc8e947.json deleted file mode 100644 index 7865c5bfc..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_200_Gemma/6b644b97-4fc3-4826-9ea9-68be1dc8e947.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_200_Gemma/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_200_Gemma", - "id": "lkoenig/BBAI_200_Gemma", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 19.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0705 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_212_QwenLawLo/861d41f1-6d33-4e07-96ea-2c39a36c4b63.json 
b/data/hfopenllm_v2/lkoenig/BBAI_212_QwenLawLo/861d41f1-6d33-4e07-96ea-2c39a36c4b63.json deleted file mode 100644 index 2d76a0d19..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_212_QwenLawLo/861d41f1-6d33-4e07-96ea-2c39a36c4b63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_212_QwenLawLo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_212_QwenLawLo", - "id": "lkoenig/BBAI_212_QwenLawLo", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4566 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5574 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_212_Qwencore/7501b038-4847-45bc-8b92-6800d7a58c1e.json b/data/hfopenllm_v2/lkoenig/BBAI_212_Qwencore/7501b038-4847-45bc-8b92-6800d7a58c1e.json deleted file mode 100644 index 3b77ae393..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_212_Qwencore/7501b038-4847-45bc-8b92-6800d7a58c1e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ 
- "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_212_Qwencore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_212_Qwencore", - "id": "lkoenig/BBAI_212_Qwencore", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4384 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5569 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3489 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_230_Xiaqwen/db48206d-700b-45f3-b597-8752110113b5.json b/data/hfopenllm_v2/lkoenig/BBAI_230_Xiaqwen/db48206d-700b-45f3-b597-8752110113b5.json deleted file mode 100644 index af9941d74..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_230_Xiaqwen/db48206d-700b-45f3-b597-8752110113b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_230_Xiaqwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_230_Xiaqwen", - "id": "lkoenig/BBAI_230_Xiaqwen", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5578 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4481 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_375_QwenDyancabs/b52b76e4-9dec-4336-88b1-d98b95b95d2a.json b/data/hfopenllm_v2/lkoenig/BBAI_375_QwenDyancabs/b52b76e4-9dec-4336-88b1-d98b95b95d2a.json deleted file mode 100644 index 687f6c068..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_375_QwenDyancabs/b52b76e4-9dec-4336-88b1-d98b95b95d2a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_375_QwenDyancabs/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_375_QwenDyancabs", - "id": "lkoenig/BBAI_375_QwenDyancabs", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4566 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5571 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_456_QwenKoen/ba9ec2ea-2bce-4999-9e48-e1d0795b31d0.json b/data/hfopenllm_v2/lkoenig/BBAI_456_QwenKoen/ba9ec2ea-2bce-4999-9e48-e1d0795b31d0.json deleted file mode 100644 index b1a7b3d3e..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_456_QwenKoen/ba9ec2ea-2bce-4999-9e48-e1d0795b31d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_456_QwenKoen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_456_QwenKoen", - "id": "lkoenig/BBAI_456_QwenKoen", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4469 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_7B_KoenQwenDyan/724221ce-d7b2-43cb-8e16-72ac529a7b60.json b/data/hfopenllm_v2/lkoenig/BBAI_7B_KoenQwenDyan/724221ce-d7b2-43cb-8e16-72ac529a7b60.json deleted file mode 100644 index 41f147ae3..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_7B_KoenQwenDyan/724221ce-d7b2-43cb-8e16-72ac529a7b60.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_7B_KoenQwenDyan/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_7B_KoenQwenDyan", - "id": "lkoenig/BBAI_7B_KoenQwenDyan", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5807 - } - }, - { - "evaluation_name": "BBH", 
- "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5537 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_7B_Qwen2.5koen/552f3814-d071-4d00-a895-b739dffdcb2d.json b/data/hfopenllm_v2/lkoenig/BBAI_7B_Qwen2.5koen/552f3814-d071-4d00-a895-b739dffdcb2d.json deleted file mode 100644 index b762f6c72..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_7B_Qwen2.5koen/552f3814-d071-4d00-a895-b739dffdcb2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_7B_Qwen2.5koen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_7B_Qwen2.5koen", - "id": "lkoenig/BBAI_7B_Qwen2.5koen", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4485 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyanKoenLo/d3819133-bae8-493d-9a86-aee67da5d115.json b/data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyanKoenLo/d3819133-bae8-493d-9a86-aee67da5d115.json deleted file mode 100644 index 536de7261..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyanKoenLo/d3819133-bae8-493d-9a86-aee67da5d115.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_7B_QwenDyanKoenLo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_7B_QwenDyanKoenLo", - "id": "lkoenig/BBAI_7B_QwenDyanKoenLo", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4663 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5562 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4465 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyancabsLAW/5c3a022f-7221-4b4f-ab67-d5b69c558434.json b/data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyancabsLAW/5c3a022f-7221-4b4f-ab67-d5b69c558434.json deleted file mode 100644 index 9a62f605c..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyancabsLAW/5c3a022f-7221-4b4f-ab67-d5b69c558434.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_7B_QwenDyancabsLAW/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_7B_QwenDyancabsLAW", - "id": "lkoenig/BBAI_7B_QwenDyancabsLAW", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5579 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3678 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4471 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/c161b868-746f-4d88-9f41-eb8283a7b87a.json b/data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/c161b868-746f-4d88-9f41-eb8283a7b87a.json deleted file mode 100644 index 240e3a159..000000000 --- a/data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/c161b868-746f-4d88-9f41-eb8283a7b87a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/llmat_Mistral-v0.3-7B-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-v0.3-7B-ORPO", - "id": "llmat/Mistral-v0.3-7B-ORPO", - "developer": "llmat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.377 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2278 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/f79a76fc-09ff-48c8-b0e7-5f18e0750e6d.json b/data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/f79a76fc-09ff-48c8-b0e7-5f18e0750e6d.json deleted file mode 100644 index ffd455c9c..000000000 --- a/data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/f79a76fc-09ff-48c8-b0e7-5f18e0750e6d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/llmat_Mistral-v0.3-7B-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-v0.3-7B-ORPO", - "id": "llmat/Mistral-v0.3-7B-ORPO", - "developer": "llmat", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4005 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2301 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V5/39f4d1ab-fd42-4746-b949-9666ce32f9d1.json b/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V5/39f4d1ab-fd42-4746-b949-9666ce32f9d1.json deleted file mode 100644 index cc9cfb3e3..000000000 --- a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V5/39f4d1ab-fd42-4746-b949-9666ce32f9d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/llnYou_ECE-PRYMMAL-YL-1B-SLERP-V5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-1B-SLERP-V5", - "id": "llnYou/ECE-PRYMMAL-YL-1B-SLERP-V5", - "developer": "llnYou", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3313 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.3868 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2931 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V6/8348f316-9109-4229-9fee-edc02431befa.json b/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V6/8348f316-9109-4229-9fee-edc02431befa.json deleted file mode 100644 index 48d4a3877..000000000 --- a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V6/8348f316-9109-4229-9fee-edc02431befa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/llnYou_ECE-PRYMMAL-YL-1B-SLERP-V6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-1B-SLERP-V6", - "id": "llnYou/ECE-PRYMMAL-YL-1B-SLERP-V6", - "developer": "llnYou", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.357 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3944 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V1/6b2346c6-5fbf-4195-b3bb-66bbd446ca53.json b/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V1/6b2346c6-5fbf-4195-b3bb-66bbd446ca53.json deleted file mode 100644 index 914529969..000000000 --- a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V1/6b2346c6-5fbf-4195-b3bb-66bbd446ca53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-3B-SLERP-V1", - "id": "llnYou/ECE-PRYMMAL-YL-3B-SLERP-V1", - "developer": "llnYou", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.81 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2346 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.285 - } - } - ] -} \ No newline at end 
of file diff --git a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V2/8645ffc1-6487-4205-b8b0-e980e094ac6c.json b/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V2/8645ffc1-6487-4205-b8b0-e980e094ac6c.json deleted file mode 100644 index dd572deaa..000000000 --- a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V2/8645ffc1-6487-4205-b8b0-e980e094ac6c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-3B-SLERP-V2", - "id": "llnYou/ECE-PRYMMAL-YL-3B-SLERP-V2", - "developer": "llnYou", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.81 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2309 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3588 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V3/2c6d1e57-7673-4a86-808e-6ff6a7146a11.json b/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V3/2c6d1e57-7673-4a86-808e-6ff6a7146a11.json deleted file 
mode 100644 index b5bc635f3..000000000 --- a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V3/2c6d1e57-7673-4a86-808e-6ff6a7146a11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-3B-SLERP-V3", - "id": "llnYou/ECE-PRYMMAL-YL-3B-SLERP-V3", - "developer": "llnYou", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3581 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5473 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lmsys/vicuna-13b-v1.3/64ab8b1a-62be-4561-8f0c-e42f1fe37178.json b/data/hfopenllm_v2/lmsys/vicuna-13b-v1.3/64ab8b1a-62be-4561-8f0c-e42f1fe37178.json deleted file mode 100644 index 51fcf6393..000000000 --- a/data/hfopenllm_v2/lmsys/vicuna-13b-v1.3/64ab8b1a-62be-4561-8f0c-e42f1fe37178.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/lmsys_vicuna-13b-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "vicuna-13b-v1.3", - "id": "lmsys/vicuna-13b-v1.3", - "developer": "lmsys", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3344 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lmsys/vicuna-7b-v1.3/3eb22885-eb7c-4c85-b79f-cd47ffacd551.json b/data/hfopenllm_v2/lmsys/vicuna-7b-v1.3/3eb22885-eb7c-4c85-b79f-cd47ffacd551.json deleted file mode 100644 index adbfd12b9..000000000 --- a/data/hfopenllm_v2/lmsys/vicuna-7b-v1.3/3eb22885-eb7c-4c85-b79f-cd47ffacd551.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lmsys_vicuna-7b-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "vicuna-7b-v1.3", - "id": "lmsys/vicuna-7b-v1.3", - "developer": "lmsys", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2909 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1838 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lmsys/vicuna-7b-v1.5/8956d608-c627-469b-943d-bfad6c7382af.json b/data/hfopenllm_v2/lmsys/vicuna-7b-v1.5/8956d608-c627-469b-943d-bfad6c7382af.json deleted file mode 100644 index 54b24d075..000000000 --- a/data/hfopenllm_v2/lmsys/vicuna-7b-v1.5/8956d608-c627-469b-943d-bfad6c7382af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lmsys_vicuna-7b-v1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "vicuna-7b-v1.5", - "id": "lmsys/vicuna-7b-v1.5", - "developer": "lmsys", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lodrick-the-lafted/llama-3.1-8b-instruct-ortho-v7/9ff060c8-d4fa-4880-a0cd-9581f5c2f574.json b/data/hfopenllm_v2/lodrick-the-lafted/llama-3.1-8b-instruct-ortho-v7/9ff060c8-d4fa-4880-a0cd-9581f5c2f574.json deleted file mode 100644 index 2ad864b95..000000000 --- a/data/hfopenllm_v2/lodrick-the-lafted/llama-3.1-8b-instruct-ortho-v7/9ff060c8-d4fa-4880-a0cd-9581f5c2f574.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lodrick-the-lafted_llama-3.1-8b-instruct-ortho-v7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3.1-8b-instruct-ortho-v7", - "id": "lodrick-the-lafted/llama-3.1-8b-instruct-ortho-v7", - "developer": "lodrick-the-lafted", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3907 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1974 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lordjia/Llama-3-Cantonese-8B-Instruct/e3d6b3d7-a231-40c1-bac9-0b7fcb478bca.json b/data/hfopenllm_v2/lordjia/Llama-3-Cantonese-8B-Instruct/e3d6b3d7-a231-40c1-bac9-0b7fcb478bca.json deleted file mode 100644 index eef3e165a..000000000 --- a/data/hfopenllm_v2/lordjia/Llama-3-Cantonese-8B-Instruct/e3d6b3d7-a231-40c1-bac9-0b7fcb478bca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lordjia_Llama-3-Cantonese-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Cantonese-8B-Instruct", - "id": "lordjia/Llama-3-Cantonese-8B-Instruct", - "developer": "lordjia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.6669 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4814 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4046 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lordjia/Qwen2-Cantonese-7B-Instruct/20acb302-3a74-4425-af4c-a1d719b90a88.json b/data/hfopenllm_v2/lordjia/Qwen2-Cantonese-7B-Instruct/20acb302-3a74-4425-af4c-a1d719b90a88.json deleted file mode 100644 index 95ef13bda..000000000 --- a/data/hfopenllm_v2/lordjia/Qwen2-Cantonese-7B-Instruct/20acb302-3a74-4425-af4c-a1d719b90a88.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lordjia_Qwen2-Cantonese-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-Cantonese-7B-Instruct", - "id": "lordjia/Qwen2-Cantonese-7B-Instruct", - "developer": "lordjia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.256 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lt-asset/nova-1.3b/a8613588-687d-4291-ae5a-57688501cffd.json b/data/hfopenllm_v2/lt-asset/nova-1.3b/a8613588-687d-4291-ae5a-57688501cffd.json deleted file mode 100644 index 80085f690..000000000 --- a/data/hfopenllm_v2/lt-asset/nova-1.3b/a8613588-687d-4291-ae5a-57688501cffd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lt-asset_nova-1.3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nova-1.3b", - "id": "lt-asset/nova-1.3b", - "developer": "lt-asset", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "NovaForCausalLM", - "params_billions": 1.347 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.317 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lunahr/thea-3b-50r-u1/83dd67cb-5508-4aa5-9435-d5585b7f3d52.json b/data/hfopenllm_v2/lunahr/thea-3b-50r-u1/83dd67cb-5508-4aa5-9435-d5585b7f3d52.json deleted file mode 100644 index 00e74d58e..000000000 --- a/data/hfopenllm_v2/lunahr/thea-3b-50r-u1/83dd67cb-5508-4aa5-9435-d5585b7f3d52.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lunahr_thea-3b-50r-u1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "thea-3b-50r-u1", - "id": "lunahr/thea-3b-50r-u1", - "developer": "lunahr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - 
}, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2808 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lunahr/thea-v2-3b-50r/26d981bb-f2e5-4195-8d6f-594bb0b26f4a.json b/data/hfopenllm_v2/lunahr/thea-v2-3b-50r/26d981bb-f2e5-4195-8d6f-594bb0b26f4a.json deleted file mode 100644 index bc6aaf04b..000000000 --- a/data/hfopenllm_v2/lunahr/thea-v2-3b-50r/26d981bb-f2e5-4195-8d6f-594bb0b26f4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lunahr_thea-v2-3b-50r/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "thea-v2-3b-50r", - "id": "lunahr/thea-v2-3b-50r", - "developer": "lunahr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2409 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/m42-health/Llama3-Med42-70B/df06c977-b54c-4668-837f-eb583ef24d29.json b/data/hfopenllm_v2/m42-health/Llama3-Med42-70B/df06c977-b54c-4668-837f-eb583ef24d29.json deleted file mode 100644 index 8b02603c5..000000000 --- a/data/hfopenllm_v2/m42-health/Llama3-Med42-70B/df06c977-b54c-4668-837f-eb583ef24d29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/m42-health_Llama3-Med42-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-Med42-70B", - "id": "m42-health/Llama3-Med42-70B", - "developer": "m42-health", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6291 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6688 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4629 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4963 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/macadeliccc/Samantha-Qwen-2-7B/31a8ac03-f58b-46e3-9f17-53311b1fd506.json b/data/hfopenllm_v2/macadeliccc/Samantha-Qwen-2-7B/31a8ac03-f58b-46e3-9f17-53311b1fd506.json deleted file mode 100644 index c21abe404..000000000 --- a/data/hfopenllm_v2/macadeliccc/Samantha-Qwen-2-7B/31a8ac03-f58b-46e3-9f17-53311b1fd506.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/macadeliccc_Samantha-Qwen-2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Samantha-Qwen-2-7B", - "id": "macadeliccc/Samantha-Qwen-2-7B", - "developer": "macadeliccc", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4377 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4799 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-base/3e4a7141-7a82-421a-a107-bbac3cbafc9b.json b/data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-base/3e4a7141-7a82-421a-a107-bbac3cbafc9b.json deleted file mode 100644 index 552af16da..000000000 --- a/data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-base/3e4a7141-7a82-421a-a107-bbac3cbafc9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/macadeliccc_magistrate-3.2-3b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magistrate-3.2-3b-base", - "id": "macadeliccc/magistrate-3.2-3b-base", - "developer": "macadeliccc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1159 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3976 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1689 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-it/9a3069f2-81ed-484a-b6e6-a45a259e9a43.json b/data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-it/9a3069f2-81ed-484a-b6e6-a45a259e9a43.json deleted file mode 100644 index 55e0d15e4..000000000 --- a/data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-it/9a3069f2-81ed-484a-b6e6-a45a259e9a43.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/macadeliccc_magistrate-3.2-3b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magistrate-3.2-3b-it", - "id": "macadeliccc/magistrate-3.2-3b-it", - "developer": "macadeliccc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2292 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3257 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3763 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1592 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/magnifi/Phi3_intent_v56_3_w_unknown_5_lr_0.002/c0a3d0c3-c541-4606-a925-4100b062284f.json b/data/hfopenllm_v2/magnifi/Phi3_intent_v56_3_w_unknown_5_lr_0.002/c0a3d0c3-c541-4606-a925-4100b062284f.json deleted file mode 100644 index 0f570e9e4..000000000 --- a/data/hfopenllm_v2/magnifi/Phi3_intent_v56_3_w_unknown_5_lr_0.002/c0a3d0c3-c541-4606-a925-4100b062284f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/magnifi_Phi3_intent_v56_3_w_unknown_5_lr_0.002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi3_intent_v56_3_w_unknown_5_lr_0.002", - "id": "magnifi/Phi3_intent_v56_3_w_unknown_5_lr_0.002", - "developer": "magnifi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2018 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1472 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maldv/Awqward2.5-32B-Instruct/20685a4b-686f-4cd4-b49d-3067a005256d.json 
b/data/hfopenllm_v2/maldv/Awqward2.5-32B-Instruct/20685a4b-686f-4cd4-b49d-3067a005256d.json deleted file mode 100644 index 3a60298fd..000000000 --- a/data/hfopenllm_v2/maldv/Awqward2.5-32B-Instruct/20685a4b-686f-4cd4-b49d-3067a005256d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maldv_Awqward2.5-32B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Awqward2.5-32B-Instruct", - "id": "maldv/Awqward2.5-32B-Instruct", - "developer": "maldv", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8255 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6974 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5723 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maldv/Lytta2.5-32B-Instruct/85a91293-cd51-4f79-8b98-2f4bc67d78c1.json b/data/hfopenllm_v2/maldv/Lytta2.5-32B-Instruct/85a91293-cd51-4f79-8b98-2f4bc67d78c1.json deleted file mode 100644 index 4697190aa..000000000 --- a/data/hfopenllm_v2/maldv/Lytta2.5-32B-Instruct/85a91293-cd51-4f79-8b98-2f4bc67d78c1.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maldv_Lytta2.5-32B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lytta2.5-32B-Instruct", - "id": "maldv/Lytta2.5-32B-Instruct", - "developer": "maldv", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5048 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maldv/Qwentile2.5-32B-Instruct/d2e3a6c2-4e67-4150-b9a8-fec979fb1658.json b/data/hfopenllm_v2/maldv/Qwentile2.5-32B-Instruct/d2e3a6c2-4e67-4150-b9a8-fec979fb1658.json deleted file mode 100644 index 345ca800e..000000000 --- a/data/hfopenllm_v2/maldv/Qwentile2.5-32B-Instruct/d2e3a6c2-4e67-4150-b9a8-fec979fb1658.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maldv_Qwentile2.5-32B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentile2.5-32B-Instruct", - "id": "maldv/Qwentile2.5-32B-Instruct", - "developer": "maldv", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6963 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5879 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maldv/badger-kappa-llama-3-8b/c4d686f2-2af1-4271-9556-09380f07ba5f.json b/data/hfopenllm_v2/maldv/badger-kappa-llama-3-8b/c4d686f2-2af1-4271-9556-09380f07ba5f.json deleted file mode 100644 index 65f56c77f..000000000 --- a/data/hfopenllm_v2/maldv/badger-kappa-llama-3-8b/c4d686f2-2af1-4271-9556-09380f07ba5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maldv_badger-kappa-llama-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "badger-kappa-llama-3-8b", - "id": "maldv/badger-kappa-llama-3-8b", - "developer": "maldv", 
- "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5085 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3765 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3695 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maldv/badger-lambda-llama-3-8b/93167303-b38e-43f0-a552-72c26ccb4339.json b/data/hfopenllm_v2/maldv/badger-lambda-llama-3-8b/93167303-b38e-43f0-a552-72c26ccb4339.json deleted file mode 100644 index 84491b944..000000000 --- a/data/hfopenllm_v2/maldv/badger-lambda-llama-3-8b/93167303-b38e-43f0-a552-72c26ccb4339.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maldv_badger-lambda-llama-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "badger-lambda-llama-3-8b", - "id": "maldv/badger-lambda-llama-3-8b", - "developer": "maldv", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4861 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4963 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maldv/badger-mu-llama-3-8b/b52a176f-f369-4791-a7e3-88a72709c868.json b/data/hfopenllm_v2/maldv/badger-mu-llama-3-8b/b52a176f-f369-4791-a7e3-88a72709c868.json deleted file mode 100644 index 006838490..000000000 --- a/data/hfopenllm_v2/maldv/badger-mu-llama-3-8b/b52a176f-f369-4791-a7e3-88a72709c868.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maldv_badger-mu-llama-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "badger-mu-llama-3-8b", - "id": "maldv/badger-mu-llama-3-8b", - "developer": "maldv", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4919 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maldv/badger-writer-llama-3-8b/b6310012-17f1-4ee0-abd0-0079a9299350.json b/data/hfopenllm_v2/maldv/badger-writer-llama-3-8b/b6310012-17f1-4ee0-abd0-0079a9299350.json deleted file mode 100644 index 08b482ae6..000000000 --- a/data/hfopenllm_v2/maldv/badger-writer-llama-3-8b/b6310012-17f1-4ee0-abd0-0079a9299350.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maldv_badger-writer-llama-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "badger-writer-llama-3-8b", - "id": "maldv/badger-writer-llama-3-8b", - "developer": "maldv", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5303 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3581 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Cheng-1/f581e832-0f77-496e-bcd3-6cfec51ef594.json b/data/hfopenllm_v2/marcuscedricridia/Cheng-1/f581e832-0f77-496e-bcd3-6cfec51ef594.json deleted file mode 100644 index db31c8ed3..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Cheng-1/f581e832-0f77-496e-bcd3-6cfec51ef594.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Cheng-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cheng-1", - "id": "marcuscedricridia/Cheng-1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7789 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4894 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4349 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Cheng-2-v1.1/47b47c89-b13b-4099-98b2-854feae05f63.json b/data/hfopenllm_v2/marcuscedricridia/Cheng-2-v1.1/47b47c89-b13b-4099-98b2-854feae05f63.json deleted file mode 100644 index a97a44973..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Cheng-2-v1.1/47b47c89-b13b-4099-98b2-854feae05f63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Cheng-2-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cheng-2-v1.1", - "id": "marcuscedricridia/Cheng-2-v1.1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.651 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.5393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4167 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Cheng-2/8d51ae58-7b20-4fa4-b234-2abb9cdeaad4.json b/data/hfopenllm_v2/marcuscedricridia/Cheng-2/8d51ae58-7b20-4fa4-b234-2abb9cdeaad4.json deleted file mode 100644 index d2b1be907..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Cheng-2/8d51ae58-7b20-4fa4-b234-2abb9cdeaad4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Cheng-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cheng-2", - "id": "marcuscedricridia/Cheng-2", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8337 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6499 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4193 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5013 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.1/4d4d5679-8ec6-49b8-a5d7-2a76497b44b7.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.1/4d4d5679-8ec6-49b8-a5d7-2a76497b44b7.json deleted file mode 100644 index 2a3ec93a3..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.1/4d4d5679-8ec6-49b8-a5d7-2a76497b44b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-MST-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-MST-v1.1", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4653 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, 
- { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.3/0bdb6574-69e2-4858-b7aa-a90a5fadf741.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.3/0bdb6574-69e2-4858-b7aa-a90a5fadf741.json deleted file mode 100644 index 5208a426c..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.3/0bdb6574-69e2-4858-b7aa-a90a5fadf741.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-MST-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-MST-v1.3", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.3", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4758 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4311 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST/fa1a92bb-ad25-4be2-a35f-7fdebbeeeba8.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST/fa1a92bb-ad25-4be2-a35f-7fdebbeeeba8.json deleted file mode 100644 index 637b7439c..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST/fa1a92bb-ad25-4be2-a35f-7fdebbeeeba8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-MST/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-MST", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-MST", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5458 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { 
- "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4163 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-Preview/d62ea0a1-cc9d-41b7-8d60-479b8e2262b5.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-Preview/d62ea0a1-cc9d-41b7-8d60-479b8e2262b5.json deleted file mode 100644 index af57fa32c..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-Preview/d62ea0a1-cc9d-41b7-8d60-479b8e2262b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-Preview", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-Preview", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-RP-v1.4-1M/912446e3-efdf-4ed0-80bd-261c6c87a3d0.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-RP-v1.4-1M/912446e3-efdf-4ed0-80bd-261c6c87a3d0.json deleted file mode 100644 index 50261cfbe..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-RP-v1.4-1M/912446e3-efdf-4ed0-80bd-261c6c87a3d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-RP-v1.4-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-RP-v1.4-1M", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-RP-v1.4-1M", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7728 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4135 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.1/5e86dc31-ae3e-4ef7-858e-41e29b3a8031.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.1/5e86dc31-ae3e-4ef7-858e-41e29b3a8031.json deleted file mode 100644 index 91ec9fbad..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.1/5e86dc31-ae3e-4ef7-858e-41e29b3a8031.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-v1.1", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4227 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.2/80680e5e-ab83-4a59-aeec-9d4166509c47.json 
b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.2/80680e5e-ab83-4a59-aeec-9d4166509c47.json deleted file mode 100644 index db41151dd..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.2/80680e5e-ab83-4a59-aeec-9d4166509c47.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-v1.2", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.2", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7865 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5403 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4403 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4219 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.3/c5bc9c92-8469-4174-aafd-67bb61aaccf2.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.3/c5bc9c92-8469-4174-aafd-67bb61aaccf2.json deleted file mode 100644 index b018cf076..000000000 --- 
a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.3/c5bc9c92-8469-4174-aafd-67bb61aaccf2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-v1.3", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.3", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5327 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3323 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4246 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.4/1d67b792-178b-4baa-a108-2362f658bd4e.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.4/1d67b792-178b-4baa-a108-2362f658bd4e.json deleted file mode 100644 index 142cdd945..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.4/1d67b792-178b-4baa-a108-2362f658bd4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-v1.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-v1.4", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.4", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7835 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5423 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4195 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Qwen2.5-7B-Preview/eb0c87b0-4795-4029-82c1-57ce37ba8259.json b/data/hfopenllm_v2/marcuscedricridia/Qwen2.5-7B-Preview/eb0c87b0-4795-4029-82c1-57ce37ba8259.json deleted file mode 100644 index 9e2e24bb9..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Qwen2.5-7B-Preview/eb0c87b0-4795-4029-82c1-57ce37ba8259.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Qwen2.5-7B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Preview", - "id": "marcuscedricridia/Qwen2.5-7B-Preview", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4258 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview-v1.1/dc9b2300-7ab0-4e92-9d23-15fe9ca52994.json b/data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview-v1.1/dc9b2300-7ab0-4e92-9d23-15fe9ca52994.json deleted file mode 100644 index f754c4b96..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview-v1.1/dc9b2300-7ab0-4e92-9d23-15fe9ca52994.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Yell-Qwen2.5-7B-Preview-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Yell-Qwen2.5-7B-Preview-v1.1", - "id": "marcuscedricridia/Yell-Qwen2.5-7B-Preview-v1.1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5348 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1896 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4059 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview/e005624d-c822-4be1-9477-873642aae228.json b/data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview/e005624d-c822-4be1-9477-873642aae228.json deleted file mode 100644 index 0967ce44f..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview/e005624d-c822-4be1-9477-873642aae228.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Yell-Qwen2.5-7B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yell-Qwen2.5-7B-Preview", - "id": "marcuscedricridia/Yell-Qwen2.5-7B-Preview", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5839 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4046 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/absolute-o1-7b/e9756d91-b9e2-4dd0-bf08-c6154c7d1f2e.json b/data/hfopenllm_v2/marcuscedricridia/absolute-o1-7b/e9756d91-b9e2-4dd0-bf08-c6154c7d1f2e.json deleted file mode 100644 index 6b56c9588..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/absolute-o1-7b/e9756d91-b9e2-4dd0-bf08-c6154c7d1f2e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_absolute-o1-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "absolute-o1-7b", - "id": "marcuscedricridia/absolute-o1-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7516 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5469 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-2-28-2025/704598c3-c5d6-4ce0-bab3-0fa98118e16a.json b/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-2-28-2025/704598c3-c5d6-4ce0-bab3-0fa98118e16a.json deleted file mode 100644 index f7db34598..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-2-28-2025/704598c3-c5d6-4ce0-bab3-0fa98118e16a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_cursa-o1-7b-2-28-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cursa-o1-7b-2-28-2025", - "id": "marcuscedricridia/cursa-o1-7b-2-28-2025", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7467 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4811 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4365 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.1/fafc9463-d725-4827-8bc1-5cd9e83814b6.json b/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.1/fafc9463-d725-4827-8bc1-5cd9e83814b6.json deleted file mode 100644 index 542f287db..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.1/fafc9463-d725-4827-8bc1-5cd9e83814b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_cursa-o1-7b-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cursa-o1-7b-v1.1", - "id": "marcuscedricridia/cursa-o1-7b-v1.1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4985 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.2-normalize-false/109820e0-ee00-449c-9ae5-58a7bf1da5f8.json b/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.2-normalize-false/109820e0-ee00-449c-9ae5-58a7bf1da5f8.json deleted file mode 100644 index 37f93dd1f..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.2-normalize-false/109820e0-ee00-449c-9ae5-58a7bf1da5f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_cursa-o1-7b-v1.2-normalize-false/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cursa-o1-7b-v1.2-normalize-false", - "id": "marcuscedricridia/cursa-o1-7b-v1.2-normalize-false", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7616 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b/37f29d5b-d803-4195-9ce0-75e45e32c160.json b/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b/37f29d5b-d803-4195-9ce0-75e45e32c160.json deleted file mode 100644 index ad5a42f1a..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b/37f29d5b-d803-4195-9ce0-75e45e32c160.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_cursa-o1-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cursa-o1-7b", - "id": "marcuscedricridia/cursa-o1-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7628 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/cursor-o1-7b/43546f48-8c46-4481-b1e5-f4b1ad2535be.json b/data/hfopenllm_v2/marcuscedricridia/cursor-o1-7b/43546f48-8c46-4481-b1e5-f4b1ad2535be.json deleted file mode 100644 index 69bdd676a..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/cursor-o1-7b/43546f48-8c46-4481-b1e5-f4b1ad2535be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_cursor-o1-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cursor-o1-7b", - "id": "marcuscedricridia/cursor-o1-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3251 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/cursorr-o1.2-7b/ec81e0ff-9cb4-4d43-9f78-1d5f4edc9103.json b/data/hfopenllm_v2/marcuscedricridia/cursorr-o1.2-7b/ec81e0ff-9cb4-4d43-9f78-1d5f4edc9103.json deleted file mode 100644 index ebc368fd4..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/cursorr-o1.2-7b/ec81e0ff-9cb4-4d43-9f78-1d5f4edc9103.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_cursorr-o1.2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cursorr-o1.2-7b", - "id": "marcuscedricridia/cursorr-o1.2-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.166 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.1/9290c86f-40b0-4520-b8aa-3460de62c396.json b/data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.1/9290c86f-40b0-4520-b8aa-3460de62c396.json deleted file mode 100644 index 3d23d32e5..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.1/9290c86f-40b0-4520-b8aa-3460de62c396.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_etr1o-explicit-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "etr1o-explicit-v1.1", - "id": "marcuscedricridia/etr1o-explicit-v1.1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3132 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1195 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.2/a4bf576e-9556-4956-8dcb-4d8906d45db0.json b/data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.2/a4bf576e-9556-4956-8dcb-4d8906d45db0.json deleted file mode 100644 index 22ae724ac..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.2/a4bf576e-9556-4956-8dcb-4d8906d45db0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_etr1o-explicit-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "etr1o-explicit-v1.2", - "id": "marcuscedricridia/etr1o-explicit-v1.2", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1504 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/etr1o-v1.1/320a5c00-3307-4bc3-9f47-9befb88e461c.json b/data/hfopenllm_v2/marcuscedricridia/etr1o-v1.1/320a5c00-3307-4bc3-9f47-9befb88e461c.json deleted file mode 100644 index a0ab10b5b..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/etr1o-v1.1/320a5c00-3307-4bc3-9f47-9befb88e461c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_etr1o-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "etr1o-v1.1", - "id": "marcuscedricridia/etr1o-v1.1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1597 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/etr1o-v1.2/844d1556-6bc6-467e-a145-f92646770727.json b/data/hfopenllm_v2/marcuscedricridia/etr1o-v1.2/844d1556-6bc6-467e-a145-f92646770727.json deleted file mode 100644 index c2db8d268..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/etr1o-v1.2/844d1556-6bc6-467e-a145-f92646770727.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_etr1o-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "etr1o-v1.2", - "id": "marcuscedricridia/etr1o-v1.2", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7287 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6349 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3588 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/fan-o1-7b/78923f4b-c2e7-4472-8398-10a0a8453ec5.json b/data/hfopenllm_v2/marcuscedricridia/fan-o1-7b/78923f4b-c2e7-4472-8398-10a0a8453ec5.json deleted file mode 100644 index 63b85c22d..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/fan-o1-7b/78923f4b-c2e7-4472-8398-10a0a8453ec5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_fan-o1-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fan-o1-7b", - "id": "marcuscedricridia/fan-o1-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4849 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3274 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/olmner-7b/17abe1bf-2e97-409e-88e3-4f661861a195.json 
b/data/hfopenllm_v2/marcuscedricridia/olmner-7b/17abe1bf-2e97-409e-88e3-4f661861a195.json deleted file mode 100644 index 116baa15b..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/olmner-7b/17abe1bf-2e97-409e-88e3-4f661861a195.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_olmner-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "olmner-7b", - "id": "marcuscedricridia/olmner-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7254 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5472 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4309 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/olmner-della-7b/756978e5-1dfe-433e-ba88-339004a50ea7.json b/data/hfopenllm_v2/marcuscedricridia/olmner-della-7b/756978e5-1dfe-433e-ba88-339004a50ea7.json deleted file mode 100644 index 41cca1ed9..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/olmner-della-7b/756978e5-1dfe-433e-ba88-339004a50ea7.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_olmner-della-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "olmner-della-7b", - "id": "marcuscedricridia/olmner-della-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7637 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5491 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/olmner-o1-7b/a889ae3a-5d86-4454-bfb9-332c4b61b836.json b/data/hfopenllm_v2/marcuscedricridia/olmner-o1-7b/a889ae3a-5d86-4454-bfb9-332c4b61b836.json deleted file mode 100644 index 963ff040a..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/olmner-o1-7b/a889ae3a-5d86-4454-bfb9-332c4b61b836.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_olmner-o1-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": 
"HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "olmner-o1-7b", - "id": "marcuscedricridia/olmner-o1-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4299 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/olmner-sbr-7b/2c5e1086-03b7-4cdd-801e-03fb26183076.json b/data/hfopenllm_v2/marcuscedricridia/olmner-sbr-7b/2c5e1086-03b7-4cdd-801e-03fb26183076.json deleted file mode 100644 index 634719136..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/olmner-sbr-7b/2c5e1086-03b7-4cdd-801e-03fb26183076.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_olmner-sbr-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "olmner-sbr-7b", - "id": "marcuscedricridia/olmner-sbr-7b", - 
"developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4947 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/post-cursa-o1/d9578847-b732-4c75-b246-9cdf03674fe0.json b/data/hfopenllm_v2/marcuscedricridia/post-cursa-o1/d9578847-b732-4c75-b246-9cdf03674fe0.json deleted file mode 100644 index 2d244c450..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/post-cursa-o1/d9578847-b732-4c75-b246-9cdf03674fe0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_post-cursa-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "post-cursa-o1", - "id": "marcuscedricridia/post-cursa-o1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7628 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4872 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.2/4c6f83fe-7896-4cf3-9434-b5f8d499f5ba.json b/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.2/4c6f83fe-7896-4cf3-9434-b5f8d499f5ba.json deleted file mode 100644 index 57d91dea9..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.2/4c6f83fe-7896-4cf3-9434-b5f8d499f5ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_pre-cursa-o1-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pre-cursa-o1-v1.2", - "id": "marcuscedricridia/pre-cursa-o1-v1.2", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5487 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4272 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.3/619037af-d528-4579-b7e3-58628468d8fb.json b/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.3/619037af-d528-4579-b7e3-58628468d8fb.json deleted file mode 100644 index ba339407b..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.3/619037af-d528-4579-b7e3-58628468d8fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_pre-cursa-o1-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pre-cursa-o1-v1.3", - "id": "marcuscedricridia/pre-cursa-o1-v1.3", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7507 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5455 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4271 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.442 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.4/5113b737-8d9f-4321-9a67-91f1aabb40a1.json b/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.4/5113b737-8d9f-4321-9a67-91f1aabb40a1.json deleted file mode 100644 index 4cb1c2ce4..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.4/5113b737-8d9f-4321-9a67-91f1aabb40a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_pre-cursa-o1-v1.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pre-cursa-o1-v1.4", - "id": "marcuscedricridia/pre-cursa-o1-v1.4", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4285 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.6/641ac372-2e5a-4b44-b22e-a17600a6a868.json b/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.6/641ac372-2e5a-4b44-b22e-a17600a6a868.json deleted file mode 100644 index f3a0dee4d..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.6/641ac372-2e5a-4b44-b22e-a17600a6a868.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_pre-cursa-o1-v1.6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pre-cursa-o1-v1.6", - "id": "marcuscedricridia/pre-cursa-o1-v1.6", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5473 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4234 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1/7cbb0b08-871d-48fc-bf3e-86267f5ef19d.json b/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1/7cbb0b08-871d-48fc-bf3e-86267f5ef19d.json deleted file mode 100644 index 2cf7d9f19..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1/7cbb0b08-871d-48fc-bf3e-86267f5ef19d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_pre-cursa-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pre-cursa-o1", - "id": "marcuscedricridia/pre-cursa-o1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7409 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/r1o-et/c82e887c-c8ab-4221-aa0b-e8b7a86e7c46.json b/data/hfopenllm_v2/marcuscedricridia/r1o-et/c82e887c-c8ab-4221-aa0b-e8b7a86e7c46.json deleted file mode 100644 index 57d5458d0..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/r1o-et/c82e887c-c8ab-4221-aa0b-e8b7a86e7c46.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_r1o-et/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "r1o-et", - "id": "marcuscedricridia/r1o-et", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3597 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.258 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/sbr-o1-7b/50c65a83-9d08-4155-ad2c-5a2f8ffc8743.json b/data/hfopenllm_v2/marcuscedricridia/sbr-o1-7b/50c65a83-9d08-4155-ad2c-5a2f8ffc8743.json deleted file mode 100644 index dab48a238..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/sbr-o1-7b/50c65a83-9d08-4155-ad2c-5a2f8ffc8743.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_sbr-o1-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sbr-o1-7b", - "id": "marcuscedricridia/sbr-o1-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7455 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5479 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4985 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4404 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4355 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/stray-r1o-et/99d97aef-bb6b-471b-8ed7-f6f92f75842c.json b/data/hfopenllm_v2/marcuscedricridia/stray-r1o-et/99d97aef-bb6b-471b-8ed7-f6f92f75842c.json deleted file mode 100644 index b8ff455f6..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/stray-r1o-et/99d97aef-bb6b-471b-8ed7-f6f92f75842c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_stray-r1o-et/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stray-r1o-et", - "id": "marcuscedricridia/stray-r1o-et", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1562 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2967 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4086 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3/b98504a0-f1d6-4872-b748-2ca8199c5328.json b/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3/b98504a0-f1d6-4872-b748-2ca8199c5328.json deleted file mode 100644 index 524853f69..000000000 --- a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3/b98504a0-f1d6-4872-b748-2ca8199c5328.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/matouLeLoup_ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3", - "id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3", - "developer": "matouLeLoup", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3239 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis/5a159667-7460-4a97-884e-6a96df59873b.json b/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis/5a159667-7460-4a97-884e-6a96df59873b.json deleted file mode 100644 index 9c8242ed9..000000000 --- a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis/5a159667-7460-4a97-884e-6a96df59873b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/matouLeLoup_ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis", - "id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis", - "developer": "matouLeLoup", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3239 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis/16a2eceb-073d-4dc3-87a7-a15c641c5ebb.json b/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis/16a2eceb-073d-4dc3-87a7-a15c641c5ebb.json deleted file mode 100644 index 4054a3100..000000000 --- a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis/16a2eceb-073d-4dc3-87a7-a15c641c5ebb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis", - "id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis", - "developer": "matouLeLoup", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3239 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis/e8e2d04b-21db-43dc-8b8f-7fa3bba87abc.json b/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis/e8e2d04b-21db-43dc-8b8f-7fa3bba87abc.json deleted file mode 100644 index a118652bb..000000000 --- a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis/e8e2d04b-21db-43dc-8b8f-7fa3bba87abc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis", - "id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis", - "developer": "matouLeLoup", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1882 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3233 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3685 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172 - } - } - 
] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/acbb93b3-f8fc-479d-9610-392efd7d4ecc.json b/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/acbb93b3-f8fc-479d-9610-392efd7d4ecc.json deleted file mode 100644 index e00a6160a..000000000 --- a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/acbb93b3-f8fc-479d-9610-392efd7d4ecc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", - "id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", - "developer": "matouLeLoup", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3024 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mattshumer/Reflection-Llama-3.1-70B/6d0589bd-1f05-44ee-afa5-3657b960d7c9.json 
b/data/hfopenllm_v2/mattshumer/Reflection-Llama-3.1-70B/6d0589bd-1f05-44ee-afa5-3657b960d7c9.json deleted file mode 100644 index e8d1ce9f3..000000000 --- a/data/hfopenllm_v2/mattshumer/Reflection-Llama-3.1-70B/6d0589bd-1f05-44ee-afa5-3657b960d7c9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mattshumer_Reflection-Llama-3.1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflection-Llama-3.1-70B", - "id": "mattshumer/Reflection-Llama-3.1-70B", - "developer": "mattshumer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4577 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mattshumer/ref_70_e3/134663d8-05a8-4336-90e2-68e7cba5f1df.json b/data/hfopenllm_v2/mattshumer/ref_70_e3/134663d8-05a8-4336-90e2-68e7cba5f1df.json deleted file mode 100644 index d4167a969..000000000 --- a/data/hfopenllm_v2/mattshumer/ref_70_e3/134663d8-05a8-4336-90e2-68e7cba5f1df.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mattshumer_ref_70_e3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ref_70_e3", - "id": "mattshumer/ref_70_e3", - "developer": "mattshumer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6294 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6501 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2795 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5303 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maywell/Qwen2-7B-Multilingual-RP/3bfced28-b06e-46ab-a6aa-171b0c424337.json b/data/hfopenllm_v2/maywell/Qwen2-7B-Multilingual-RP/3bfced28-b06e-46ab-a6aa-171b0c424337.json deleted file mode 100644 index 4a43aa1ea..000000000 --- a/data/hfopenllm_v2/maywell/Qwen2-7B-Multilingual-RP/3bfced28-b06e-46ab-a6aa-171b0c424337.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maywell_Qwen2-7B-Multilingual-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-7B-Multilingual-RP", - "id": "maywell/Qwen2-7B-Multilingual-RP", - "developer": "maywell", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4347 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5062 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3696 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.1-MedIT-SUN-8B/b6a83b82-6b05-4437-a076-e2a3982f6169.json b/data/hfopenllm_v2/meditsolutions/Llama-3.1-MedIT-SUN-8B/b6a83b82-6b05-4437-a076-e2a3982f6169.json deleted file mode 100644 index a6245e049..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.1-MedIT-SUN-8B/b6a83b82-6b05-4437-a076-e2a3982f6169.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.1-MedIT-SUN-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-MedIT-SUN-8B", - "id": 
"meditsolutions/Llama-3.1-MedIT-SUN-8B", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7837 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5187 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2092 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4056 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3916 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-Instruct/f621201b-f571-4487-9f1e-b767675c659d.json b/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-Instruct/f621201b-f571-4487-9f1e-b767675c659d.json deleted file mode 100644 index c6b47ea7c..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-Instruct/f621201b-f571-4487-9f1e-b767675c659d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.2-SUN-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-SUN-1B-Instruct", - "id": "meditsolutions/Llama-3.2-SUN-1B-Instruct", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaMedITForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6413 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3474 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1781 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-chat/710fdb79-fba4-42da-8e26-45b4caf75207.json b/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-chat/710fdb79-fba4-42da-8e26-45b4caf75207.json deleted file mode 100644 index 0f33814a2..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-chat/710fdb79-fba4-42da-8e26-45b4caf75207.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.2-SUN-1B-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-SUN-1B-chat", - "id": "meditsolutions/Llama-3.2-SUN-1B-chat", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1838 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-26000/35fa7a5e-8866-4ce3-9899-8737e908f34f.json b/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-26000/35fa7a5e-8866-4ce3-9899-8737e908f34f.json deleted file mode 100644 index b58b7f561..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-26000/35fa7a5e-8866-4ce3-9899-8737e908f34f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.2-SUN-2.4B-checkpoint-26000/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-SUN-2.4B-checkpoint-26000", - "id": "meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-26000", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.209 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3018 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-34800/2b24b69b-15dc-4666-83f3-c77db545bdbd.json b/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-34800/2b24b69b-15dc-4666-83f3-c77db545bdbd.json deleted file mode 100644 index 3c289add0..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-34800/2b24b69b-15dc-4666-83f3-c77db545bdbd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.2-SUN-2.4B-checkpoint-34800/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-SUN-2.4B-checkpoint-34800", - "id": "meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-34800", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.209 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.2501 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3161 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-v1.0.0/0d00d849-2147-4fc1-9e5f-d42a95be6ca5.json b/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-v1.0.0/0d00d849-2147-4fc1-9e5f-d42a95be6ca5.json deleted file mode 100644 index 92bc18771..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-v1.0.0/0d00d849-2147-4fc1-9e5f-d42a95be6ca5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.2-SUN-2.4B-v1.0.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-SUN-2.4B-v1.0.0", - "id": "meditsolutions/Llama-3.2-SUN-2.4B-v1.0.0", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.472 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5637 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3391 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.5B-chat/f45135b0-3c26-44b5-9922-a6c0817a172d.json b/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.5B-chat/f45135b0-3c26-44b5-9922-a6c0817a172d.json deleted file mode 100644 index 03c4a4de9..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.5B-chat/f45135b0-3c26-44b5-9922-a6c0817a172d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.2-SUN-2.5B-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-SUN-2.5B-chat", - "id": "meditsolutions/Llama-3.2-SUN-2.5B-chat", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.472 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-HDIC-1B-Instruct/67eb0d6c-9086-4c80-8506-c3e1489f2673.json b/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-HDIC-1B-Instruct/67eb0d6c-9086-4c80-8506-c3e1489f2673.json deleted file mode 100644 index 9ccd525a8..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-HDIC-1B-Instruct/67eb0d6c-9086-4c80-8506-c3e1489f2673.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.2-SUN-HDIC-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-SUN-HDIC-1B-Instruct", - "id": "meditsolutions/Llama-3.2-SUN-HDIC-1B-Instruct", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6827 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3508 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1687 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune/79d3dc85-08f6-475c-ac2c-1ff32f5a089f.json b/data/hfopenllm_v2/meditsolutions/MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune/79d3dc85-08f6-475c-ac2c-1ff32f5a089f.json deleted file mode 100644 index 47159802e..000000000 --- a/data/hfopenllm_v2/meditsolutions/MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune/79d3dc85-08f6-475c-ac2c-1ff32f5a089f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune", - "id": "meditsolutions/MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.646 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3655 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4035 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" 
- }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/MSH-v1-Bielik-v2.3-Instruct-MedIT-merge/4e9b3fa2-d3d2-4e4c-a1fa-c812f481f64a.json b/data/hfopenllm_v2/meditsolutions/MSH-v1-Bielik-v2.3-Instruct-MedIT-merge/4e9b3fa2-d3d2-4e4c-a1fa-c812f481f64a.json deleted file mode 100644 index 555723348..000000000 --- a/data/hfopenllm_v2/meditsolutions/MSH-v1-Bielik-v2.3-Instruct-MedIT-merge/4e9b3fa2-d3d2-4e4c-a1fa-c812f481f64a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_MSH-v1-Bielik-v2.3-Instruct-MedIT-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MSH-v1-Bielik-v2.3-Instruct-MedIT-merge", - "id": "meditsolutions/MSH-v1-Bielik-v2.3-Instruct-MedIT-merge", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.169 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4385 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/MedIT-Mesh-3B-Instruct/6e62a8a0-0bdf-4b6c-93de-593423dadd3a.json b/data/hfopenllm_v2/meditsolutions/MedIT-Mesh-3B-Instruct/6e62a8a0-0bdf-4b6c-93de-593423dadd3a.json deleted file mode 100644 index af35e21a0..000000000 --- a/data/hfopenllm_v2/meditsolutions/MedIT-Mesh-3B-Instruct/6e62a8a0-0bdf-4b6c-93de-593423dadd3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_MedIT-Mesh-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MedIT-Mesh-3B-Instruct", - "id": "meditsolutions/MedIT-Mesh-3B-Instruct", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5576 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4012 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/SmolLM2-MedIT-Upscale-2B/871131c1-295d-40a0-a396-09d24b880064.json b/data/hfopenllm_v2/meditsolutions/SmolLM2-MedIT-Upscale-2B/871131c1-295d-40a0-a396-09d24b880064.json deleted file mode 100644 index 870bf4c0c..000000000 --- a/data/hfopenllm_v2/meditsolutions/SmolLM2-MedIT-Upscale-2B/871131c1-295d-40a0-a396-09d24b880064.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_SmolLM2-MedIT-Upscale-2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-MedIT-Upscale-2B", - "id": "meditsolutions/SmolLM2-MedIT-Upscale-2B", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.114 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6429 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1971 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meetkai/functionary-small-v3.1/44eefbb2-22d4-4dff-889d-a87fc40b2eea.json b/data/hfopenllm_v2/meetkai/functionary-small-v3.1/44eefbb2-22d4-4dff-889d-a87fc40b2eea.json deleted file mode 100644 index f08479499..000000000 --- a/data/hfopenllm_v2/meetkai/functionary-small-v3.1/44eefbb2-22d4-4dff-889d-a87fc40b2eea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meetkai_functionary-small-v3.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "functionary-small-v3.1", - "id": "meetkai/functionary-small-v3.1", - "developer": "meetkai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4982 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1571 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3349 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meraGPT/mera-mix-4x7B/cd1de470-a174-4c08-9efe-a06d493dc4b2.json b/data/hfopenllm_v2/meraGPT/mera-mix-4x7B/cd1de470-a174-4c08-9efe-a06d493dc4b2.json deleted file mode 100644 index b69c8564f..000000000 --- a/data/hfopenllm_v2/meraGPT/mera-mix-4x7B/cd1de470-a174-4c08-9efe-a06d493dc4b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meraGPT_mera-mix-4x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mera-mix-4x7B", - "id": "meraGPT/mera-mix-4x7B", - "developer": "meraGPT", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4832 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4057 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2748 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/JAJUKA-WEWILLNEVERFORGETYOU-3B/fdb55a14-0697-4775-8358-fed202498b4f.json b/data/hfopenllm_v2/mergekit-community/JAJUKA-WEWILLNEVERFORGETYOU-3B/fdb55a14-0697-4775-8358-fed202498b4f.json deleted file mode 100644 index a962d2f5e..000000000 --- a/data/hfopenllm_v2/mergekit-community/JAJUKA-WEWILLNEVERFORGETYOU-3B/fdb55a14-0697-4775-8358-fed202498b4f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_JAJUKA-WEWILLNEVERFORGETYOU-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "JAJUKA-WEWILLNEVERFORGETYOU-3B", - "id": "mergekit-community/JAJUKA-WEWILLNEVERFORGETYOU-3B", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3033 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/SuperQwen-2.5-1.5B/c069a224-638a-4cad-a9ad-e4f8579e8c15.json b/data/hfopenllm_v2/mergekit-community/SuperQwen-2.5-1.5B/c069a224-638a-4cad-a9ad-e4f8579e8c15.json deleted file mode 100644 index 1f4baa35d..000000000 --- a/data/hfopenllm_v2/mergekit-community/SuperQwen-2.5-1.5B/c069a224-638a-4cad-a9ad-e4f8579e8c15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_SuperQwen-2.5-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SuperQwen-2.5-1.5B", - "id": "mergekit-community/SuperQwen-2.5-1.5B", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2907 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1075 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/VirtuosoSmall-InstructModelStock/10e5c103-f25f-45bb-bfe6-a22876cffe87.json b/data/hfopenllm_v2/mergekit-community/VirtuosoSmall-InstructModelStock/10e5c103-f25f-45bb-bfe6-a22876cffe87.json deleted file mode 100644 index 7af838cdf..000000000 --- a/data/hfopenllm_v2/mergekit-community/VirtuosoSmall-InstructModelStock/10e5c103-f25f-45bb-bfe6-a22876cffe87.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_VirtuosoSmall-InstructModelStock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VirtuosoSmall-InstructModelStock", - "id": "mergekit-community/VirtuosoSmall-InstructModelStock", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5238 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6518 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4094 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4756 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5421 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/mergekit-community/diabolic6045_ELN-AOC-CAIN/a9ecca9a-c5d4-45b2-a403-e74a98a46322.json b/data/hfopenllm_v2/mergekit-community/diabolic6045_ELN-AOC-CAIN/a9ecca9a-c5d4-45b2-a403-e74a98a46322.json deleted file mode 100644 index 2b83b90c9..000000000 --- a/data/hfopenllm_v2/mergekit-community/diabolic6045_ELN-AOC-CAIN/a9ecca9a-c5d4-45b2-a403-e74a98a46322.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_diabolic6045_ELN-AOC-CAIN/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "diabolic6045_ELN-AOC-CAIN", - "id": "mergekit-community/diabolic6045_ELN-AOC-CAIN", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0862 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1191 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/mergekit-dare_ties-ajgjgea/630d8a60-03b7-4550-82f4-e879b2e01c6c.json 
b/data/hfopenllm_v2/mergekit-community/mergekit-dare_ties-ajgjgea/630d8a60-03b7-4550-82f4-e879b2e01c6c.json deleted file mode 100644 index afddca479..000000000 --- a/data/hfopenllm_v2/mergekit-community/mergekit-dare_ties-ajgjgea/630d8a60-03b7-4550-82f4-e879b2e01c6c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_mergekit-dare_ties-ajgjgea/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-dare_ties-ajgjgea", - "id": "mergekit-community/mergekit-dare_ties-ajgjgea", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1744 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/mergekit-della-zgowfmf/206b5a96-ae07-41fd-822f-436d49c57dcb.json b/data/hfopenllm_v2/mergekit-community/mergekit-della-zgowfmf/206b5a96-ae07-41fd-822f-436d49c57dcb.json deleted file mode 100644 index 160d88a78..000000000 
--- a/data/hfopenllm_v2/mergekit-community/mergekit-della-zgowfmf/206b5a96-ae07-41fd-822f-436d49c57dcb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_mergekit-della-zgowfmf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-della-zgowfmf", - "id": "mergekit-community/mergekit-della-zgowfmf", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4828 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6591 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/mergekit-model_stock-azgztvm/702d2120-5301-4e03-bb0f-1f8ab19e522a.json b/data/hfopenllm_v2/mergekit-community/mergekit-model_stock-azgztvm/702d2120-5301-4e03-bb0f-1f8ab19e522a.json deleted file mode 100644 index 46f675fa0..000000000 --- a/data/hfopenllm_v2/mergekit-community/mergekit-model_stock-azgztvm/702d2120-5301-4e03-bb0f-1f8ab19e522a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_mergekit-model_stock-azgztvm/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-model_stock-azgztvm", - "id": "mergekit-community/mergekit-model_stock-azgztvm", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6543 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/mergekit-slerp-fmrazcr/61e39700-c237-49fc-baef-3fa573b3b0c6.json b/data/hfopenllm_v2/mergekit-community/mergekit-slerp-fmrazcr/61e39700-c237-49fc-baef-3fa573b3b0c6.json deleted file mode 100644 index 0b718a9e2..000000000 --- a/data/hfopenllm_v2/mergekit-community/mergekit-slerp-fmrazcr/61e39700-c237-49fc-baef-3fa573b3b0c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_mergekit-slerp-fmrazcr/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-slerp-fmrazcr", - "id": "mergekit-community/mergekit-slerp-fmrazcr", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5342 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/mergekit-ties-rraxdhv/8892ab84-750d-494f-9f87-ad28e73cf364.json b/data/hfopenllm_v2/mergekit-community/mergekit-ties-rraxdhv/8892ab84-750d-494f-9f87-ad28e73cf364.json deleted file mode 100644 index a58721bfd..000000000 --- a/data/hfopenllm_v2/mergekit-community/mergekit-ties-rraxdhv/8892ab84-750d-494f-9f87-ad28e73cf364.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_mergekit-ties-rraxdhv/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-ties-rraxdhv", - "id": "mergekit-community/mergekit-ties-rraxdhv", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5184 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/mergekit-ties-ykqemwr/538a2eb7-34e4-4e78-a382-60a13710096e.json b/data/hfopenllm_v2/mergekit-community/mergekit-ties-ykqemwr/538a2eb7-34e4-4e78-a382-60a13710096e.json deleted file mode 100644 index 929e2d82b..000000000 --- a/data/hfopenllm_v2/mergekit-community/mergekit-ties-ykqemwr/538a2eb7-34e4-4e78-a382-60a13710096e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_mergekit-ties-ykqemwr/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-ties-ykqemwr", - "id": "mergekit-community/mergekit-ties-ykqemwr", - "developer": 
"mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5455 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/sexeh_time_testing/a041629e-8ed8-4a6c-95ee-98e759501e19.json b/data/hfopenllm_v2/mergekit-community/sexeh_time_testing/a041629e-8ed8-4a6c-95ee-98e759501e19.json deleted file mode 100644 index 959317552..000000000 --- a/data/hfopenllm_v2/mergekit-community/sexeh_time_testing/a041629e-8ed8-4a6c-95ee-98e759501e19.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_sexeh_time_testing/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sexeh_time_testing", - "id": "mergekit-community/sexeh_time_testing", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7329 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3667 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-2-13b-chat-hf/09f05984-5815-4b3d-bc73-83ea1e5ecc27.json b/data/hfopenllm_v2/meta-llama/Llama-2-13b-chat-hf/09f05984-5815-4b3d-bc73-83ea1e5ecc27.json deleted file mode 100644 index 29fbff778..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-2-13b-chat-hf/09f05984-5815-4b3d-bc73-83ea1e5ecc27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-2-13b-chat-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-2-13b-chat-hf", - "id": "meta-llama/Llama-2-13b-chat-hf", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3985 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2315 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-2-13b-hf/6535524e-f8cf-4f2f-9d89-9ba70aedac91.json b/data/hfopenllm_v2/meta-llama/Llama-2-13b-hf/6535524e-f8cf-4f2f-9d89-9ba70aedac91.json deleted file mode 100644 index 50a1407eb..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-2-13b-hf/6535524e-f8cf-4f2f-9d89-9ba70aedac91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-2-13b-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-2-13b-hf", - "id": "meta-llama/Llama-2-13b-hf", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4126 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-2-70b-chat-hf/08ea4f9d-0e3c-4a8b-85e6-075290d30ba4.json b/data/hfopenllm_v2/meta-llama/Llama-2-70b-chat-hf/08ea4f9d-0e3c-4a8b-85e6-075290d30ba4.json deleted file mode 100644 index 741023f42..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-2-70b-chat-hf/08ea4f9d-0e3c-4a8b-85e6-075290d30ba4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-2-70b-chat-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-2-70b-chat-hf", - "id": "meta-llama/Llama-2-70b-chat-hf", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 68.977 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4958 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3042 - } - }, - 
{ - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-2-70b-hf/631f0a1f-a6f5-46f6-9aa0-31ac9764c086.json b/data/hfopenllm_v2/meta-llama/Llama-2-70b-hf/631f0a1f-a6f5-46f6-9aa0-31ac9764c086.json deleted file mode 100644 index d8c229621..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-2-70b-hf/631f0a1f-a6f5-46f6-9aa0-31ac9764c086.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-2-70b-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-2-70b-hf", - "id": "meta-llama/Llama-2-70b-hf", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 68.977 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5473 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3718 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-2-7b-chat-hf/b771f6db-7516-4423-9010-3467db0e26e3.json b/data/hfopenllm_v2/meta-llama/Llama-2-7b-chat-hf/b771f6db-7516-4423-9010-3467db0e26e3.json deleted file mode 100644 index 4c366b7ee..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-2-7b-chat-hf/b771f6db-7516-4423-9010-3467db0e26e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-2-7b-chat-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-2-7b-chat-hf", - "id": "meta-llama/Llama-2-7b-chat-hf", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3114 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1688 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-2-7b-hf/cf580dfb-2924-4c4b-9352-394275b959bd.json b/data/hfopenllm_v2/meta-llama/Llama-2-7b-hf/cf580dfb-2924-4c4b-9352-394275b959bd.json deleted file mode 100644 index 009ce4f17..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-2-7b-hf/cf580dfb-2924-4c4b-9352-394275b959bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-2-7b-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-2-7b-hf", - "id": "meta-llama/Llama-2-7b-hf", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2519 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1861 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.1-70B-Instruct/ba549fe6-7718-4abf-a610-7e0f48611483.json b/data/hfopenllm_v2/meta-llama/Llama-3.1-70B-Instruct/ba549fe6-7718-4abf-a610-7e0f48611483.json deleted file mode 100644 index 772844f47..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.1-70B-Instruct/ba549fe6-7718-4abf-a610-7e0f48611483.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.1-70B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-70B-Instruct", - "id": "meta-llama/Llama-3.1-70B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8669 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6917 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5309 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.1-70B/b92440b1-78a9-4288-a432-f057f2b04a2f.json b/data/hfopenllm_v2/meta-llama/Llama-3.1-70B/b92440b1-78a9-4288-a432-f057f2b04a2f.json deleted file mode 100644 index a94baa730..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.1-70B/b92440b1-78a9-4288-a432-f057f2b04a2f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-70B", - "id": "meta-llama/Llama-3.1-70B", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1684 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1843 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4654 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.1-8B-Instruct/838f3932-edf2-4f72-9238-981d1aadc771.json b/data/hfopenllm_v2/meta-llama/Llama-3.1-8B-Instruct/838f3932-edf2-4f72-9238-981d1aadc771.json deleted file mode 100644 index f3e5eb24a..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.1-8B-Instruct/838f3932-edf2-4f72-9238-981d1aadc771.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Instruct", - "id": "meta-llama/Llama-3.1-8B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5087 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3972 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.1-8B/61e933b2-5cd1-4f08-8a9e-5b06ef54b6d5.json b/data/hfopenllm_v2/meta-llama/Llama-3.1-8B/61e933b2-5cd1-4f08-8a9e-5b06ef54b6d5.json deleted file mode 100644 index b2fd3607e..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.1-8B/61e933b2-5cd1-4f08-8a9e-5b06ef54b6d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B", - "id": "meta-llama/Llama-3.1-8B", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.2-1B-Instruct/0b307c78-94c7-418f-bc47-5106b81c30de.json b/data/hfopenllm_v2/meta-llama/Llama-3.2-1B-Instruct/0b307c78-94c7-418f-bc47-5106b81c30de.json deleted file mode 100644 
index f3ef3bdee..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.2-1B-Instruct/0b307c78-94c7-418f-bc47-5106b81c30de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.2-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-Instruct", - "id": "meta-llama/Llama-3.2-1B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.24 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5698 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3329 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1682 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.2-1B/18783694-3e7b-4d06-9378-5a3fa4a7a0a2.json b/data/hfopenllm_v2/meta-llama/Llama-3.2-1B/18783694-3e7b-4d06-9378-5a3fa4a7a0a2.json deleted file mode 100644 index c0d6f09c5..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.2-1B/18783694-3e7b-4d06-9378-5a3fa4a7a0a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/meta-llama_Llama-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B", - "id": "meta-llama/Llama-3.2-1B", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.24 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1478 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3115 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2282 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1203 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.2-3B-Instruct/dab922e5-1b46-4a90-b75c-1b26cd6cc6d3.json b/data/hfopenllm_v2/meta-llama/Llama-3.2-3B-Instruct/dab922e5-1b46-4a90-b75c-1b26cd6cc6d3.json deleted file mode 100644 index cad18f268..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.2-3B-Instruct/dab922e5-1b46-4a90-b75c-1b26cd6cc6d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.2-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Instruct", - "id": "meta-llama/Llama-3.2-3B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1767 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.2-3B/8cfa1f00-3b26-4d75-9b0a-0dea65e2e352.json b/data/hfopenllm_v2/meta-llama/Llama-3.2-3B/8cfa1f00-3b26-4d75-9b0a-0dea65e2e352.json deleted file mode 100644 index b00c2e441..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.2-3B/8cfa1f00-3b26-4d75-9b0a-0dea65e2e352.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B", - "id": "meta-llama/Llama-3.2-3B", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3905 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3577 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2488 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.3-70B-Instruct/f74d26e6-9dfb-4e81-8522-8309b27760cf.json b/data/hfopenllm_v2/meta-llama/Llama-3.3-70B-Instruct/f74d26e6-9dfb-4e81-8522-8309b27760cf.json deleted file mode 100644 index fd07a860b..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.3-70B-Instruct/f74d26e6-9dfb-4e81-8522-8309b27760cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.3-70B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.3-70B-Instruct", - "id": "meta-llama/Llama-3.3-70B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8998 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6919 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5332 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B-Instruct/2022bcf3-a057-4b0a-aa33-6cf074ffc714.json b/data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B-Instruct/2022bcf3-a057-4b0a-aa33-6cf074ffc714.json deleted file mode 100644 index dc54a58af..000000000 --- a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B-Instruct/2022bcf3-a057-4b0a-aa33-6cf074ffc714.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Meta-Llama-3-70B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-70B-Instruct", - "id": "meta-llama/Meta-Llama-3-70B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.8099 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6547 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B/a6e79d12-42f6-47ad-95fa-ba03fa4d3a06.json b/data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B/a6e79d12-42f6-47ad-95fa-ba03fa4d3a06.json deleted file mode 100644 index faa64a96e..000000000 --- a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B/a6e79d12-42f6-47ad-95fa-ba03fa4d3a06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Meta-Llama-3-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-70B", - "id": "meta-llama/Meta-Llama-3-70B", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1603 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6461 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4518 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4709 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/24d850fe-1817-4041-8767-085f4bd2bac3.json b/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/24d850fe-1817-4041-8767-085f4bd2bac3.json deleted file mode 100644 index ac1995522..000000000 --- a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/24d850fe-1817-4041-8767-085f4bd2bac3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Meta-Llama-3-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-8B-Instruct", - "id": "meta-llama/Meta-Llama-3-8B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4989 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3568 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/610a3be1-1032-4079-ba37-d6c2c5f9fd55.json b/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/610a3be1-1032-4079-ba37-d6c2c5f9fd55.json deleted file mode 100644 index 7d274c62c..000000000 --- a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/610a3be1-1032-4079-ba37-d6c2c5f9fd55.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Meta-Llama-3-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-8B-Instruct", - "id": "meta-llama/Meta-Llama-3-8B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4782 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.491 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3805 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B/857bb10e-1b43-4714-a758-0cef5816ba02.json b/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B/857bb10e-1b43-4714-a758-0cef5816ba02.json deleted file mode 100644 index e2bd15b0b..000000000 --- a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B/857bb10e-1b43-4714-a758-0cef5816ba02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Meta-Llama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-8B", - "id": "meta-llama/Meta-Llama-3-8B", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1455 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4598 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3614 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mhl1/Qwen2.5-0.5B-cinstruct-stage1/cdabdd54-6101-471c-9bd8-446953be986b.json b/data/hfopenllm_v2/mhl1/Qwen2.5-0.5B-cinstruct-stage1/cdabdd54-6101-471c-9bd8-446953be986b.json deleted file mode 100644 index a4956c1b7..000000000 --- a/data/hfopenllm_v2/mhl1/Qwen2.5-0.5B-cinstruct-stage1/cdabdd54-6101-471c-9bd8-446953be986b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mhl1_Qwen2.5-0.5B-cinstruct-stage1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-cinstruct-stage1", - "id": "mhl1/Qwen2.5-0.5B-cinstruct-stage1", - "developer": "mhl1", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/DialoGPT-medium/8029cb75-8d3b-411d-b0eb-74539b8ecb2f.json b/data/hfopenllm_v2/microsoft/DialoGPT-medium/8029cb75-8d3b-411d-b0eb-74539b8ecb2f.json deleted file mode 100644 index 93673afea..000000000 --- a/data/hfopenllm_v2/microsoft/DialoGPT-medium/8029cb75-8d3b-411d-b0eb-74539b8ecb2f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_DialoGPT-medium/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DialoGPT-medium", - "id": "microsoft/DialoGPT-medium", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.345 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1479 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1119 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Orca-2-13b/65d10996-2c5b-4e11-9a07-319c2446a237.json b/data/hfopenllm_v2/microsoft/Orca-2-13b/65d10996-2c5b-4e11-9a07-319c2446a237.json deleted file mode 100644 index 346756a08..000000000 --- a/data/hfopenllm_v2/microsoft/Orca-2-13b/65d10996-2c5b-4e11-9a07-319c2446a237.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Orca-2-13b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Orca-2-13b", - "id": "microsoft/Orca-2-13b", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3128 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4884 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.513 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Orca-2-7b/ef21d739-b122-4ab8-a8ff-a7cfecad5c8e.json b/data/hfopenllm_v2/microsoft/Orca-2-7b/ef21d739-b122-4ab8-a8ff-a7cfecad5c8e.json deleted file mode 100644 index b1da27202..000000000 --- a/data/hfopenllm_v2/microsoft/Orca-2-7b/ef21d739-b122-4ab8-a8ff-a7cfecad5c8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Orca-2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Orca-2-7b", - "id": "microsoft/Orca-2-7b", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5026 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2319 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/microsoft/Phi-3-medium-128k-instruct/45f3b963-497b-4d89-ac66-9ff0ba8dadf8.json b/data/hfopenllm_v2/microsoft/Phi-3-medium-128k-instruct/45f3b963-497b-4d89-ac66-9ff0ba8dadf8.json deleted file mode 100644 index a60530782..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3-medium-128k-instruct/45f3b963-497b-4d89-ac66-9ff0ba8dadf8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3-medium-128k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-medium-128k-instruct", - "id": "microsoft/Phi-3-medium-128k-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6382 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4712 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3-medium-4k-instruct/4173435b-d907-4ac5-a8bd-dfa2759f3fb6.json b/data/hfopenllm_v2/microsoft/Phi-3-medium-4k-instruct/4173435b-d907-4ac5-a8bd-dfa2759f3fb6.json deleted file 
mode 100644 index 5255ec1d7..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3-medium-4k-instruct/4173435b-d907-4ac5-a8bd-dfa2759f3fb6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3-medium-4k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-medium-4k-instruct", - "id": "microsoft/Phi-3-medium-4k-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6412 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1956 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4258 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4676 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3-mini-128k-instruct/b4a79f30-3a04-4f78-861e-1571316a0642.json b/data/hfopenllm_v2/microsoft/Phi-3-mini-128k-instruct/b4a79f30-3a04-4f78-861e-1571316a0642.json deleted file mode 100644 index 9e63038b5..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3-mini-128k-instruct/b4a79f30-3a04-4f78-861e-1571316a0642.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/microsoft_Phi-3-mini-128k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-mini-128k-instruct", - "id": "microsoft/Phi-3-mini-128k-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5976 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1405 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/53426038-df38-45ba-b621-34231c9cad7f.json b/data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/53426038-df38-45ba-b621-34231c9cad7f.json deleted file mode 100644 index bbd15d9d0..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/53426038-df38-45ba-b621-34231c9cad7f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3-mini-4k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-mini-4k-instruct", - "id": "microsoft/Phi-3-mini-4k-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5491 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/fa758fe5-21ec-45cc-941f-5cb5ca0612b1.json b/data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/fa758fe5-21ec-45cc-941f-5cb5ca0612b1.json deleted file mode 100644 index 14dce77a3..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/fa758fe5-21ec-45cc-941f-5cb5ca0612b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3-mini-4k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-mini-4k-instruct", - "id": "microsoft/Phi-3-mini-4k-instruct", - "developer": "microsoft", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5613 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5676 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3-small-128k-instruct/d2a92a62-3bd0-4cb2-897b-742ea0d5203f.json b/data/hfopenllm_v2/microsoft/Phi-3-small-128k-instruct/d2a92a62-3bd0-4cb2-897b-742ea0d5203f.json deleted file mode 100644 index 4fa6eea19..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3-small-128k-instruct/d2a92a62-3bd0-4cb2-897b-742ea0d5203f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3-small-128k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-small-128k-instruct", - "id": "microsoft/Phi-3-small-128k-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3SmallForCausalLM", - "params_billions": 7.392 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6368 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6202 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2026 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4491 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3-small-8k-instruct/8b752519-63d4-4638-b56e-1c45c7f4694e.json b/data/hfopenllm_v2/microsoft/Phi-3-small-8k-instruct/8b752519-63d4-4638-b56e-1c45c7f4694e.json deleted file mode 100644 index ac0af8542..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3-small-8k-instruct/8b752519-63d4-4638-b56e-1c45c7f4694e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3-small-8k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-small-8k-instruct", - "id": "microsoft/Phi-3-small-8k-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3SmallForCausalLM", - "params_billions": 7.392 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6497 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6208 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1887 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4558 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4506 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3.5-MoE-instruct/8da71b7c-7b73-453f-998b-84e70b54e471.json b/data/hfopenllm_v2/microsoft/Phi-3.5-MoE-instruct/8da71b7c-7b73-453f-998b-84e70b54e471.json deleted file mode 100644 index abd80b617..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3.5-MoE-instruct/8da71b7c-7b73-453f-998b-84e70b54e471.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3.5-MoE-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3.5-MoE-instruct", - "id": "microsoft/Phi-3.5-MoE-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 42.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6408 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3119 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3.5-mini-instruct/2b7b1216-3ea7-48f1-89f6-e5d84fef2b32.json b/data/hfopenllm_v2/microsoft/Phi-3.5-mini-instruct/2b7b1216-3ea7-48f1-89f6-e5d84fef2b32.json deleted file mode 100644 index 5a22577ff..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3.5-mini-instruct/2b7b1216-3ea7-48f1-89f6-e5d84fef2b32.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3.5-mini-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3.5-mini-instruct", - "id": "microsoft/Phi-3.5-mini-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5775 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5518 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3962 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-4-mini-instruct/37e19712-3197-42da-a8f2-ae1f36c2b06c.json b/data/hfopenllm_v2/microsoft/Phi-4-mini-instruct/37e19712-3197-42da-a8f2-ae1f36c2b06c.json deleted file mode 100644 index 7a263b228..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-4-mini-instruct/37e19712-3197-42da-a8f2-ae1f36c2b06c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-4-mini-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-mini-instruct", - "id": "microsoft/Phi-4-mini-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.836 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5689 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1699 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3932 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/phi-1/c6ae6691-64ec-443d-8d76-af614c8cc7f9.json b/data/hfopenllm_v2/microsoft/phi-1/c6ae6691-64ec-443d-8d76-af614c8cc7f9.json deleted file mode 100644 index 84c69a4be..000000000 --- a/data/hfopenllm_v2/microsoft/phi-1/c6ae6691-64ec-443d-8d76-af614c8cc7f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_phi-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-1", - "id": "microsoft/phi-1", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "PhiForCausalLM", - "params_billions": 1.418 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2068 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3139 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3525 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1162 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/phi-1_5/80567722-8c6b-41b9-8103-3bdaedfdb8ee.json b/data/hfopenllm_v2/microsoft/phi-1_5/80567722-8c6b-41b9-8103-3bdaedfdb8ee.json deleted file mode 100644 index 2fc19d941..000000000 --- a/data/hfopenllm_v2/microsoft/phi-1_5/80567722-8c6b-41b9-8103-3bdaedfdb8ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_phi-1_5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-1_5", - "id": "microsoft/phi-1_5", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 1.418 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3404 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1691 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/phi-2/20192dc4-ea3a-4413-8457-18a592fa0c64.json b/data/hfopenllm_v2/microsoft/phi-2/20192dc4-ea3a-4413-8457-18a592fa0c64.json deleted file mode 100644 index 428cbe527..000000000 --- a/data/hfopenllm_v2/microsoft/phi-2/20192dc4-ea3a-4413-8457-18a592fa0c64.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_phi-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-2", - "id": "microsoft/phi-2", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2739 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4881 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4099 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": 
{ - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2628 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/phi-4/8c878c05-86f7-4d61-81d7-9bb286516581.json b/data/hfopenllm_v2/microsoft/phi-4/8c878c05-86f7-4d61-81d7-9bb286516581.json deleted file mode 100644 index 5d88bde05..000000000 --- a/data/hfopenllm_v2/microsoft/phi-4/8c878c05-86f7-4d61-81d7-9bb286516581.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_phi-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4", - "id": "microsoft/phi-4", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6691 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3165 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - } - ] -} \ 
No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/phi-4/fa753be0-4a98-4ec3-9cc9-3bf7b380ad17.json b/data/hfopenllm_v2/microsoft/phi-4/fa753be0-4a98-4ec3-9cc9-3bf7b380ad17.json deleted file mode 100644 index 8eaf9faa9..000000000 --- a/data/hfopenllm_v2/microsoft/phi-4/fa753be0-4a98-4ec3-9cc9-3bf7b380ad17.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_phi-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4", - "id": "microsoft/phi-4", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6703 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2787 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.401 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Llama-3-70B-Synthia-v3.5/0516b46b-a957-413f-aadc-58f4339dc60a.json b/data/hfopenllm_v2/migtissera/Llama-3-70B-Synthia-v3.5/0516b46b-a957-413f-aadc-58f4339dc60a.json deleted file mode 100644 index 2429a8f70..000000000 --- 
a/data/hfopenllm_v2/migtissera/Llama-3-70B-Synthia-v3.5/0516b46b-a957-413f-aadc-58f4339dc60a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Llama-3-70B-Synthia-v3.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-70B-Synthia-v3.5", - "id": "migtissera/Llama-3-70B-Synthia-v3.5", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6076 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Llama-3-8B-Synthia-v3.5/97200dd7-7ed0-4a7b-ace9-31c173f017f1.json b/data/hfopenllm_v2/migtissera/Llama-3-8B-Synthia-v3.5/97200dd7-7ed0-4a7b-ace9-31c173f017f1.json deleted file mode 100644 index 30146a9c1..000000000 --- a/data/hfopenllm_v2/migtissera/Llama-3-8B-Synthia-v3.5/97200dd7-7ed0-4a7b-ace9-31c173f017f1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/migtissera_Llama-3-8B-Synthia-v3.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Synthia-v3.5", - "id": "migtissera/Llama-3-8B-Synthia-v3.5", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4888 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4044 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.303 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Tess-3-7B-SFT/758f8332-ffa8-4059-ac6f-400f9367bb23.json b/data/hfopenllm_v2/migtissera/Tess-3-7B-SFT/758f8332-ffa8-4059-ac6f-400f9367bb23.json deleted file mode 100644 index 45ec18a45..000000000 --- a/data/hfopenllm_v2/migtissera/Tess-3-7B-SFT/758f8332-ffa8-4059-ac6f-400f9367bb23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Tess-3-7B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tess-3-7B-SFT", - "id": "migtissera/Tess-3-7B-SFT", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4113 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3034 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Tess-3-Mistral-Nemo-12B/b1103662-055c-471e-ace8-dd75f607491d.json b/data/hfopenllm_v2/migtissera/Tess-3-Mistral-Nemo-12B/b1103662-055c-471e-ace8-dd75f607491d.json deleted file mode 100644 index ada1e45ae..000000000 --- a/data/hfopenllm_v2/migtissera/Tess-3-Mistral-Nemo-12B/b1103662-055c-471e-ace8-dd75f607491d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Tess-3-Mistral-Nemo-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tess-3-Mistral-Nemo-12B", - "id": "migtissera/Tess-3-Mistral-Nemo-12B", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4899 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4458 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2565 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Tess-v2.5-Phi-3-medium-128k-14B/27b0d675-498f-4351-b92f-7c0d1a3c83bd.json b/data/hfopenllm_v2/migtissera/Tess-v2.5-Phi-3-medium-128k-14B/27b0d675-498f-4351-b92f-7c0d1a3c83bd.json deleted file mode 100644 index 81b7d3cb0..000000000 --- a/data/hfopenllm_v2/migtissera/Tess-v2.5-Phi-3-medium-128k-14B/27b0d675-498f-4351-b92f-7c0d1a3c83bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Tess-v2.5-Phi-3-medium-128k-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tess-v2.5-Phi-3-medium-128k-14B", - "id": "migtissera/Tess-v2.5-Phi-3-medium-128k-14B", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4113 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3732 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Tess-v2.5.2-Qwen2-72B/3f1f88d4-2908-4f28-b8d3-4f9ded18ba0e.json b/data/hfopenllm_v2/migtissera/Tess-v2.5.2-Qwen2-72B/3f1f88d4-2908-4f28-b8d3-4f9ded18ba0e.json deleted file mode 100644 index 82ec5a874..000000000 --- a/data/hfopenllm_v2/migtissera/Tess-v2.5.2-Qwen2-72B/3f1f88d4-2908-4f28-b8d3-4f9ded18ba0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Tess-v2.5.2-Qwen2-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tess-v2.5.2-Qwen2-72B", - "id": "migtissera/Tess-v2.5.2-Qwen2-72B", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4494 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6647 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2938 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/3883b0d3-e442-42d3-adc6-ed959c902dd3.json b/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/3883b0d3-e442-42d3-adc6-ed959c902dd3.json deleted file mode 100644 index 3c3025654..000000000 --- a/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/3883b0d3-e442-42d3-adc6-ed959c902dd3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Trinity-2-Codestral-22B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Trinity-2-Codestral-22B-v0.2", - "id": "migtissera/Trinity-2-Codestral-22B-v0.2", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4345 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5686 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.334 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/da172cdb-1388-42f5-97b1-ae8e15291631.json b/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/da172cdb-1388-42f5-97b1-ae8e15291631.json deleted file mode 100644 index 5c049da80..000000000 --- a/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/da172cdb-1388-42f5-97b1-ae8e15291631.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Trinity-2-Codestral-22B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Trinity-2-Codestral-22B-v0.2", - "id": "migtissera/Trinity-2-Codestral-22B-v0.2", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5706 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B/7c94dbfa-4b3a-43fd-9f2c-b3d63d8ef700.json b/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B/7c94dbfa-4b3a-43fd-9f2c-b3d63d8ef700.json deleted file mode 100644 index 186299b68..000000000 --- a/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B/7c94dbfa-4b3a-43fd-9f2c-b3d63d8ef700.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Trinity-2-Codestral-22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Trinity-2-Codestral-22B", - "id": "migtissera/Trinity-2-Codestral-22B", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5593 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3308 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mindw96/DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3/7cdd1de0-767d-4527-a024-c67166bb8b20.json b/data/hfopenllm_v2/mindw96/DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3/7cdd1de0-767d-4527-a024-c67166bb8b20.json deleted file mode 100644 index 036d983ff..000000000 --- a/data/hfopenllm_v2/mindw96/DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3/7cdd1de0-767d-4527-a024-c67166bb8b20.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mindw96_DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3", - "id": "mindw96/DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3", - "developer": "mindw96", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1106 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/minghaowu/Qwen1.5-1.8B-OpenHermes-2.5/d4702278-54c4-42e8-a901-dfe5c7f2004a.json b/data/hfopenllm_v2/minghaowu/Qwen1.5-1.8B-OpenHermes-2.5/d4702278-54c4-42e8-a901-dfe5c7f2004a.json deleted file mode 100644 index fb2414062..000000000 --- a/data/hfopenllm_v2/minghaowu/Qwen1.5-1.8B-OpenHermes-2.5/d4702278-54c4-42e8-a901-dfe5c7f2004a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/minghaowu_Qwen1.5-1.8B-OpenHermes-2.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-1.8B-OpenHermes-2.5", - "id": "minghaowu/Qwen1.5-1.8B-OpenHermes-2.5", - "developer": "minghaowu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.837 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3375 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1792 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ministral/Ministral-3b-instruct/149f8ee5-4376-4fcc-8f87-7412a3083570.json b/data/hfopenllm_v2/ministral/Ministral-3b-instruct/149f8ee5-4376-4fcc-8f87-7412a3083570.json deleted file mode 100644 index d2bace4f0..000000000 --- a/data/hfopenllm_v2/ministral/Ministral-3b-instruct/149f8ee5-4376-4fcc-8f87-7412a3083570.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ministral_Ministral-3b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ministral-3b-instruct", - "id": "ministral/Ministral-3b-instruct", - "developer": "ministral", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 3.316 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1358 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3192 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistral-community/Mistral-7B-v0.2/de82b746-c5d7-450a-bc2b-1b2859d91d6b.json b/data/hfopenllm_v2/mistral-community/Mistral-7B-v0.2/de82b746-c5d7-450a-bc2b-1b2859d91d6b.json deleted file mode 100644 index 0f1e81e70..000000000 --- a/data/hfopenllm_v2/mistral-community/Mistral-7B-v0.2/de82b746-c5d7-450a-bc2b-1b2859d91d6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistral-community_Mistral-7B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.2", - "id": "mistral-community/Mistral-7B-v0.2", - "developer": "mistral-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistral-community/Mixtral-8x22B-v0.1/d2a916a6-288a-4761-a3fd-ca674edb67c1.json b/data/hfopenllm_v2/mistral-community/Mixtral-8x22B-v0.1/d2a916a6-288a-4761-a3fd-ca674edb67c1.json deleted file mode 100644 index 944c1169a..000000000 --- a/data/hfopenllm_v2/mistral-community/Mixtral-8x22B-v0.1/d2a916a6-288a-4761-a3fd-ca674edb67c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistral-community_Mixtral-8x22B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral-8x22B-v0.1", - "id": "mistral-community/Mixtral-8x22B-v0.1", - "developer": "mistral-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Unknown", - "params_billions": 0.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1543 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3533 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistral-community/mixtral-8x22B-v0.3/cda497f9-c7f9-48d6-944b-0167476e5e5c.json b/data/hfopenllm_v2/mistral-community/mixtral-8x22B-v0.3/cda497f9-c7f9-48d6-944b-0167476e5e5c.json deleted file mode 100644 index 036d6208b..000000000 --- a/data/hfopenllm_v2/mistral-community/mixtral-8x22B-v0.3/cda497f9-c7f9-48d6-944b-0167476e5e5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistral-community_mixtral-8x22B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mixtral-8x22B-v0.3", - "id": "mistral-community/mixtral-8x22B-v0.3", - "developer": "mistral-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 140.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2583 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1835 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4037 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4639 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Codestral-22B-v0.1/b56c6c01-a226-4090-9332-330535d79e24.json b/data/hfopenllm_v2/mistralai/Codestral-22B-v0.1/b56c6c01-a226-4090-9332-330535d79e24.json deleted file mode 100644 index d2f9f4474..000000000 --- a/data/hfopenllm_v2/mistralai/Codestral-22B-v0.1/b56c6c01-a226-4090-9332-330535d79e24.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Codestral-22B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Codestral-22B-v0.1", - "id": "mistralai/Codestral-22B-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5772 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5139 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1005 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - } - 
] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Ministral-8B-Instruct-2410/0ddc8e10-9cc5-48eb-b5b0-a2c2f071862b.json b/data/hfopenllm_v2/mistralai/Ministral-8B-Instruct-2410/0ddc8e10-9cc5-48eb-b5b0-a2c2f071862b.json deleted file mode 100644 index 3393c5511..000000000 --- a/data/hfopenllm_v2/mistralai/Ministral-8B-Instruct-2410/0ddc8e10-9cc5-48eb-b5b0-a2c2f071862b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Ministral-8B-Instruct-2410/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ministral-8B-Instruct-2410", - "id": "mistralai/Ministral-8B-Instruct-2410", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.02 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5896 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4762 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1956 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3291 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.1/2917c469-7e22-497e-8d62-9b9972266658.json 
b/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.1/2917c469-7e22-497e-8d62-9b9972266658.json deleted file mode 100644 index 46b328f13..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.1/2917c469-7e22-497e-8d62-9b9972266658.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-7B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-v0.1", - "id": "mistralai/Mistral-7B-Instruct-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3848 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2414 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.2/2424d85c-e092-4e7c-bf4f-ae014d08a159.json b/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.2/2424d85c-e092-4e7c-bf4f-ae014d08a159.json deleted file mode 100644 index 3c886d188..000000000 --- 
a/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.2/2424d85c-e092-4e7c-bf4f-ae014d08a159.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-7B-Instruct-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-v0.2", - "id": "mistralai/Mistral-7B-Instruct-v0.2", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2717 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.3/90278363-1d8f-47ca-a7dc-c51c6b511dc9.json b/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.3/90278363-1d8f-47ca-a7dc-c51c6b511dc9.json deleted file mode 100644 index e6172895f..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.3/90278363-1d8f-47ca-a7dc-c51c6b511dc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/mistralai_Mistral-7B-Instruct-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-v0.3", - "id": "mistralai/Mistral-7B-Instruct-v0.3", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4722 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3075 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-7B-v0.1/3c3197ee-675d-4bb7-874d-28104d2a3cae.json b/data/hfopenllm_v2/mistralai/Mistral-7B-v0.1/3c3197ee-675d-4bb7-874d-28104d2a3cae.json deleted file mode 100644 index 59e48b5f6..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-7B-v0.1/3c3197ee-675d-4bb7-874d-28104d2a3cae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-7B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.1", - "id": "mistralai/Mistral-7B-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2386 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3013 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-7B-v0.3/eb5a8679-bfdd-40f2-9a32-55c04a65ae7e.json b/data/hfopenllm_v2/mistralai/Mistral-7B-v0.3/eb5a8679-bfdd-40f2-9a32-55c04a65ae7e.json deleted file mode 100644 index ab5aca3af..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-7B-v0.3/eb5a8679-bfdd-40f2-9a32-55c04a65ae7e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-7B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.3", - "id": "mistralai/Mistral-7B-v0.3", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-Large-Instruct-2411/d770f88d-b110-4f27-85e9-e52217c11798.json b/data/hfopenllm_v2/mistralai/Mistral-Large-Instruct-2411/d770f88d-b110-4f27-85e9-e52217c11798.json deleted file mode 100644 index 439c483ad..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-Large-Instruct-2411/d770f88d-b110-4f27-85e9-e52217c11798.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-Large-Instruct-2411/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Large-Instruct-2411", - "id": "mistralai/Mistral-Large-Instruct-2411", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 122.61 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8401 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6747 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.454 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-Nemo-Base-2407/364328ce-5de7-401f-ad84-0c76e3c1dc91.json b/data/hfopenllm_v2/mistralai/Mistral-Nemo-Base-2407/364328ce-5de7-401f-ad84-0c76e3c1dc91.json deleted file mode 100644 index 9a51045d8..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-Nemo-Base-2407/364328ce-5de7-401f-ad84-0c76e3c1dc91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-Nemo-Base-2407/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Base-2407", - "id": "mistralai/Mistral-Nemo-Base-2407", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.58 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.163 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5035 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3921 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3472 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-Nemo-Instruct-2407/f7dcfdbb-ff12-4692-9702-712de3d0b7ba.json b/data/hfopenllm_v2/mistralai/Mistral-Nemo-Instruct-2407/f7dcfdbb-ff12-4692-9702-712de3d0b7ba.json deleted file mode 100644 index 3522fc93d..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-Nemo-Instruct-2407/f7dcfdbb-ff12-4692-9702-712de3d0b7ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-Nemo-Instruct-2407/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Instruct-2407", - "id": "mistralai/Mistral-Nemo-Instruct-2407", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5037 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3517 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-Small-24B-Base-2501/d641aa88-9981-4a25-90d5-fcc4564ede52.json b/data/hfopenllm_v2/mistralai/Mistral-Small-24B-Base-2501/d641aa88-9981-4a25-90d5-fcc4564ede52.json deleted file mode 100644 index f8fd7098d..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-Small-24B-Base-2501/d641aa88-9981-4a25-90d5-fcc4564ede52.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-Small-24B-Base-2501/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-24B-Base-2501", - "id": "mistralai/Mistral-Small-24B-Base-2501", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1672 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.6442 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1971 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/8915e742-df2e-41bc-b83f-3e111edfd257.json b/data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/8915e742-df2e-41bc-b83f-3e111edfd257.json deleted file mode 100644 index b9bddd2f0..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/8915e742-df2e-41bc-b83f-3e111edfd257.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-Small-Instruct-2409/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-Instruct-2409", - "id": "mistralai/Mistral-Small-Instruct-2409", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6283 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.583 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2039 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4063 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4099 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/e29a5e35-8677-4e53-83fd-85e919b4366a.json b/data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/e29a5e35-8677-4e53-83fd-85e919b4366a.json deleted file mode 100644 index 883513689..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/e29a5e35-8677-4e53-83fd-85e919b4366a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-Small-Instruct-2409/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-Instruct-2409", - "id": "mistralai/Mistral-Small-Instruct-2409", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.05 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5213 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3632 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mixtral-8x22B-Instruct-v0.1/e5c55d38-dc04-42b4-9aca-ae7be436ebe0.json b/data/hfopenllm_v2/mistralai/Mixtral-8x22B-Instruct-v0.1/e5c55d38-dc04-42b4-9aca-ae7be436ebe0.json deleted file mode 100644 index 95271fa45..000000000 --- a/data/hfopenllm_v2/mistralai/Mixtral-8x22B-Instruct-v0.1/e5c55d38-dc04-42b4-9aca-ae7be436ebe0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mixtral-8x22B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral-8x22B-Instruct-v0.1", - "id": "mistralai/Mixtral-8x22B-Instruct-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 140.621 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7184 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6125 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4311 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4483 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mixtral-8x22B-v0.1/504baceb-6684-430d-a532-b7b5b0b061fe.json b/data/hfopenllm_v2/mistralai/Mixtral-8x22B-v0.1/504baceb-6684-430d-a532-b7b5b0b061fe.json deleted file mode 100644 index e967e9002..000000000 --- a/data/hfopenllm_v2/mistralai/Mixtral-8x22B-v0.1/504baceb-6684-430d-a532-b7b5b0b061fe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mixtral-8x22B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral-8x22B-v0.1", - "id": "mistralai/Mixtral-8x22B-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 140.621 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2583 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1835 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - 
}, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4037 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4639 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mixtral-8x7B-Instruct-v0.1/31fcd34a-af1e-4eab-bd9a-5ec17eb572d2.json b/data/hfopenllm_v2/mistralai/Mixtral-8x7B-Instruct-v0.1/31fcd34a-af1e-4eab-bd9a-5ec17eb572d2.json deleted file mode 100644 index 659f51d7a..000000000 --- a/data/hfopenllm_v2/mistralai/Mixtral-8x7B-Instruct-v0.1/31fcd34a-af1e-4eab-bd9a-5ec17eb572d2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mixtral-8x7B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral-8x7B-Instruct-v0.1", - "id": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3692 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/01ab0a3e-393a-497a-9b32-8af790b7581a.json b/data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/01ab0a3e-393a-497a-9b32-8af790b7581a.json deleted file mode 100644 index b2775e111..000000000 --- a/data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/01ab0a3e-393a-497a-9b32-8af790b7581a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mixtral-8x7B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral-8x7B-v0.1", - "id": "mistralai/Mixtral-8x7B-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2326 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/541967a6-b856-4dc9-958a-9335197fba99.json b/data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/541967a6-b856-4dc9-958a-9335197fba99.json deleted file mode 100644 index 2f9aa7f43..000000000 --- a/data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/541967a6-b856-4dc9-958a-9335197fba99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mixtral-8x7B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral-8x7B-v0.1", - "id": "mistralai/Mixtral-8x7B-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2415 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5087 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4321 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.385 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mixtao/MixTAO-7Bx2-MoE-v8.1/ee31c801-67cb-46a3-9e39-02e842c0473f.json b/data/hfopenllm_v2/mixtao/MixTAO-7Bx2-MoE-v8.1/ee31c801-67cb-46a3-9e39-02e842c0473f.json deleted file mode 100644 index f557e9ecc..000000000 --- a/data/hfopenllm_v2/mixtao/MixTAO-7Bx2-MoE-v8.1/ee31c801-67cb-46a3-9e39-02e842c0473f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mixtao_MixTAO-7Bx2-MoE-v8.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MixTAO-7Bx2-MoE-v8.1", - "id": "mixtao/MixTAO-7Bx2-MoE-v8.1", - "developer": "mixtao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4463 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mkurman/llama-3.2-MEDIT-3B-o1/65fabe8b-05af-461e-b804-fcff3492da34.json 
b/data/hfopenllm_v2/mkurman/llama-3.2-MEDIT-3B-o1/65fabe8b-05af-461e-b804-fcff3492da34.json deleted file mode 100644 index 3b0161b0b..000000000 --- a/data/hfopenllm_v2/mkurman/llama-3.2-MEDIT-3B-o1/65fabe8b-05af-461e-b804-fcff3492da34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mkurman_llama-3.2-MEDIT-3B-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3.2-MEDIT-3B-o1", - "id": "mkurman/llama-3.2-MEDIT-3B-o1", - "developer": "mkurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2741 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mkurman/phi-4-MedIT-11B-exp-1/7e1a7121-2c9f-4196-bbdd-48aea257f384.json b/data/hfopenllm_v2/mkurman/phi-4-MedIT-11B-exp-1/7e1a7121-2c9f-4196-bbdd-48aea257f384.json deleted file mode 100644 index f017555d6..000000000 --- a/data/hfopenllm_v2/mkurman/phi-4-MedIT-11B-exp-1/7e1a7121-2c9f-4196-bbdd-48aea257f384.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mkurman_phi-4-MedIT-11B-exp-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-MedIT-11B-exp-1", - "id": "mkurman/phi-4-MedIT-11B-exp-1", - "developer": "mkurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 11.514 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5948 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5414 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3848 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mkurman/phi4-MedIT-10B-o1/dd32609c-316e-4511-8791-fcae33a1a506.json b/data/hfopenllm_v2/mkurman/phi4-MedIT-10B-o1/dd32609c-316e-4511-8791-fcae33a1a506.json deleted file mode 100644 index 5207ed657..000000000 --- a/data/hfopenllm_v2/mkurman/phi4-MedIT-10B-o1/dd32609c-316e-4511-8791-fcae33a1a506.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mkurman_phi4-MedIT-10B-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi4-MedIT-10B-o1", - "id": "mkurman/phi4-MedIT-10B-o1", - "developer": "mkurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaMedITForCausalLM", - "params_billions": 10.255 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3463 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mkxu/llama-3-8b-instruct-fpo/d95d7058-49eb-47d7-b790-3a253291d22b.json b/data/hfopenllm_v2/mkxu/llama-3-8b-instruct-fpo/d95d7058-49eb-47d7-b790-3a253291d22b.json deleted file mode 100644 index 9ef45d0ae..000000000 --- a/data/hfopenllm_v2/mkxu/llama-3-8b-instruct-fpo/d95d7058-49eb-47d7-b790-3a253291d22b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mkxu_llama-3-8b-instruct-fpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-fpo", - "id": "mkxu/llama-3-8b-instruct-fpo", - "developer": "mkxu", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4959 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3605 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mkxu/llama-3-8b-po1/37cbc3d6-1198-4e23-b86c-1fd979eacd9a.json b/data/hfopenllm_v2/mkxu/llama-3-8b-po1/37cbc3d6-1198-4e23-b86c-1fd979eacd9a.json deleted file mode 100644 index db56d51b6..000000000 --- a/data/hfopenllm_v2/mkxu/llama-3-8b-po1/37cbc3d6-1198-4e23-b86c-1fd979eacd9a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mkxu_llama-3-8b-po1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-po1", - "id": "mkxu/llama-3-8b-po1", - "developer": "mkxu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4976 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/AlphaMonarch-7B/76d0d338-e502-4638-adad-c4c4df00c26f.json b/data/hfopenllm_v2/mlabonne/AlphaMonarch-7B/76d0d338-e502-4638-adad-c4c4df00c26f.json deleted file mode 100644 index e4a056c61..000000000 --- a/data/hfopenllm_v2/mlabonne/AlphaMonarch-7B/76d0d338-e502-4638-adad-c4c4df00c26f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_AlphaMonarch-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AlphaMonarch-7B", - "id": "mlabonne/AlphaMonarch-7B", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4939 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4121 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/Beyonder-4x7B-v3/f47375bd-547a-4d0b-8c96-bbe2bc1ac445.json b/data/hfopenllm_v2/mlabonne/Beyonder-4x7B-v3/f47375bd-547a-4d0b-8c96-bbe2bc1ac445.json deleted file mode 100644 index ae8e73867..000000000 --- a/data/hfopenllm_v2/mlabonne/Beyonder-4x7B-v3/f47375bd-547a-4d0b-8c96-bbe2bc1ac445.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_Beyonder-4x7B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Beyonder-4x7B-v3", - "id": "mlabonne/Beyonder-4x7B-v3", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5608 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4671 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2512 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/BigQwen2.5-52B-Instruct/6b1ed68c-3099-4bd7-892b-cdc36c90ccfe.json b/data/hfopenllm_v2/mlabonne/BigQwen2.5-52B-Instruct/6b1ed68c-3099-4bd7-892b-cdc36c90ccfe.json deleted file mode 100644 index 7401e378b..000000000 --- a/data/hfopenllm_v2/mlabonne/BigQwen2.5-52B-Instruct/6b1ed68c-3099-4bd7-892b-cdc36c90ccfe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_BigQwen2.5-52B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BigQwen2.5-52B-Instruct", - "id": "mlabonne/BigQwen2.5-52B-Instruct", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 52.268 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7913 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7121 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4113 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5519 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/BigQwen2.5-Echo-47B-Instruct/0e59c8ca-cde0-4482-ab03-3309bcb8737c.json b/data/hfopenllm_v2/mlabonne/BigQwen2.5-Echo-47B-Instruct/0e59c8ca-cde0-4482-ab03-3309bcb8737c.json deleted file mode 100644 index 015017389..000000000 --- a/data/hfopenllm_v2/mlabonne/BigQwen2.5-Echo-47B-Instruct/0e59c8ca-cde0-4482-ab03-3309bcb8737c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_BigQwen2.5-Echo-47B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BigQwen2.5-Echo-47B-Instruct", - "id": "mlabonne/BigQwen2.5-Echo-47B-Instruct", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 47.392 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7357 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6125 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.4381 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v2/d7e900e2-0574-44cd-a68a-0dd2715cf48c.json b/data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v2/d7e900e2-0574-44cd-a68a-0dd2715cf48c.json deleted file mode 100644 index e8c365a9a..000000000 --- a/data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v2/d7e900e2-0574-44cd-a68a-0dd2715cf48c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_ChimeraLlama-3-8B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChimeraLlama-3-8B-v2", - "id": "mlabonne/ChimeraLlama-3-8B-v2", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4469 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5046 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3569 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v3/fd626c3f-566d-4193-9a85-e7c9a89e671c.json b/data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v3/fd626c3f-566d-4193-9a85-e7c9a89e671c.json deleted file mode 100644 index e51cf0145..000000000 --- a/data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v3/fd626c3f-566d-4193-9a85-e7c9a89e671c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_ChimeraLlama-3-8B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChimeraLlama-3-8B-v3", - "id": "mlabonne/ChimeraLlama-3-8B-v3", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0884 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/Daredevil-8B-abliterated/196b04ae-fd53-400f-9f08-19edd4959f6e.json b/data/hfopenllm_v2/mlabonne/Daredevil-8B-abliterated/196b04ae-fd53-400f-9f08-19edd4959f6e.json deleted file mode 100644 index 8e430f3b6..000000000 --- a/data/hfopenllm_v2/mlabonne/Daredevil-8B-abliterated/196b04ae-fd53-400f-9f08-19edd4959f6e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_Daredevil-8B-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Daredevil-8B-abliterated", - "id": "mlabonne/Daredevil-8B-abliterated", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4426 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/Daredevil-8B/57177299-076a-4506-89a7-ce54af08df4f.json b/data/hfopenllm_v2/mlabonne/Daredevil-8B/57177299-076a-4506-89a7-ce54af08df4f.json deleted file mode 100644 index f05fd6db8..000000000 --- a/data/hfopenllm_v2/mlabonne/Daredevil-8B/57177299-076a-4506-89a7-ce54af08df4f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_Daredevil-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Daredevil-8B", - "id": "mlabonne/Daredevil-8B", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4548 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/Hermes-3-Llama-3.1-70B-lorablated/d3bdf36f-7f89-4b5a-b6cb-847b49200b5b.json b/data/hfopenllm_v2/mlabonne/Hermes-3-Llama-3.1-70B-lorablated/d3bdf36f-7f89-4b5a-b6cb-847b49200b5b.json deleted file mode 100644 index dced4ed52..000000000 --- a/data/hfopenllm_v2/mlabonne/Hermes-3-Llama-3.1-70B-lorablated/d3bdf36f-7f89-4b5a-b6cb-847b49200b5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_Hermes-3-Llama-3.1-70B-lorablated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-3-Llama-3.1-70B-lorablated", - "id": "mlabonne/Hermes-3-Llama-3.1-70B-lorablated", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6693 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5029 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4679 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated/92619b9e-dacf-4d0a-9f8b-6e131af74fa4.json b/data/hfopenllm_v2/mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated/92619b9e-dacf-4d0a-9f8b-6e131af74fa4.json deleted file mode 100644 index 2110b6907..000000000 --- a/data/hfopenllm_v2/mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated/92619b9e-dacf-4d0a-9f8b-6e131af74fa4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_Meta-Llama-3.1-8B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3.1-8B-Instruct-abliterated", - "id": "mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7329 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4874 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3503 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/NeuralBeagle14-7B/cbb408ea-ced6-4f47-9066-d4ff6d604b1e.json 
b/data/hfopenllm_v2/mlabonne/NeuralBeagle14-7B/cbb408ea-ced6-4f47-9066-d4ff6d604b1e.json deleted file mode 100644 index 6fc2a6175..000000000 --- a/data/hfopenllm_v2/mlabonne/NeuralBeagle14-7B/cbb408ea-ced6-4f47-9066-d4ff6d604b1e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_NeuralBeagle14-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralBeagle14-7B", - "id": "mlabonne/NeuralBeagle14-7B", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4935 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4628 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/6999bb02-29fd-4c59-886f-184362afa06e.json b/data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/6999bb02-29fd-4c59-886f-184362afa06e.json deleted file mode 100644 index 6a7692af4..000000000 --- 
a/data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/6999bb02-29fd-4c59-886f-184362afa06e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_NeuralDaredevil-8B-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralDaredevil-8B-abliterated", - "id": "mlabonne/NeuralDaredevil-8B-abliterated", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3841 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/913d1d8e-0b02-4ce5-9b7c-403143a8c880.json b/data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/913d1d8e-0b02-4ce5-9b7c-403143a8c880.json deleted file mode 100644 index 24cd13aa3..000000000 --- a/data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/913d1d8e-0b02-4ce5-9b7c-403143a8c880.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/mlabonne_NeuralDaredevil-8B-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralDaredevil-8B-abliterated", - "id": "mlabonne/NeuralDaredevil-8B-abliterated", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5124 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3802 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/OrpoLlama-3-8B/82c87bc0-29cf-4150-92f5-c80fb0028ea6.json b/data/hfopenllm_v2/mlabonne/OrpoLlama-3-8B/82c87bc0-29cf-4150-92f5-c80fb0028ea6.json deleted file mode 100644 index e19f78336..000000000 --- a/data/hfopenllm_v2/mlabonne/OrpoLlama-3-8B/82c87bc0-29cf-4150-92f5-c80fb0028ea6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_OrpoLlama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OrpoLlama-3-8B", - "id": "mlabonne/OrpoLlama-3-8B", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3653 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2705 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/phixtral-2x2_8/a18834ad-6143-4ce2-9842-471817a60a39.json b/data/hfopenllm_v2/mlabonne/phixtral-2x2_8/a18834ad-6143-4ce2-9842-471817a60a39.json deleted file mode 100644 index f6430bd4f..000000000 --- a/data/hfopenllm_v2/mlabonne/phixtral-2x2_8/a18834ad-6143-4ce2-9842-471817a60a39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_phixtral-2x2_8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phixtral-2x2_8", - "id": "mlabonne/phixtral-2x2_8", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", 
- "params_billions": 4.458 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4889 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlx-community/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32/be900bcf-8ec9-484f-81db-0e83975c1ecd.json b/data/hfopenllm_v2/mlx-community/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32/be900bcf-8ec9-484f-81db-0e83975c1ecd.json deleted file mode 100644 index da8271107..000000000 --- a/data/hfopenllm_v2/mlx-community/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32/be900bcf-8ec9-484f-81db-0e83975c1ecd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlx-community_Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32", - "id": "mlx-community/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32", - "developer": "mlx-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - 
"params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1638 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlx-community/Mistral-Small-24B-Instruct-2501-bf16/d226ccf6-674b-44c6-8b11-d782b59a961a.json b/data/hfopenllm_v2/mlx-community/Mistral-Small-24B-Instruct-2501-bf16/d226ccf6-674b-44c6-8b11-d782b59a961a.json deleted file mode 100644 index b1e204eb5..000000000 --- a/data/hfopenllm_v2/mlx-community/Mistral-Small-24B-Instruct-2501-bf16/d226ccf6-674b-44c6-8b11-d782b59a961a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlx-community_Mistral-Small-24B-Instruct-2501-bf16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-24B-Instruct-2501-bf16", - "id": "mlx-community/Mistral-Small-24B-Instruct-2501-bf16", - "developer": "mlx-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6283 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6713 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3225 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4618 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mmnga/Llama-3-70B-japanese-suzume-vector-v0.1/d8839a1a-8d07-4e0b-bd44-2668c84f750c.json b/data/hfopenllm_v2/mmnga/Llama-3-70B-japanese-suzume-vector-v0.1/d8839a1a-8d07-4e0b-bd44-2668c84f750c.json deleted file mode 100644 index c98ce35a8..000000000 --- a/data/hfopenllm_v2/mmnga/Llama-3-70B-japanese-suzume-vector-v0.1/d8839a1a-8d07-4e0b-bd44-2668c84f750c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mmnga_Llama-3-70B-japanese-suzume-vector-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-70B-japanese-suzume-vector-v0.1", - "id": "mmnga/Llama-3-70B-japanese-suzume-vector-v0.1", - "developer": "mmnga", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6542 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2326 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Llama3-8B-v1.1/e90b04db-2eb3-483a-ab0e-ea8aef821d84.json b/data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Llama3-8B-v1.1/e90b04db-2eb3-483a-ab0e-ea8aef821d84.json deleted file mode 100644 index f696f25a8..000000000 --- a/data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Llama3-8B-v1.1/e90b04db-2eb3-483a-ab0e-ea8aef821d84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mobiuslabsgmbh_DeepSeek-R1-ReDistill-Llama3-8B-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-ReDistill-Llama3-8B-v1.1", - "id": "mobiuslabsgmbh/DeepSeek-R1-ReDistill-Llama3-8B-v1.1", - "developer": "mobiuslabsgmbh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-7B-v1.1/900921ae-fbb2-4488-ab19-18987c1d008d.json b/data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-7B-v1.1/900921ae-fbb2-4488-ab19-18987c1d008d.json deleted file mode 100644 index 195277705..000000000 --- a/data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-7B-v1.1/900921ae-fbb2-4488-ab19-18987c1d008d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mobiuslabsgmbh_DeepSeek-R1-ReDistill-Qwen-7B-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-ReDistill-Qwen-7B-v1.1", - "id": "mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-7B-v1.1", - "developer": "mobiuslabsgmbh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4009 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2326 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/moeru-ai/L3.1-Moe-2x8B-v0.2/0da0a7cd-c075-4bc0-8e88-8acc7212e5c3.json b/data/hfopenllm_v2/moeru-ai/L3.1-Moe-2x8B-v0.2/0da0a7cd-c075-4bc0-8e88-8acc7212e5c3.json deleted file mode 100644 index 3c3e590ae..000000000 --- a/data/hfopenllm_v2/moeru-ai/L3.1-Moe-2x8B-v0.2/0da0a7cd-c075-4bc0-8e88-8acc7212e5c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/moeru-ai_L3.1-Moe-2x8B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Moe-2x8B-v0.2", - "id": "moeru-ai/L3.1-Moe-2x8B-v0.2", - "developer": "moeru-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 13.668 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.5256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1699 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3858 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.1/b50a49cd-2909-4dbe-9c9f-c150abb99845.json b/data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.1/b50a49cd-2909-4dbe-9c9f-c150abb99845.json deleted file mode 100644 index e9612c059..000000000 --- a/data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.1/b50a49cd-2909-4dbe-9c9f-c150abb99845.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/moeru-ai_L3.1-Moe-4x8B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Moe-4x8B-v0.1", - "id": "moeru-ai/L3.1-Moe-4x8B-v0.1", - "developer": "moeru-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.942 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4939 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3609 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3454 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.2/13831d81-a9dd-43c7-bce1-240aad42fbc6.json b/data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.2/13831d81-a9dd-43c7-bce1-240aad42fbc6.json deleted file mode 100644 index 5c203bb82..000000000 --- a/data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.2/13831d81-a9dd-43c7-bce1-240aad42fbc6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/moeru-ai_L3.1-Moe-4x8B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Moe-4x8B-v0.2", - "id": "moeru-ai/L3.1-Moe-4x8B-v0.2", - "developer": "moeru-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.942 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3234 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2763 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/monsterapi/Llama-3_1-8B-Instruct-orca-ORPO/56ea7cb3-3a1e-477a-bac8-26a0fde6297a.json b/data/hfopenllm_v2/monsterapi/Llama-3_1-8B-Instruct-orca-ORPO/56ea7cb3-3a1e-477a-bac8-26a0fde6297a.json deleted file mode 100644 index 4f353589f..000000000 --- a/data/hfopenllm_v2/monsterapi/Llama-3_1-8B-Instruct-orca-ORPO/56ea7cb3-3a1e-477a-bac8-26a0fde6297a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/monsterapi_Llama-3_1-8B-Instruct-orca-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3_1-8B-Instruct-orca-ORPO", - "id": "monsterapi/Llama-3_1-8B-Instruct-orca-ORPO", - "developer": "monsterapi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 16.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/monsterapi/gemma-2-2b-LoRA-MonsterInstruct/8ce19b33-4f2b-4b8d-80bd-1ed399a5e9dd.json b/data/hfopenllm_v2/monsterapi/gemma-2-2b-LoRA-MonsterInstruct/8ce19b33-4f2b-4b8d-80bd-1ed399a5e9dd.json deleted file mode 100644 index 1294499b7..000000000 --- a/data/hfopenllm_v2/monsterapi/gemma-2-2b-LoRA-MonsterInstruct/8ce19b33-4f2b-4b8d-80bd-1ed399a5e9dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/monsterapi_gemma-2-2b-LoRA-MonsterInstruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-LoRA-MonsterInstruct", - "id": "monsterapi/gemma-2-2b-LoRA-MonsterInstruct", - "developer": "monsterapi", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3903 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": 
{ - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1987 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mosaicml/mpt-7b/18ab167d-b72e-4fa9-94a8-09edc641c73f.json b/data/hfopenllm_v2/mosaicml/mpt-7b/18ab167d-b72e-4fa9-94a8-09edc641c73f.json deleted file mode 100644 index 80c3fe30d..000000000 --- a/data/hfopenllm_v2/mosaicml/mpt-7b/18ab167d-b72e-4fa9-94a8-09edc641c73f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mosaicml_mpt-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mpt-7b", - "id": "mosaicml/mpt-7b", - "developer": "mosaicml", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MPTForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2152 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1206 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mosama/Qwen2.5-1.5B-Instruct-CoT-Reflection/7df237ea-29c0-4d0a-9092-c41df4c13aca.json b/data/hfopenllm_v2/mosama/Qwen2.5-1.5B-Instruct-CoT-Reflection/7df237ea-29c0-4d0a-9092-c41df4c13aca.json deleted file mode 100644 index 553a8fd2a..000000000 --- a/data/hfopenllm_v2/mosama/Qwen2.5-1.5B-Instruct-CoT-Reflection/7df237ea-29c0-4d0a-9092-c41df4c13aca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mosama_Qwen2.5-1.5B-Instruct-CoT-Reflection/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1.5B-Instruct-CoT-Reflection", - "id": "mosama/Qwen2.5-1.5B-Instruct-CoT-Reflection", - "developer": "mosama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3212 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mrdayl/OpenCogito/e5dc8caa-2d86-4ff0-af8d-22d85c8faeb0.json b/data/hfopenllm_v2/mrdayl/OpenCogito/e5dc8caa-2d86-4ff0-af8d-22d85c8faeb0.json deleted file mode 100644 index f7201e897..000000000 --- a/data/hfopenllm_v2/mrdayl/OpenCogito/e5dc8caa-2d86-4ff0-af8d-22d85c8faeb0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mrdayl_OpenCogito/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenCogito", - "id": "mrdayl/OpenCogito", - "developer": "mrdayl", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3934 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.472 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3452 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/mrdayl/OpenCognito-r1/01591bb6-9daf-40fb-b802-0a007f4cc388.json b/data/hfopenllm_v2/mrdayl/OpenCognito-r1/01591bb6-9daf-40fb-b802-0a007f4cc388.json deleted file mode 100644 index 745fb3a4c..000000000 --- a/data/hfopenllm_v2/mrdayl/OpenCognito-r1/01591bb6-9daf-40fb-b802-0a007f4cc388.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mrdayl_OpenCognito-r1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenCognito-r1", - "id": "mrdayl/OpenCognito-r1", - "developer": "mrdayl", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1903 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mrdayl/OpenCognito-r2/f6c32abf-bbae-4827-9ce2-29ce20c9463e.json b/data/hfopenllm_v2/mrdayl/OpenCognito-r2/f6c32abf-bbae-4827-9ce2-29ce20c9463e.json deleted file mode 100644 index 45eaf201c..000000000 --- 
a/data/hfopenllm_v2/mrdayl/OpenCognito-r2/f6c32abf-bbae-4827-9ce2-29ce20c9463e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mrdayl_OpenCognito-r2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenCognito-r2", - "id": "mrdayl/OpenCognito-r2", - "developer": "mrdayl", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3462 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mrdayl/OpenCognito/74a6605d-3557-4458-bef5-cc9420434e68.json b/data/hfopenllm_v2/mrdayl/OpenCognito/74a6605d-3557-4458-bef5-cc9420434e68.json deleted file mode 100644 index 6d8c6acce..000000000 --- a/data/hfopenllm_v2/mrdayl/OpenCognito/74a6605d-3557-4458-bef5-cc9420434e68.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mrdayl_OpenCognito/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenCognito", - "id": "mrdayl/OpenCognito", - "developer": "mrdayl", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4706 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3443 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mrdayl/OpenThink/dbe6e126-d35c-4634-a544-adf374ed5d00.json b/data/hfopenllm_v2/mrdayl/OpenThink/dbe6e126-d35c-4634-a544-adf374ed5d00.json deleted file mode 100644 index 383e8e8c7..000000000 --- a/data/hfopenllm_v2/mrdayl/OpenThink/dbe6e126-d35c-4634-a544-adf374ed5d00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mrdayl_OpenThink/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenThink", - "id": "mrdayl/OpenThink", - "developer": "mrdayl", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2885 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.185 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-gsm8k-3e/d68681c1-01e4-4af0-9a81-e0aaed0ae865.json b/data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-gsm8k-3e/d68681c1-01e4-4af0-9a81-e0aaed0ae865.json deleted file mode 100644 index 0b5f7c44b..000000000 --- a/data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-gsm8k-3e/d68681c1-01e4-4af0-9a81-e0aaed0ae865.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mrm8488_phi-4-14B-grpo-gsm8k-3e/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-14B-grpo-gsm8k-3e", - "id": "mrm8488/phi-4-14B-grpo-gsm8k-3e", - "developer": "mrm8488", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", 
- "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6885 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6805 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5268 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-limo/de9620b8-7112-436f-8941-fae2c5e7f9e0.json b/data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-limo/de9620b8-7112-436f-8941-fae2c5e7f9e0.json deleted file mode 100644 index 2b68d13b9..000000000 --- a/data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-limo/de9620b8-7112-436f-8941-fae2c5e7f9e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mrm8488_phi-4-14B-grpo-limo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-14B-grpo-limo", - "id": "mrm8488/phi-4-14B-grpo-limo", - "developer": "mrm8488", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6812 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6785 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5261 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mukaj/Llama-3.1-Hawkish-8B/cafee7ac-deb6-4c4b-af8f-81548648cb14.json b/data/hfopenllm_v2/mukaj/Llama-3.1-Hawkish-8B/cafee7ac-deb6-4c4b-af8f-81548648cb14.json deleted file mode 100644 index 0f6acc63a..000000000 --- a/data/hfopenllm_v2/mukaj/Llama-3.1-Hawkish-8B/cafee7ac-deb6-4c4b-af8f-81548648cb14.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mukaj_Llama-3.1-Hawkish-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Hawkish-8B", - "id": "mukaj/Llama-3.1-Hawkish-8B", - "developer": "mukaj", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4884 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2432 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3967 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/natong19/Mistral-Nemo-Instruct-2407-abliterated/3e3cb617-6f19-4731-b31a-b1f4d88237d5.json b/data/hfopenllm_v2/natong19/Mistral-Nemo-Instruct-2407-abliterated/3e3cb617-6f19-4731-b31a-b1f4d88237d5.json deleted file mode 100644 index 8280be34b..000000000 --- a/data/hfopenllm_v2/natong19/Mistral-Nemo-Instruct-2407-abliterated/3e3cb617-6f19-4731-b31a-b1f4d88237d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/natong19_Mistral-Nemo-Instruct-2407-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Instruct-2407-abliterated", - "id": "natong19/Mistral-Nemo-Instruct-2407-abliterated", - "developer": "natong19", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6392 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5048 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/natong19/Qwen2-7B-Instruct-abliterated/3c2c2c14-d065-4d6c-8c98-44ba8f2ca461.json b/data/hfopenllm_v2/natong19/Qwen2-7B-Instruct-abliterated/3c2c2c14-d065-4d6c-8c98-44ba8f2ca461.json deleted file mode 100644 index f21a3cf8b..000000000 --- a/data/hfopenllm_v2/natong19/Qwen2-7B-Instruct-abliterated/3c2c2c14-d065-4d6c-8c98-44ba8f2ca461.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/natong19_Qwen2-7B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-7B-Instruct-abliterated", - "id": "natong19/Qwen2-7B-Instruct-abliterated", - "developer": "natong19", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5837 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/8909f916-401b-4457-ab8f-2691696049c6.json b/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/8909f916-401b-4457-ab8f-2691696049c6.json deleted file mode 100644 index 642a4f5d2..000000000 --- a/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/8909f916-401b-4457-ab8f-2691696049c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nazimali_Mistral-Nemo-Kurdish-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Kurdish-Instruct", - "id": "nazimali/Mistral-Nemo-Kurdish-Instruct", - "developer": "nazimali", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4699 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/ae191508-7dad-4cac-ad4a-af95d7a15b5d.json b/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/ae191508-7dad-4cac-ad4a-af95d7a15b5d.json deleted file mode 100644 index 030f58d03..000000000 --- a/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/ae191508-7dad-4cac-ad4a-af95d7a15b5d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nazimali_Mistral-Nemo-Kurdish-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Kurdish-Instruct", - "id": "nazimali/Mistral-Nemo-Kurdish-Instruct", - "developer": "nazimali", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4006 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish/507f5047-fac3-415f-b9fa-aae4311fa837.json b/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish/507f5047-fac3-415f-b9fa-aae4311fa837.json deleted file mode 100644 index ef98c1d8b..000000000 --- a/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish/507f5047-fac3-415f-b9fa-aae4311fa837.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nazimali_Mistral-Nemo-Kurdish/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Kurdish", - "id": "nazimali/Mistral-Nemo-Kurdish", - "developer": "nazimali", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3401 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5133 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4116 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/BigKartoffel-mistral-nemo-20B/0ee8716c-74f0-41b4-94a2-efc715150293.json b/data/hfopenllm_v2/nbeerbower/BigKartoffel-mistral-nemo-20B/0ee8716c-74f0-41b4-94a2-efc715150293.json deleted file mode 100644 index b7f28eb06..000000000 --- a/data/hfopenllm_v2/nbeerbower/BigKartoffel-mistral-nemo-20B/0ee8716c-74f0-41b4-94a2-efc715150293.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_BigKartoffel-mistral-nemo-20B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BigKartoffel-mistral-nemo-20B", - "id": "nbeerbower/BigKartoffel-mistral-nemo-20B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 20.427 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5857 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/DoppelKartoffel-Mistral-Nemo-23B/fcf491f4-cf57-4c95-9de1-4702ab5d54c7.json b/data/hfopenllm_v2/nbeerbower/DoppelKartoffel-Mistral-Nemo-23B/fcf491f4-cf57-4c95-9de1-4702ab5d54c7.json deleted file mode 100644 index 59fb04144..000000000 --- a/data/hfopenllm_v2/nbeerbower/DoppelKartoffel-Mistral-Nemo-23B/fcf491f4-cf57-4c95-9de1-4702ab5d54c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_DoppelKartoffel-Mistral-Nemo-23B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DoppelKartoffel-Mistral-Nemo-23B", - "id": "nbeerbower/DoppelKartoffel-Mistral-Nemo-23B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.153 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5218 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3795 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.308 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/DoublePotato-Mistral-Nemo-13B/4fd20259-c7c7-4da5-9013-ae2feb2175b1.json b/data/hfopenllm_v2/nbeerbower/DoublePotato-Mistral-Nemo-13B/4fd20259-c7c7-4da5-9013-ae2feb2175b1.json deleted file mode 100644 index 27b871961..000000000 --- a/data/hfopenllm_v2/nbeerbower/DoublePotato-Mistral-Nemo-13B/4fd20259-c7c7-4da5-9013-ae2feb2175b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_DoublePotato-Mistral-Nemo-13B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DoublePotato-Mistral-Nemo-13B", - "id": "nbeerbower/DoublePotato-Mistral-Nemo-13B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 13.338 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5438 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-1.5B/a7c8c345-cade-48fd-93c0-0f344044d2b5.json b/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-1.5B/a7c8c345-cade-48fd-93c0-0f344044d2b5.json deleted file mode 100644 index cc0a30ad9..000000000 --- a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-1.5B/a7c8c345-cade-48fd-93c0-0f344044d2b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Dumpling-Qwen2.5-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dumpling-Qwen2.5-1.5B", - "id": "nbeerbower/Dumpling-Qwen2.5-1.5B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2772 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-14B/7a8e3986-7688-4a26-a74c-a9bb47cd3e8d.json b/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-14B/7a8e3986-7688-4a26-a74c-a9bb47cd3e8d.json deleted file mode 100644 index 043eb77ba..000000000 --- a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-14B/7a8e3986-7688-4a26-a74c-a9bb47cd3e8d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Dumpling-Qwen2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dumpling-Qwen2.5-14B", - "id": "nbeerbower/Dumpling-Qwen2.5-14B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6451 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3097 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r16/7a2ffb4d-1135-42a1-b28b-3b4e4d014979.json b/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r16/7a2ffb4d-1135-42a1-b28b-3b4e4d014979.json deleted file mode 100644 index 
c9e94af62..000000000 --- a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r16/7a2ffb4d-1135-42a1-b28b-3b4e4d014979.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Dumpling-Qwen2.5-7B-1k-r16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dumpling-Qwen2.5-7B-1k-r16", - "id": "nbeerbower/Dumpling-Qwen2.5-7B-1k-r16", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5214 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2364 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r64-2e-5/25468720-93d7-4f10-a534-30c4976657e8.json b/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r64-2e-5/25468720-93d7-4f10-a534-30c4976657e8.json deleted file mode 100644 index f310a40a3..000000000 --- a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r64-2e-5/25468720-93d7-4f10-a534-30c4976657e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Dumpling-Qwen2.5-7B-1k-r64-2e-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dumpling-Qwen2.5-7B-1k-r64-2e-5", - "id": "nbeerbower/Dumpling-Qwen2.5-7B-1k-r64-2e-5", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5301 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-1.5B/5ba1d617-9d9a-4c3b-b9cc-3224ace129b3.json b/data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-1.5B/5ba1d617-9d9a-4c3b-b9cc-3224ace129b3.json deleted file mode 100644 index d59eb6c26..000000000 --- a/data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-1.5B/5ba1d617-9d9a-4c3b-b9cc-3224ace129b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_EVA-abliterated-TIES-Qwen2.5-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EVA-abliterated-TIES-Qwen2.5-1.5B", - "id": "nbeerbower/EVA-abliterated-TIES-Qwen2.5-1.5B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4115 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3997 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2712 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-14B/27b2b46f-1323-4ddd-9f65-d8fcd9cd6508.json b/data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-14B/27b2b46f-1323-4ddd-9f65-d8fcd9cd6508.json deleted file mode 100644 index af7d95ee5..000000000 --- a/data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-14B/27b2b46f-1323-4ddd-9f65-d8fcd9cd6508.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_EVA-abliterated-TIES-Qwen2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EVA-abliterated-TIES-Qwen2.5-14B", - "id": "nbeerbower/EVA-abliterated-TIES-Qwen2.5-14B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7836 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6372 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5211 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Flammades-Mistral-Nemo-12B/65917125-bb7c-4d64-ba5f-b5e4f67ec332.json b/data/hfopenllm_v2/nbeerbower/Flammades-Mistral-Nemo-12B/65917125-bb7c-4d64-ba5f-b5e4f67ec332.json deleted file mode 100644 index 1fe39793f..000000000 --- a/data/hfopenllm_v2/nbeerbower/Flammades-Mistral-Nemo-12B/65917125-bb7c-4d64-ba5f-b5e4f67ec332.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Flammades-Mistral-Nemo-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Flammades-Mistral-Nemo-12B", - "id": "nbeerbower/Flammades-Mistral-Nemo-12B", - "developer": "nbeerbower", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4806 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Gemma2-Gutenberg-Doppel-9B/30bf22d8-b93a-4775-8073-30e14e15e35d.json b/data/hfopenllm_v2/nbeerbower/Gemma2-Gutenberg-Doppel-9B/30bf22d8-b93a-4775-8073-30e14e15e35d.json deleted file mode 100644 index 4dfbf3259..000000000 --- a/data/hfopenllm_v2/nbeerbower/Gemma2-Gutenberg-Doppel-9B/30bf22d8-b93a-4775-8073-30e14e15e35d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Gemma2-Gutenberg-Doppel-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2-Gutenberg-Doppel-9B", - "id": "nbeerbower/Gemma2-Gutenberg-Doppel-9B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7171 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.587 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4608 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Gutensuppe-mistral-nemo-12B/ff510365-a13d-4e44-9709-59a56e864991.json b/data/hfopenllm_v2/nbeerbower/Gutensuppe-mistral-nemo-12B/ff510365-a13d-4e44-9709-59a56e864991.json deleted file mode 100644 index 0a1e89534..000000000 --- a/data/hfopenllm_v2/nbeerbower/Gutensuppe-mistral-nemo-12B/ff510365-a13d-4e44-9709-59a56e864991.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Gutensuppe-mistral-nemo-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gutensuppe-mistral-nemo-12B", - "id": "nbeerbower/Gutensuppe-mistral-nemo-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2916 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5487 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Hermes2-Gutenberg2-Mistral-7B/6d1eebc4-228b-43f3-b31c-3d5b1591ae2d.json b/data/hfopenllm_v2/nbeerbower/Hermes2-Gutenberg2-Mistral-7B/6d1eebc4-228b-43f3-b31c-3d5b1591ae2d.json deleted file mode 100644 index edbb8c718..000000000 --- a/data/hfopenllm_v2/nbeerbower/Hermes2-Gutenberg2-Mistral-7B/6d1eebc4-228b-43f3-b31c-3d5b1591ae2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Hermes2-Gutenberg2-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes2-Gutenberg2-Mistral-7B", - "id": "nbeerbower/Hermes2-Gutenberg2-Mistral-7B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3721 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4981 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4623 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2993 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Kartoffel-Deepfry-12B/f1e8cdbb-14b7-4959-a053-fb1b37629aff.json b/data/hfopenllm_v2/nbeerbower/Kartoffel-Deepfry-12B/f1e8cdbb-14b7-4959-a053-fb1b37629aff.json deleted file mode 100644 index 318400868..000000000 --- a/data/hfopenllm_v2/nbeerbower/Kartoffel-Deepfry-12B/f1e8cdbb-14b7-4959-a053-fb1b37629aff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Kartoffel-Deepfry-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kartoffel-Deepfry-12B", - "id": "nbeerbower/Kartoffel-Deepfry-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4792 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Llama-3.1-Nemotron-lorablated-70B/4145d1a0-8d6a-4d64-8a45-a89cf343ac46.json b/data/hfopenllm_v2/nbeerbower/Llama-3.1-Nemotron-lorablated-70B/4145d1a0-8d6a-4d64-8a45-a89cf343ac46.json deleted file mode 100644 index 254d39ab3..000000000 --- a/data/hfopenllm_v2/nbeerbower/Llama-3.1-Nemotron-lorablated-70B/4145d1a0-8d6a-4d64-8a45-a89cf343ac46.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Llama-3.1-Nemotron-lorablated-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Nemotron-lorablated-70B", - "id": "nbeerbower/Llama-3.1-Nemotron-lorablated-70B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7229 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6825 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3338 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Llama3.1-Gutenberg-Doppel-70B/d6966190-e254-4902-8472-cac59bfbdbe0.json b/data/hfopenllm_v2/nbeerbower/Llama3.1-Gutenberg-Doppel-70B/d6966190-e254-4902-8472-cac59bfbdbe0.json deleted file mode 100644 index 348fd047b..000000000 --- a/data/hfopenllm_v2/nbeerbower/Llama3.1-Gutenberg-Doppel-70B/d6966190-e254-4902-8472-cac59bfbdbe0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Llama3.1-Gutenberg-Doppel-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-Gutenberg-Doppel-70B", - "id": "nbeerbower/Llama3.1-Gutenberg-Doppel-70B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7092 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6661 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2122 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4897 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4737 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Lyra-Gutenberg-mistral-nemo-12B/5fdb5437-f413-451d-9800-42036cda7686.json b/data/hfopenllm_v2/nbeerbower/Lyra-Gutenberg-mistral-nemo-12B/5fdb5437-f413-451d-9800-42036cda7686.json deleted file mode 100644 index 16901bff0..000000000 --- a/data/hfopenllm_v2/nbeerbower/Lyra-Gutenberg-mistral-nemo-12B/5fdb5437-f413-451d-9800-42036cda7686.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Lyra-Gutenberg-mistral-nemo-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lyra-Gutenberg-mistral-nemo-12B", - "id": "nbeerbower/Lyra-Gutenberg-mistral-nemo-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3495 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5586 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4357 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3628 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg-12B/347577a4-2768-4472-ba48-9b174ad89724.json b/data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg-12B/347577a4-2768-4472-ba48-9b174ad89724.json deleted file mode 100644 index fb5f4a0df..000000000 --- a/data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg-12B/347577a4-2768-4472-ba48-9b174ad89724.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Lyra4-Gutenberg-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lyra4-Gutenberg-12B", - "id": "nbeerbower/Lyra4-Gutenberg-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5387 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4038 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3571 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg2-12B/33af440e-837d-4454-9340-af0d3ee74f77.json b/data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg2-12B/33af440e-837d-4454-9340-af0d3ee74f77.json deleted file mode 100644 index 6dfbf9b42..000000000 --- a/data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg2-12B/33af440e-837d-4454-9340-af0d3ee74f77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Lyra4-Gutenberg2-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lyra4-Gutenberg2-12B", - "id": "nbeerbower/Lyra4-Gutenberg2-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5345 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3972 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mahou-1.5-mistral-nemo-12B-lorablated/1a1f4709-8d05-4905-8105-0c3606d5ef5b.json b/data/hfopenllm_v2/nbeerbower/Mahou-1.5-mistral-nemo-12B-lorablated/1a1f4709-8d05-4905-8105-0c3606d5ef5b.json deleted file mode 100644 index 5eb89a5fa..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mahou-1.5-mistral-nemo-12B-lorablated/1a1f4709-8d05-4905-8105-0c3606d5ef5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mahou-1.5-mistral-nemo-12B-lorablated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mahou-1.5-mistral-nemo-12B-lorablated", - "id": "nbeerbower/Mahou-1.5-mistral-nemo-12B-lorablated", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5496 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Gutenberg-Doppel-7B-FFT/28421948-089b-4487-bb71-a06e5ce74402.json b/data/hfopenllm_v2/nbeerbower/Mistral-Gutenberg-Doppel-7B-FFT/28421948-089b-4487-bb71-a06e5ce74402.json deleted file mode 100644 index dd9733de3..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Gutenberg-Doppel-7B-FFT/28421948-089b-4487-bb71-a06e5ce74402.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Gutenberg-Doppel-7B-FFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Gutenberg-Doppel-7B-FFT", - "id": "nbeerbower/Mistral-Gutenberg-Doppel-7B-FFT", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5717 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4076 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.4059 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2729 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B-v2/3fa0c783-9226-4fc8-b3a0-6e960684f43d.json b/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B-v2/3fa0c783-9226-4fc8-b3a0-6e960684f43d.json deleted file mode 100644 index 40710b344..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B-v2/3fa0c783-9226-4fc8-b3a0-6e960684f43d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Nemo-Gutenberg-Doppel-12B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Gutenberg-Doppel-12B-v2", - "id": "nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B-v2", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6536 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3546 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B/743b7fe2-f998-408c-98b1-af02d9c1ee2a.json b/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B/743b7fe2-f998-408c-98b1-af02d9c1ee2a.json deleted file mode 100644 index 0895b9113..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B/743b7fe2-f998-408c-98b1-af02d9c1ee2a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Nemo-Gutenberg-Doppel-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Gutenberg-Doppel-12B", - "id": "nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3567 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4132 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Moderne-12B-FFT-experimental/0039c88b-a881-4ce0-9a0a-a10f1a8cbc70.json b/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Moderne-12B-FFT-experimental/0039c88b-a881-4ce0-9a0a-a10f1a8cbc70.json deleted file mode 100644 index 361dfbd4d..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Moderne-12B-FFT-experimental/0039c88b-a881-4ce0-9a0a-a10f1a8cbc70.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Nemo-Moderne-12B-FFT-experimental/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Moderne-12B-FFT-experimental", - "id": "nbeerbower/Mistral-Nemo-Moderne-12B-FFT-experimental", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5234 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3715 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3455 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v2/87c7fbd9-7648-4d0d-ac9e-8ba85860e335.json b/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v2/87c7fbd9-7648-4d0d-ac9e-8ba85860e335.json deleted file mode 100644 index 85ba56e3d..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v2/87c7fbd9-7648-4d0d-ac9e-8ba85860e335.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Nemo-Prism-12B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Prism-12B-v2", - "id": "nbeerbower/Mistral-Nemo-Prism-12B-v2", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6974 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3567 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v7/6ca3ab87-c05e-46b5-879d-4fc8bf75417b.json 
b/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v7/6ca3ab87-c05e-46b5-879d-4fc8bf75417b.json deleted file mode 100644 index 4e58c2e14..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v7/6ca3ab87-c05e-46b5-879d-4fc8bf75417b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Nemo-Prism-12B-v7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Prism-12B-v7", - "id": "nbeerbower/Mistral-Nemo-Prism-12B-v7", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5521 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4639 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.359 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B/525f1b9f-88a2-459d-bb4a-7c01a0107968.json b/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B/525f1b9f-88a2-459d-bb4a-7c01a0107968.json deleted file mode 100644 index 8eb277f61..000000000 --- 
a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B/525f1b9f-88a2-459d-bb4a-7c01a0107968.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Nemo-Prism-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Prism-12B", - "id": "nbeerbower/Mistral-Nemo-Prism-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6858 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5475 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4626 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3581 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Small-Drummer-22B/503f79be-7f05-4464-ac9f-0f284f1e7965.json b/data/hfopenllm_v2/nbeerbower/Mistral-Small-Drummer-22B/503f79be-7f05-4464-ac9f-0f284f1e7965.json deleted file mode 100644 index f6ad8dc3b..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Small-Drummer-22B/503f79be-7f05-4464-ac9f-0f284f1e7965.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/nbeerbower_Mistral-Small-Drummer-22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-Drummer-22B", - "id": "nbeerbower/Mistral-Small-Drummer-22B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5793 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4064 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Small-Gutenberg-Doppel-22B/86ec7d95-6f6d-4ca6-97d5-7a910f42a06d.json b/data/hfopenllm_v2/nbeerbower/Mistral-Small-Gutenberg-Doppel-22B/86ec7d95-6f6d-4ca6-97d5-7a910f42a06d.json deleted file mode 100644 index ade724a65..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Small-Gutenberg-Doppel-22B/86ec7d95-6f6d-4ca6-97d5-7a910f42a06d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Small-Gutenberg-Doppel-22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open 
LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-Gutenberg-Doppel-22B", - "id": "nbeerbower/Mistral-Small-Gutenberg-Doppel-22B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5859 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3971 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Nemo-Loony-12B-experimental/d472ba79-6592-4f8a-a99c-ec3f71468d3e.json b/data/hfopenllm_v2/nbeerbower/Nemo-Loony-12B-experimental/d472ba79-6592-4f8a-a99c-ec3f71468d3e.json deleted file mode 100644 index cdcb586d6..000000000 --- a/data/hfopenllm_v2/nbeerbower/Nemo-Loony-12B-experimental/d472ba79-6592-4f8a-a99c-ec3f71468d3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Nemo-Loony-12B-experimental/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Nemo-Loony-12B-experimental", - "id": "nbeerbower/Nemo-Loony-12B-experimental", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3734 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3822 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Nemoties-ChatML-12B/6ddc052c-6bda-4d8e-ad97-20d881c8cfb7.json b/data/hfopenllm_v2/nbeerbower/Nemoties-ChatML-12B/6ddc052c-6bda-4d8e-ad97-20d881c8cfb7.json deleted file mode 100644 index 2c18c82d6..000000000 --- a/data/hfopenllm_v2/nbeerbower/Nemoties-ChatML-12B/6ddc052c-6bda-4d8e-ad97-20d881c8cfb7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Nemoties-ChatML-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemoties-ChatML-12B", - "id": "nbeerbower/Nemoties-ChatML-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", 
- "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Qwen2.5-Gutenberg-Doppel-14B/76d1aed8-80fe-4b4f-bd81-ea0d6bf085c4.json b/data/hfopenllm_v2/nbeerbower/Qwen2.5-Gutenberg-Doppel-14B/76d1aed8-80fe-4b4f-bd81-ea0d6bf085c4.json deleted file mode 100644 index c994ceaa2..000000000 --- a/data/hfopenllm_v2/nbeerbower/Qwen2.5-Gutenberg-Doppel-14B/76d1aed8-80fe-4b4f-bd81-ea0d6bf085c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Qwen2.5-Gutenberg-Doppel-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Gutenberg-Doppel-14B", - "id": "nbeerbower/Qwen2.5-Gutenberg-Doppel-14B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8091 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6382 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4921 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/SmolNemo-12B-FFT-experimental/d2845d6e-65dd-4448-901d-d554b3e741f3.json b/data/hfopenllm_v2/nbeerbower/SmolNemo-12B-FFT-experimental/d2845d6e-65dd-4448-901d-d554b3e741f3.json deleted file mode 100644 index baa82798c..000000000 --- a/data/hfopenllm_v2/nbeerbower/SmolNemo-12B-FFT-experimental/d2845d6e-65dd-4448-901d-d554b3e741f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_SmolNemo-12B-FFT-experimental/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolNemo-12B-FFT-experimental", - "id": "nbeerbower/SmolNemo-12B-FFT-experimental", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3336 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1217 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Stella-mistral-nemo-12B-v2/f7dd203f-24d8-4875-878a-12ed99e20cd3.json b/data/hfopenllm_v2/nbeerbower/Stella-mistral-nemo-12B-v2/f7dd203f-24d8-4875-878a-12ed99e20cd3.json deleted file mode 100644 index 2d27ecb57..000000000 --- a/data/hfopenllm_v2/nbeerbower/Stella-mistral-nemo-12B-v2/f7dd203f-24d8-4875-878a-12ed99e20cd3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Stella-mistral-nemo-12B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Stella-mistral-nemo-12B-v2", - "id": "nbeerbower/Stella-mistral-nemo-12B-v2", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3274 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4304 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-27B/287ae246-bee5-4fae-b78f-203491aa8df2.json b/data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-27B/287ae246-bee5-4fae-b78f-203491aa8df2.json deleted file mode 100644 index 883e02d15..000000000 --- a/data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-27B/287ae246-bee5-4fae-b78f-203491aa8df2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_gemma2-gutenberg-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma2-gutenberg-27B", - "id": "nbeerbower/gemma2-gutenberg-27B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2947 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3797 - 
} - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1982 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-9B/9ee493f7-e031-4593-beae-65be17678e00.json b/data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-9B/9ee493f7-e031-4593-beae-65be17678e00.json deleted file mode 100644 index 49d199adb..000000000 --- a/data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-9B/9ee493f7-e031-4593-beae-65be17678e00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_gemma2-gutenberg-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma2-gutenberg-9B", - "id": "nbeerbower/gemma2-gutenberg-9B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5951 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4192 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/llama-3-gutenberg-8B/86b10c6f-41c6-4d0a-ae59-f90e204e466c.json b/data/hfopenllm_v2/nbeerbower/llama-3-gutenberg-8B/86b10c6f-41c6-4d0a-ae59-f90e204e466c.json deleted file mode 100644 index 787cd3d0f..000000000 --- a/data/hfopenllm_v2/nbeerbower/llama-3-gutenberg-8B/86b10c6f-41c6-4d0a-ae59-f90e204e466c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_llama-3-gutenberg-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-gutenberg-8B", - "id": "nbeerbower/llama-3-gutenberg-8B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4994 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/llama3.1-cc-8B/043e3533-7d5c-4d45-bcd8-0dbcc8ca4819.json b/data/hfopenllm_v2/nbeerbower/llama3.1-cc-8B/043e3533-7d5c-4d45-bcd8-0dbcc8ca4819.json deleted file mode 100644 index 17baa63e8..000000000 --- a/data/hfopenllm_v2/nbeerbower/llama3.1-cc-8B/043e3533-7d5c-4d45-bcd8-0dbcc8ca4819.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_llama3.1-cc-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3.1-cc-8B", - "id": "nbeerbower/llama3.1-cc-8B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5068 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4871 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/llama3.1-kartoffeldes-70B/1b3269fb-4b16-42b6-80c0-3d54bc2b4fed.json b/data/hfopenllm_v2/nbeerbower/llama3.1-kartoffeldes-70B/1b3269fb-4b16-42b6-80c0-3d54bc2b4fed.json deleted file mode 100644 index ed92bd808..000000000 --- a/data/hfopenllm_v2/nbeerbower/llama3.1-kartoffeldes-70B/1b3269fb-4b16-42b6-80c0-3d54bc2b4fed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_llama3.1-kartoffeldes-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3.1-kartoffeldes-70B", - "id": "nbeerbower/llama3.1-kartoffeldes-70B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6894 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4646 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4988 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades-12B/ee625c29-62c4-49da-9790-e7e67233157d.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades-12B/ee625c29-62c4-49da-9790-e7e67233157d.json deleted file mode 100644 index 75703bbb6..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades-12B/ee625c29-62c4-49da-9790-e7e67233157d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-bophades-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-bophades-12B", - "id": "nbeerbower/mistral-nemo-bophades-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6794 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4988 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4178 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades3-12B/02b16bf2-62bb-401e-9726-2135d8d610be.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades3-12B/02b16bf2-62bb-401e-9726-2135d8d610be.json deleted file mode 100644 index c34d6ed08..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades3-12B/02b16bf2-62bb-401e-9726-2135d8d610be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-bophades3-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-bophades3-12B", - "id": "nbeerbower/mistral-nemo-bophades3-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6578 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4604 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-cc-12B/db10c6f9-2962-46cc-aa4e-4c99c4b494d1.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-cc-12B/db10c6f9-2962-46cc-aa4e-4c99c4b494d1.json deleted file mode 100644 index 25981a535..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-cc-12B/db10c6f9-2962-46cc-aa4e-4c99c4b494d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-cc-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-cc-12B", - "id": "nbeerbower/mistral-nemo-cc-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3598 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutades-12B/aa37bda0-2e0a-4361-a5b4-468154d8ac72.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutades-12B/aa37bda0-2e0a-4361-a5b4-468154d8ac72.json deleted file mode 100644 index 4cebf3ef7..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutades-12B/aa37bda0-2e0a-4361-a5b4-468154d8ac72.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-gutades-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-gutades-12B", - "id": "nbeerbower/mistral-nemo-gutades-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3425 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5407 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.404 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v2/d9a6565c-5a0b-4893-b6e0-1fc52ec55bf5.json 
b/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v2/d9a6565c-5a0b-4893-b6e0-1fc52ec55bf5.json deleted file mode 100644 index f920c7857..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v2/d9a6565c-5a0b-4893-b6e0-1fc52ec55bf5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-gutenberg-12B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-gutenberg-12B-v2", - "id": "nbeerbower/mistral-nemo-gutenberg-12B-v2", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3499 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v3/becf9805-83a9-4137-a938-81a61a10e4f0.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v3/becf9805-83a9-4137-a938-81a61a10e4f0.json deleted file mode 100644 index b760c9b8c..000000000 --- 
a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v3/becf9805-83a9-4137-a938-81a61a10e4f0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-gutenberg-12B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-gutenberg-12B-v3", - "id": "nbeerbower/mistral-nemo-gutenberg-12B-v3", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v4/6e848120-bc31-4628-af05-30707a6dcc41.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v4/6e848120-bc31-4628-af05-30707a6dcc41.json deleted file mode 100644 index ab9d229a4..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v4/6e848120-bc31-4628-af05-30707a6dcc41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-gutenberg-12B-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-gutenberg-12B-v4", - "id": "nbeerbower/mistral-nemo-gutenberg-12B-v4", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5269 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4104 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B/864af855-71b0-4b11-ae3f-56294a7d0db9.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B/864af855-71b0-4b11-ae3f-56294a7d0db9.json deleted file mode 100644 index c29fbdb15..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B/864af855-71b0-4b11-ae3f-56294a7d0db9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-gutenberg-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM 
v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-gutenberg-12B", - "id": "nbeerbower/mistral-nemo-gutenberg-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3504 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5281 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg2-12B-test/285bd390-1dd9-4db2-af45-68dea557da3c.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg2-12B-test/285bd390-1dd9-4db2-af45-68dea557da3c.json deleted file mode 100644 index dc74a419a..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg2-12B-test/285bd390-1dd9-4db2-af45-68dea557da3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-gutenberg2-12B-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"mistral-nemo-gutenberg2-12B-test", - "id": "nbeerbower/mistral-nemo-gutenberg2-12B-test", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3385 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5255 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4157 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-kartoffel-12B/459e2375-1a15-4129-bee0-dc8852d531e2.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-kartoffel-12B/459e2375-1a15-4129-bee0-dc8852d531e2.json deleted file mode 100644 index 0d4219fba..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-kartoffel-12B/459e2375-1a15-4129-bee0-dc8852d531e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-kartoffel-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-kartoffel-12B", - "id": "nbeerbower/mistral-nemo-kartoffel-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7032 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4653 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3585 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-narwhal-12B/7b4c7d92-f581-4057-bec9-e3a8c6a5386e.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-narwhal-12B/7b4c7d92-f581-4057-bec9-e3a8c6a5386e.json deleted file mode 100644 index d17ad6414..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-narwhal-12B/7b4c7d92-f581-4057-bec9-e3a8c6a5386e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-narwhal-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-narwhal-12B", - "id": "nbeerbower/mistral-nemo-narwhal-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5057 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3483 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-wissenschaft-12B/7ceab841-f9a3-455b-9314-243d8fc3cd11.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-wissenschaft-12B/7ceab841-f9a3-455b-9314-243d8fc3cd11.json deleted file mode 100644 index 5c64e8342..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-wissenschaft-12B/7ceab841-f9a3-455b-9314-243d8fc3cd11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-wissenschaft-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-wissenschaft-12B", - "id": "nbeerbower/mistral-nemo-wissenschaft-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.504 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbrahme/IndusQ/c1e2fb45-22d8-4eb4-8971-ce89c3048b9e.json b/data/hfopenllm_v2/nbrahme/IndusQ/c1e2fb45-22d8-4eb4-8971-ce89c3048b9e.json deleted file mode 100644 index 21a04901a..000000000 --- a/data/hfopenllm_v2/nbrahme/IndusQ/c1e2fb45-22d8-4eb4-8971-ce89c3048b9e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbrahme_IndusQ/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IndusQ", - "id": "nbrahme/IndusQ", - "developer": "nbrahme", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 1.176 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.244 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/necva/IE-cont-Llama3.1-8B/68cb2ca1-1648-41a2-92b7-969bccdca4ee.json b/data/hfopenllm_v2/necva/IE-cont-Llama3.1-8B/68cb2ca1-1648-41a2-92b7-969bccdca4ee.json deleted file mode 100644 index baefc2323..000000000 --- a/data/hfopenllm_v2/necva/IE-cont-Llama3.1-8B/68cb2ca1-1648-41a2-92b7-969bccdca4ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/necva_IE-cont-Llama3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IE-cont-Llama3.1-8B", - "id": "necva/IE-cont-Llama3.1-8B", - "developer": "necva", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2049 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/necva/replica-IEPile/5f285d61-5e4b-4c5c-8960-c10313d76ae3.json b/data/hfopenllm_v2/necva/replica-IEPile/5f285d61-5e4b-4c5c-8960-c10313d76ae3.json deleted file mode 100644 index 065eb3646..000000000 --- a/data/hfopenllm_v2/necva/replica-IEPile/5f285d61-5e4b-4c5c-8960-c10313d76ae3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/necva_replica-IEPile/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "replica-IEPile", - "id": "necva/replica-IEPile", - "developer": "necva", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.65 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4678 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4779 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.1-bf16-falcon3-7b-instruct/3af19898-8590-4aec-b324-46c7fbf596d3.json b/data/hfopenllm_v2/neopolita/jessi-v0.1-bf16-falcon3-7b-instruct/3af19898-8590-4aec-b324-46c7fbf596d3.json deleted file mode 100644 index a9a8b3aca..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.1-bf16-falcon3-7b-instruct/3af19898-8590-4aec-b324-46c7fbf596d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.1-bf16-falcon3-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.1-bf16-falcon3-7b-instruct", - "id": "neopolita/jessi-v0.1-bf16-falcon3-7b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4825 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.1-falcon3-10b-instruct/e8472266-6d03-439f-bd6b-e3ac5ef2cf09.json b/data/hfopenllm_v2/neopolita/jessi-v0.1-falcon3-10b-instruct/e8472266-6d03-439f-bd6b-e3ac5ef2cf09.json deleted file mode 100644 index e86ad0154..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.1-falcon3-10b-instruct/e8472266-6d03-439f-bd6b-e3ac5ef2cf09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.1-falcon3-10b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.1-falcon3-10b-instruct", - "id": "neopolita/jessi-v0.1-falcon3-10b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7552 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5953 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2002 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.1-qwen2.5-7b-instruct/3f578b45-48f9-4022-991c-32a71706aba3.json b/data/hfopenllm_v2/neopolita/jessi-v0.1-qwen2.5-7b-instruct/3f578b45-48f9-4022-991c-32a71706aba3.json deleted file mode 100644 index 428e15bc6..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.1-qwen2.5-7b-instruct/3f578b45-48f9-4022-991c-32a71706aba3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.1-qwen2.5-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.1-qwen2.5-7b-instruct", - "id": "neopolita/jessi-v0.1-qwen2.5-7b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7327 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.1-virtuoso-small/ef8c22a7-3898-422e-88e2-1a8c14ab5bf2.json b/data/hfopenllm_v2/neopolita/jessi-v0.1-virtuoso-small/ef8c22a7-3898-422e-88e2-1a8c14ab5bf2.json deleted file mode 100644 index d6a3717c7..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.1-virtuoso-small/ef8c22a7-3898-422e-88e2-1a8c14ab5bf2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.1-virtuoso-small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.1-virtuoso-small", - "id": "neopolita/jessi-v0.1-virtuoso-small", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7959 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6443 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3399 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4362 - } 
- }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.513 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-10b-instruct/81630ea2-d496-4872-92b7-e476badaf50d.json b/data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-10b-instruct/81630ea2-d496-4872-92b7-e476badaf50d.json deleted file mode 100644 index 26642a60f..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-10b-instruct/81630ea2-d496-4872-92b7-e476badaf50d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.2-falcon3-10b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.2-falcon3-10b-instruct", - "id": "neopolita/jessi-v0.2-falcon3-10b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6205 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2122 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4281 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-7b-instruct/9436d04a-9c81-47ad-a7b8-496e14058627.json b/data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-7b-instruct/9436d04a-9c81-47ad-a7b8-496e14058627.json deleted file mode 100644 index b8f742e94..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-7b-instruct/9436d04a-9c81-47ad-a7b8-496e14058627.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.2-falcon3-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.2-falcon3-7b-instruct", - "id": "neopolita/jessi-v0.2-falcon3-7b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5771 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2538 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3905 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.3-falcon3-7b-instruct/f1e6e54e-cb97-4980-8957-2190ee5c4c34.json b/data/hfopenllm_v2/neopolita/jessi-v0.3-falcon3-7b-instruct/f1e6e54e-cb97-4980-8957-2190ee5c4c34.json deleted file mode 100644 index ed27b7620..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.3-falcon3-7b-instruct/f1e6e54e-cb97-4980-8957-2190ee5c4c34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.3-falcon3-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.3-falcon3-7b-instruct", - "id": "neopolita/jessi-v0.3-falcon3-7b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4692 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/neopolita/jessi-v0.4-falcon3-7b-instruct/30914dd3-c857-4aaf-b6b9-d1c7e4917e89.json b/data/hfopenllm_v2/neopolita/jessi-v0.4-falcon3-7b-instruct/30914dd3-c857-4aaf-b6b9-d1c7e4917e89.json deleted file mode 100644 index b113a0091..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.4-falcon3-7b-instruct/30914dd3-c857-4aaf-b6b9-d1c7e4917e89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.4-falcon3-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.4-falcon3-7b-instruct", - "id": "neopolita/jessi-v0.4-falcon3-7b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5522 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4971 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.5-falcon3-7b-instruct/1c389a32-68b3-47c0-a6b8-2c2291293002.json 
b/data/hfopenllm_v2/neopolita/jessi-v0.5-falcon3-7b-instruct/1c389a32-68b3-47c0-a6b8-2c2291293002.json deleted file mode 100644 index d7d351e7b..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.5-falcon3-7b-instruct/1c389a32-68b3-47c0-a6b8-2c2291293002.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.5-falcon3-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.5-falcon3-7b-instruct", - "id": "neopolita/jessi-v0.5-falcon3-7b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4865 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.6-falcon3-7b-instruct/e759a217-6571-446d-9bf9-d1512793f307.json b/data/hfopenllm_v2/neopolita/jessi-v0.6-falcon3-7b-instruct/e759a217-6571-446d-9bf9-d1512793f307.json deleted file mode 100644 index 02cb96cd3..000000000 --- 
a/data/hfopenllm_v2/neopolita/jessi-v0.6-falcon3-7b-instruct/e759a217-6571-446d-9bf9-d1512793f307.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.6-falcon3-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.6-falcon3-7b-instruct", - "id": "neopolita/jessi-v0.6-falcon3-7b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7402 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4904 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3957 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/loki-v0.1-virtuoso/753f3b21-7365-4117-b2a0-a91f03ec3d39.json b/data/hfopenllm_v2/neopolita/loki-v0.1-virtuoso/753f3b21-7365-4117-b2a0-a91f03ec3d39.json deleted file mode 100644 index 21bbf2ac9..000000000 --- a/data/hfopenllm_v2/neopolita/loki-v0.1-virtuoso/753f3b21-7365-4117-b2a0-a91f03ec3d39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/neopolita_loki-v0.1-virtuoso/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "loki-v0.1-virtuoso", - "id": "neopolita/loki-v0.1-virtuoso", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7819 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6467 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3391 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b/297ef102-67c1-4e9c-b418-fed026bb1f8a.json b/data/hfopenllm_v2/netcat420/DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b/297ef102-67c1-4e9c-b418-fed026bb1f8a.json deleted file mode 100644 index 504b55a12..000000000 --- a/data/hfopenllm_v2/netcat420/DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b/297ef102-67c1-4e9c-b418-fed026bb1f8a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b", - "id": "netcat420/DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.115 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2877 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/DeepSeek-R1-MFANN-TIES-unretrained-7b/9fbf73d7-7d67-4d6c-a5b9-efc627cd1b2b.json b/data/hfopenllm_v2/netcat420/DeepSeek-R1-MFANN-TIES-unretrained-7b/9fbf73d7-7d67-4d6c-a5b9-efc627cd1b2b.json deleted file mode 100644 index 3a61410cd..000000000 --- a/data/hfopenllm_v2/netcat420/DeepSeek-R1-MFANN-TIES-unretrained-7b/9fbf73d7-7d67-4d6c-a5b9-efc627cd1b2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_DeepSeek-R1-MFANN-TIES-unretrained-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "DeepSeek-R1-MFANN-TIES-unretrained-7b", - "id": "netcat420/DeepSeek-R1-MFANN-TIES-unretrained-7b", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2587 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3086 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Llama3.1-MFANN-8b/b1446577-f13f-434a-a0b4-916091395d4a.json b/data/hfopenllm_v2/netcat420/Llama3.1-MFANN-8b/b1446577-f13f-434a-a0b4-916091395d4a.json deleted file mode 100644 index 4ba8de1e7..000000000 --- a/data/hfopenllm_v2/netcat420/Llama3.1-MFANN-8b/b1446577-f13f-434a-a0b4-916091395d4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Llama3.1-MFANN-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-MFANN-8b", - "id": "netcat420/Llama3.1-MFANN-8b", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4281 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2725 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V2/fc8946aa-8b04-482c-8c05-d026d2af07be.json b/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V2/fc8946aa-8b04-482c-8c05-d026d2af07be.json deleted file mode 100644 index db28a161a..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V2/fc8946aa-8b04-482c-8c05-d026d2af07be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-Llama3.1-Abliterated-SLERP-TIES-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-Llama3.1-Abliterated-SLERP-TIES-V2", - "id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V2", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3522 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V3/fabe3784-948c-4618-9cf0-c76a3ddd3820.json b/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V3/fabe3784-948c-4618-9cf0-c76a3ddd3820.json deleted file mode 100644 index fe6d32e73..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V3/fabe3784-948c-4618-9cf0-c76a3ddd3820.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-Llama3.1-Abliterated-SLERP-TIES-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-Llama3.1-Abliterated-SLERP-TIES-V3", - "id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V3", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4238 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4914 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3741 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V4/736dcf09-6a19-4e88-a790-7a7ee74d8717.json b/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V4/736dcf09-6a19-4e88-a790-7a7ee74d8717.json deleted file mode 100644 index ca9c619d8..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V4/736dcf09-6a19-4e88-a790-7a7ee74d8717.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-Llama3.1-Abliterated-SLERP-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-Llama3.1-Abliterated-SLERP-V4", - "id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-V4", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4909 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3821 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V5/75b4c750-1570-4825-a04a-965c06861fd4.json b/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V5/75b4c750-1570-4825-a04a-965c06861fd4.json deleted file mode 100644 index 2cffceba6..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V5/75b4c750-1570-4825-a04a-965c06861fd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-Llama3.1-Abliterated-SLERP-V5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-Llama3.1-Abliterated-SLERP-V5", - "id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-V5", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4329 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4952 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3445 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-TIES/b7f8b678-2aea-4d41-ba21-2083fc472574.json b/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-TIES/b7f8b678-2aea-4d41-ba21-2083fc472574.json deleted file mode 100644 index 4f314f567..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-TIES/b7f8b678-2aea-4d41-ba21-2083fc472574.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-Llama3.1-Abliterated-Slerp-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-Llama3.1-Abliterated-Slerp-TIES", - "id": "netcat420/MFANN-Llama3.1-Abliterated-Slerp-TIES", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3531 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-V3.2/a8010630-58de-448c-af08-70b8ffec431b.json b/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-V3.2/a8010630-58de-448c-af08-70b8ffec431b.json deleted file mode 100644 index 3eb29349d..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-V3.2/a8010630-58de-448c-af08-70b8ffec431b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-Llama3.1-Abliterated-Slerp-V3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-Llama3.1-Abliterated-Slerp-V3.2", - "id": "netcat420/MFANN-Llama3.1-Abliterated-Slerp-V3.2", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4128 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - 
} - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-SFT/4a0c2ce5-a4b4-4d35-b65d-bbc6e36a649b.json b/data/hfopenllm_v2/netcat420/MFANN-SFT/4a0c2ce5-a4b4-4d35-b65d-bbc6e36a649b.json deleted file mode 100644 index dde4c2499..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-SFT/4a0c2ce5-a4b4-4d35-b65d-bbc6e36a649b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-SFT", - "id": "netcat420/MFANN-SFT", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3682 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4852 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3336 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-abliterated-phi2-merge-unretrained/1132251a-59c7-402e-9957-f9288864508f.json b/data/hfopenllm_v2/netcat420/MFANN-abliterated-phi2-merge-unretrained/1132251a-59c7-402e-9957-f9288864508f.json deleted file mode 100644 index d5dbf98ee..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-abliterated-phi2-merge-unretrained/1132251a-59c7-402e-9957-f9288864508f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-abliterated-phi2-merge-unretrained/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-abliterated-phi2-merge-unretrained", - "id": "netcat420/MFANN-abliterated-phi2-merge-unretrained", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.775 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3005 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4104 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { 
- "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3183 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1478 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-Abliterated-SLERP/e2fac049-8f9f-4b71-bcd3-5746b7d90150.json b/data/hfopenllm_v2/netcat420/MFANN-llama3.1-Abliterated-SLERP/e2fac049-8f9f-4b71-bcd3-5746b7d90150.json deleted file mode 100644 index 3a55b4765..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-Abliterated-SLERP/e2fac049-8f9f-4b71-bcd3-5746b7d90150.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-llama3.1-Abliterated-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-llama3.1-Abliterated-SLERP", - "id": "netcat420/MFANN-llama3.1-Abliterated-SLERP", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2591 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4574 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3.1/d891a1e1-ad65-498f-9ee8-59523c1bfd19.json b/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3.1/d891a1e1-ad65-498f-9ee8-59523c1bfd19.json deleted file mode 100644 index ea3ef3c69..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3.1/d891a1e1-ad65-498f-9ee8-59523c1bfd19.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-llama3.1-abliterated-SLERP-v3.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-llama3.1-abliterated-SLERP-v3.1", - "id": "netcat420/MFANN-llama3.1-abliterated-SLERP-v3.1", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4921 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3/9dd3103f-6c4f-4077-ac27-3a9b0f4a5882.json b/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3/9dd3103f-6c4f-4077-ac27-3a9b0f4a5882.json deleted file mode 100644 index cf3326495..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3/9dd3103f-6c4f-4077-ac27-3a9b0f4a5882.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-llama3.1-abliterated-SLERP-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-llama3.1-abliterated-SLERP-v3", - "id": "netcat420/MFANN-llama3.1-abliterated-SLERP-v3", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4931 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3531 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-v2/ca031f70-5785-46d1-8a58-b279d8340776.json b/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-v2/ca031f70-5785-46d1-8a58-b279d8340776.json deleted file mode 100644 index 27620df52..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-v2/ca031f70-5785-46d1-8a58-b279d8340776.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-llama3.1-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-llama3.1-abliterated-v2", - "id": "netcat420/MFANN-llama3.1-abliterated-v2", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4941 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3845 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3491 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V2/18457711-92b8-4c27-a89a-928fecdf5724.json b/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V2/18457711-92b8-4c27-a89a-928fecdf5724.json deleted file mode 100644 index f7adf105b..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V2/18457711-92b8-4c27-a89a-928fecdf5724.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-phigments-slerp-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-phigments-slerp-V2", - "id": "netcat420/MFANN-phigments-slerp-V2", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3232 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4827 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4037 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2717 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.2/3398aeb8-08a8-4be9-a24c-efeabcaa2139.json b/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.2/3398aeb8-08a8-4be9-a24c-efeabcaa2139.json deleted file mode 100644 index fd602beef..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.2/3398aeb8-08a8-4be9-a24c-efeabcaa2139.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-phigments-slerp-V3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-phigments-slerp-V3.2", - "id": "netcat420/MFANN-phigments-slerp-V3.2", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4809 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2705 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.3/707bc006-4318-41bc-b91b-aa43ca7cba6f.json b/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.3/707bc006-4318-41bc-b91b-aa43ca7cba6f.json deleted file mode 100644 index bc28c5ad6..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.3/707bc006-4318-41bc-b91b-aa43ca7cba6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-phigments-slerp-V3.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-phigments-slerp-V3.3", - "id": "netcat420/MFANN-phigments-slerp-V3.3", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3691 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4895 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3892 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2803 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3b/7bfda919-13be-4b68-8655-99fe6a4605a2.json 
b/data/hfopenllm_v2/netcat420/MFANN3b/7bfda919-13be-4b68-8655-99fe6a4605a2.json deleted file mode 100644 index 1d0c531da..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3b/7bfda919-13be-4b68-8655-99fe6a4605a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3b", - "id": "netcat420/MFANN3b", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2524 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4433 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3606 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2306 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.15/f844e739-5f0d-4db4-ba66-bd33b1290571.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.15/f844e739-5f0d-4db4-ba66-bd33b1290571.json deleted file mode 100644 index da81b800a..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.15/f844e739-5f0d-4db4-ba66-bd33b1290571.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/netcat420_MFANN3bv0.15/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv0.15", - "id": "netcat420/MFANN3bv0.15", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2012 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3958 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2468 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.18/0cde6639-6a89-4682-bb3e-a2a24a1bc8ab.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.18/0cde6639-6a89-4682-bb3e-a2a24a1bc8ab.json deleted file mode 100644 index 691ede716..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.18/0cde6639-6a89-4682-bb3e-a2a24a1bc8ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv0.18/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "MFANN3bv0.18", - "id": "netcat420/MFANN3bv0.18", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2206 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4514 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4024 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.19/87652005-4404-4c45-bd4f-5f63c44adf63.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.19/87652005-4404-4c45-bd4f-5f63c44adf63.json deleted file mode 100644 index b5b586fea..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.19/87652005-4404-4c45-bd4f-5f63c44adf63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv0.19/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv0.19", - "id": "netcat420/MFANN3bv0.19", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4024 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.252 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.20/a7e0bc2d-784d-4719-ac08-d8fa0c29d178.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.20/a7e0bc2d-784d-4719-ac08-d8fa0c29d178.json deleted file mode 100644 index 560868144..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.20/a7e0bc2d-784d-4719-ac08-d8fa0c29d178.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv0.20/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv0.20", - "id": "netcat420/MFANN3bv0.20", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2193 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.21/e8ba93e6-6f90-4169-8403-381b7f9e26ab.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.21/e8ba93e6-6f90-4169-8403-381b7f9e26ab.json deleted file mode 100644 index 12ac93cd4..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.21/e8ba93e6-6f90-4169-8403-381b7f9e26ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv0.21/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv0.21", - "id": "netcat420/MFANN3bv0.21", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1909 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.447 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3759 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.22/ea86b542-3d06-4e71-b49d-17cdd362b465.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.22/ea86b542-3d06-4e71-b49d-17cdd362b465.json deleted file mode 100644 index 48ebc0f8a..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.22/ea86b542-3d06-4e71-b49d-17cdd362b465.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv0.22/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv0.22", - "id": "netcat420/MFANN3bv0.22", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4485 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3521 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.23/15615d2c-46a1-47c7-a273-697e97bdf9f2.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.23/15615d2c-46a1-47c7-a273-697e97bdf9f2.json deleted file mode 100644 index 0a80010af..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.23/15615d2c-46a1-47c7-a273-697e97bdf9f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv0.23/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv0.23", - "id": "netcat420/MFANN3bv0.23", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2048 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3427 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2418 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.24/a2b8da3f-c99e-4dba-b4a2-23739281eaf2.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.24/a2b8da3f-c99e-4dba-b4a2-23739281eaf2.json deleted file mode 100644 index 95741769c..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.24/a2b8da3f-c99e-4dba-b4a2-23739281eaf2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv0.24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv0.24", - "id": "netcat420/MFANN3bv0.24", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.22 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3521 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv1.1/76f3fa3a-1629-4cdd-b457-3a108784b427.json b/data/hfopenllm_v2/netcat420/MFANN3bv1.1/76f3fa3a-1629-4cdd-b457-3a108784b427.json deleted file mode 100644 index 2c3b57cd6..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv1.1/76f3fa3a-1629-4cdd-b457-3a108784b427.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv1.1", - "id": "netcat420/MFANN3bv1.1", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.775 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2507 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3223 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1159 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv1.2/c9e979e1-4433-4a38-8fd4-c14895e74f44.json b/data/hfopenllm_v2/netcat420/MFANN3bv1.2/c9e979e1-4433-4a38-8fd4-c14895e74f44.json deleted file mode 100644 index f3b97324c..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv1.2/c9e979e1-4433-4a38-8fd4-c14895e74f44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv1.2", - "id": "netcat420/MFANN3bv1.2", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.775 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2686 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv1.3/3f2effba-1ab8-476d-b228-ed9491e83adf.json b/data/hfopenllm_v2/netcat420/MFANN3bv1.3/3f2effba-1ab8-476d-b228-ed9491e83adf.json deleted file mode 100644 index 5ea688265..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv1.3/3f2effba-1ab8-476d-b228-ed9491e83adf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv1.3", - "id": "netcat420/MFANN3bv1.3", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2547 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2276 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/netcat420/MFANN3bv1.4/a5f0fb1b-27a7-495f-a010-3307afdb8949.json b/data/hfopenllm_v2/netcat420/MFANN3bv1.4/a5f0fb1b-27a7-495f-a010-3307afdb8949.json deleted file mode 100644 index 6b3a33345..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv1.4/a5f0fb1b-27a7-495f-a010-3307afdb8949.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv1.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv1.4", - "id": "netcat420/MFANN3bv1.4", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4809 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2705 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANNv0.19/22f2aa1d-fff1-430a-9c20-3b32859d9665.json b/data/hfopenllm_v2/netcat420/MFANNv0.19/22f2aa1d-fff1-430a-9c20-3b32859d9665.json deleted file mode 100644 index 4621646f1..000000000 --- 
a/data/hfopenllm_v2/netcat420/MFANNv0.19/22f2aa1d-fff1-430a-9c20-3b32859d9665.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANNv0.19/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANNv0.19", - "id": "netcat420/MFANNv0.19", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4731 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANNv0.20/daff0e6f-d29f-4861-855f-902a0cd9a469.json b/data/hfopenllm_v2/netcat420/MFANNv0.20/daff0e6f-d29f-4861-855f-902a0cd9a469.json deleted file mode 100644 index b815945e1..000000000 --- a/data/hfopenllm_v2/netcat420/MFANNv0.20/daff0e6f-d29f-4861-855f-902a0cd9a469.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANNv0.20/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANNv0.20", - "id": "netcat420/MFANNv0.20", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3479 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4574 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3202 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANNv0.21/0f5cb926-b691-4d57-87f5-290235fd250a.json b/data/hfopenllm_v2/netcat420/MFANNv0.21/0f5cb926-b691-4d57-87f5-290235fd250a.json deleted file mode 100644 index 0d6243929..000000000 --- a/data/hfopenllm_v2/netcat420/MFANNv0.21/0f5cb926-b691-4d57-87f5-290235fd250a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANNv0.21/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANNv0.21", - "id": "netcat420/MFANNv0.21", - "developer": "netcat420", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3233 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4576 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3031 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANNv0.22.1/d9e813da-2966-4901-99f9-c7627c64fc52.json b/data/hfopenllm_v2/netcat420/MFANNv0.22.1/d9e813da-2966-4901-99f9-c7627c64fc52.json deleted file mode 100644 index 7b869bbb5..000000000 --- a/data/hfopenllm_v2/netcat420/MFANNv0.22.1/d9e813da-2966-4901-99f9-c7627c64fc52.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANNv0.22.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANNv0.22.1", - "id": "netcat420/MFANNv0.22.1", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4661 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANNv0.23/4cb98a5b-3eb7-4fa8-adfd-17add38d3332.json b/data/hfopenllm_v2/netcat420/MFANNv0.23/4cb98a5b-3eb7-4fa8-adfd-17add38d3332.json deleted file mode 100644 index 80ebfd3ef..000000000 --- a/data/hfopenllm_v2/netcat420/MFANNv0.23/4cb98a5b-3eb7-4fa8-adfd-17add38d3332.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANNv0.23/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANNv0.23", - "id": "netcat420/MFANNv0.23", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4898 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANNv0.24/f7494fd4-d248-46a6-a46d-f9d8db560aae.json b/data/hfopenllm_v2/netcat420/MFANNv0.24/f7494fd4-d248-46a6-a46d-f9d8db560aae.json deleted file mode 100644 index 89be208aa..000000000 --- a/data/hfopenllm_v2/netcat420/MFANNv0.24/f7494fd4-d248-46a6-a46d-f9d8db560aae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANNv0.24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANNv0.24", - "id": "netcat420/MFANNv0.24", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.479 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3348 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANNv0.25/4b8533d1-7770-435f-ba76-a5c658aabd8f.json b/data/hfopenllm_v2/netcat420/MFANNv0.25/4b8533d1-7770-435f-ba76-a5c658aabd8f.json deleted file mode 100644 index c62b5f82d..000000000 --- a/data/hfopenllm_v2/netcat420/MFANNv0.25/4b8533d1-7770-435f-ba76-a5c658aabd8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANNv0.25/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANNv0.25", - "id": "netcat420/MFANNv0.25", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3467 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4794 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Qwen2.5-7B-nerd-uncensored-v0.9-MFANN/309c7906-0010-4f17-848f-185062d96a26.json b/data/hfopenllm_v2/netcat420/Qwen2.5-7B-nerd-uncensored-v0.9-MFANN/309c7906-0010-4f17-848f-185062d96a26.json deleted file mode 100644 index f9f6fbeee..000000000 --- a/data/hfopenllm_v2/netcat420/Qwen2.5-7B-nerd-uncensored-v0.9-MFANN/309c7906-0010-4f17-848f-185062d96a26.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Qwen2.5-7B-nerd-uncensored-v0.9-MFANN/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v0.9-MFANN", - "id": "netcat420/Qwen2.5-7B-nerd-uncensored-v0.9-MFANN", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5878 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5237 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3376 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3904 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Qwen2.5-7b-MFANN-slerp/f18ab2ab-098b-4e46-8f8d-433b52cdb81b.json b/data/hfopenllm_v2/netcat420/Qwen2.5-7b-MFANN-slerp/f18ab2ab-098b-4e46-8f8d-433b52cdb81b.json deleted file mode 100644 index 25a7240c6..000000000 --- a/data/hfopenllm_v2/netcat420/Qwen2.5-7b-MFANN-slerp/f18ab2ab-098b-4e46-8f8d-433b52cdb81b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Qwen2.5-7b-MFANN-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7b-MFANN-slerp", - "id": "netcat420/Qwen2.5-7b-MFANN-slerp", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5089 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3417 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Qwen2.5-7b-nerd-uncensored-MFANN-slerp/b4a70c71-dfac-4888-937e-d5220b491b0e.json b/data/hfopenllm_v2/netcat420/Qwen2.5-7b-nerd-uncensored-MFANN-slerp/b4a70c71-dfac-4888-937e-d5220b491b0e.json deleted file mode 100644 index 07ded6ed7..000000000 --- a/data/hfopenllm_v2/netcat420/Qwen2.5-7b-nerd-uncensored-MFANN-slerp/b4a70c71-dfac-4888-937e-d5220b491b0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Qwen2.5-7b-nerd-uncensored-MFANN-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7b-nerd-uncensored-MFANN-slerp", - "id": "netcat420/Qwen2.5-7b-nerd-uncensored-MFANN-slerp", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", 
- "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.11 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained/b879a534-6b24-4873-a0e4-e18453540121.json b/data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained/b879a534-6b24-4873-a0e4-e18453540121.json deleted file mode 100644 index 6830a0165..000000000 --- a/data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained/b879a534-6b24-4873-a0e4-e18453540121.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained", - "id": "netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6486 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2991 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4152 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3432 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN/c67ae8f2-596b-4dab-8c4f-768b2f0608b4.json b/data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN/c67ae8f2-596b-4dab-8c4f-768b2f0608b4.json deleted file mode 100644 index 2e55a4ead..000000000 --- a/data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN/c67ae8f2-596b-4dab-8c4f-768b2f0608b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN", - "id": "netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5742 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2568 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4058 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b/7766c638-b4dc-4b2d-8c14-becdb1b709ef.json b/data/hfopenllm_v2/netcat420/Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b/7766c638-b4dc-4b2d-8c14-becdb1b709ef.json deleted file mode 100644 index a0253b996..000000000 --- a/data/hfopenllm_v2/netcat420/Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b/7766c638-b4dc-4b2d-8c14-becdb1b709ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b", - "id": "netcat420/Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2324 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1677 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Qwen2.5-MFANN-7b/dd211bef-3940-4d78-8f7b-a67da81d605b.json b/data/hfopenllm_v2/netcat420/Qwen2.5-MFANN-7b/dd211bef-3940-4d78-8f7b-a67da81d605b.json deleted file mode 100644 index 1928a3f0f..000000000 --- a/data/hfopenllm_v2/netcat420/Qwen2.5-MFANN-7b/dd211bef-3940-4d78-8f7b-a67da81d605b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Qwen2.5-MFANN-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-MFANN-7b", - "id": "netcat420/Qwen2.5-MFANN-7b", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6097 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2787 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3233 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERP-V1.2/87e20b7a-85c8-4845-94b0-ace1e18814cb.json b/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERP-V1.2/87e20b7a-85c8-4845-94b0-ace1e18814cb.json deleted file mode 100644 index 51f2c598a..000000000 --- a/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERP-V1.2/87e20b7a-85c8-4845-94b0-ace1e18814cb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_qwen2.5-MFANN-7b-SLERP-V1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-MFANN-7b-SLERP-V1.2", - "id": "netcat420/qwen2.5-MFANN-7b-SLERP-V1.2", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - } - } - ] 
-} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERPv1.1/9ab01db6-3154-4c5b-b6a2-35479538d332.json b/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERPv1.1/9ab01db6-3154-4c5b-b6a2-35479538d332.json deleted file mode 100644 index 6d923911a..000000000 --- a/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERPv1.1/9ab01db6-3154-4c5b-b6a2-35479538d332.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_qwen2.5-MFANN-7b-SLERPv1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-MFANN-7b-SLERPv1.1", - "id": "netcat420/qwen2.5-MFANN-7b-SLERPv1.1", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2968 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4126 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-v1.1/9d35316a-011d-4e45-ae57-317b53de621f.json 
b/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-v1.1/9d35316a-011d-4e45-ae57-317b53de621f.json deleted file mode 100644 index 6d09d9e25..000000000 --- a/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-v1.1/9d35316a-011d-4e45-ae57-317b53de621f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_qwen2.5-MFANN-7b-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-MFANN-7b-v1.1", - "id": "netcat420/qwen2.5-MFANN-7b-v1.1", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6088 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4967 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2825 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netease-youdao/Confucius-o1-14B/c9e7fec0-b244-4ca1-a117-a52fdd4671a5.json b/data/hfopenllm_v2/netease-youdao/Confucius-o1-14B/c9e7fec0-b244-4ca1-a117-a52fdd4671a5.json deleted file mode 100644 index d488ce1e9..000000000 --- 
a/data/hfopenllm_v2/netease-youdao/Confucius-o1-14B/c9e7fec0-b244-4ca1-a117-a52fdd4671a5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netease-youdao_Confucius-o1-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Confucius-o1-14B", - "id": "netease-youdao/Confucius-o1-14B", - "developer": "netease-youdao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5265 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/newsbang/Homer-7B-v0.1/0659cb01-0d52-42cb-9e3a-2d8cac01692e.json b/data/hfopenllm_v2/newsbang/Homer-7B-v0.1/0659cb01-0d52-42cb-9e3a-2d8cac01692e.json deleted file mode 100644 index 889543f52..000000000 --- a/data/hfopenllm_v2/newsbang/Homer-7B-v0.1/0659cb01-0d52-42cb-9e3a-2d8cac01692e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/newsbang_Homer-7B-v0.1/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-7B-v0.1", - "id": "newsbang/Homer-7B-v0.1", - "developer": "newsbang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6109 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5601 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4357 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/newsbang/Homer-7B-v0.2/98490bb1-70f0-4e7a-8fd6-698ec9fcbd5a.json b/data/hfopenllm_v2/newsbang/Homer-7B-v0.2/98490bb1-70f0-4e7a-8fd6-698ec9fcbd5a.json deleted file mode 100644 index 0e6b4cdcc..000000000 --- a/data/hfopenllm_v2/newsbang/Homer-7B-v0.2/98490bb1-70f0-4e7a-8fd6-698ec9fcbd5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/newsbang_Homer-7B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-7B-v0.2", - "id": "newsbang/Homer-7B-v0.2", - 
"developer": "newsbang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7494 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5517 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.441 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/newsbang/Homer-v0.3-Qwen2.5-7B/6e0f7e7e-8927-436e-95a7-5a7c626ca241.json b/data/hfopenllm_v2/newsbang/Homer-v0.3-Qwen2.5-7B/6e0f7e7e-8927-436e-95a7-5a7c626ca241.json deleted file mode 100644 index b533392a6..000000000 --- a/data/hfopenllm_v2/newsbang/Homer-v0.3-Qwen2.5-7B/6e0f7e7e-8927-436e-95a7-5a7c626ca241.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/newsbang_Homer-v0.3-Qwen2.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-v0.3-Qwen2.5-7B", - "id": "newsbang/Homer-v0.3-Qwen2.5-7B", - "developer": "newsbang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4744 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/newsbang/Homer-v0.4-Qwen2.5-7B/9c5b3f4d-6e0b-482b-b142-dd7b387cae22.json b/data/hfopenllm_v2/newsbang/Homer-v0.4-Qwen2.5-7B/9c5b3f4d-6e0b-482b-b142-dd7b387cae22.json deleted file mode 100644 index c5e3960e9..000000000 --- a/data/hfopenllm_v2/newsbang/Homer-v0.4-Qwen2.5-7B/9c5b3f4d-6e0b-482b-b142-dd7b387cae22.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/newsbang_Homer-v0.4-Qwen2.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-v0.4-Qwen2.5-7B", - "id": "newsbang/Homer-v0.4-Qwen2.5-7B", - "developer": "newsbang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7999 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5533 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2779 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4311 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/newsbang/Homer-v0.5-Qwen2.5-7B/04840708-a4cc-407c-8b2a-876b382920a1.json b/data/hfopenllm_v2/newsbang/Homer-v0.5-Qwen2.5-7B/04840708-a4cc-407c-8b2a-876b382920a1.json deleted file mode 100644 index 63f957333..000000000 --- a/data/hfopenllm_v2/newsbang/Homer-v0.5-Qwen2.5-7B/04840708-a4cc-407c-8b2a-876b382920a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/newsbang_Homer-v0.5-Qwen2.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-v0.5-Qwen2.5-7B", - "id": "newsbang/Homer-v0.5-Qwen2.5-7B", - "developer": "newsbang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7881 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4193 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-72B/83b0844c-70fe-4b63-8ed2-4147390518ee.json b/data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-72B/83b0844c-70fe-4b63-8ed2-4147390518ee.json deleted file mode 100644 index a97243437..000000000 --- a/data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-72B/83b0844c-70fe-4b63-8ed2-4147390518ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/newsbang_Homer-v1.0-Qwen2.5-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-v1.0-Qwen2.5-72B", - "id": "newsbang/Homer-v1.0-Qwen2.5-72B", - "developer": "newsbang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7628 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731 - } 
- }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4677 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-7B/9cf10c60-bee1-4f4f-9e03-c3c10287bded.json b/data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-7B/9cf10c60-bee1-4f4f-9e03-c3c10287bded.json deleted file mode 100644 index c299cf546..000000000 --- a/data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-7B/9cf10c60-bee1-4f4f-9e03-c3c10287bded.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/newsbang_Homer-v1.0-Qwen2.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-v1.0-Qwen2.5-7B", - "id": "newsbang/Homer-v1.0-Qwen2.5-7B", - "developer": "newsbang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3323 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4278 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4535 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nguyentd/FinancialAdvice-Qwen2.5-7B/8e92dd9e-a68c-46ef-9b03-955c06a21437.json b/data/hfopenllm_v2/nguyentd/FinancialAdvice-Qwen2.5-7B/8e92dd9e-a68c-46ef-9b03-955c06a21437.json deleted file mode 100644 index 332c8a2d4..000000000 --- a/data/hfopenllm_v2/nguyentd/FinancialAdvice-Qwen2.5-7B/8e92dd9e-a68c-46ef-9b03-955c06a21437.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nguyentd_FinancialAdvice-Qwen2.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FinancialAdvice-Qwen2.5-7B", - "id": "nguyentd/FinancialAdvice-Qwen2.5-7B", - "developer": "nguyentd", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4731 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4025 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ngxson/MiniThinky-1B-Llama-3.2/dd1139d8-2b44-4516-b24a-1219826f5482.json b/data/hfopenllm_v2/ngxson/MiniThinky-1B-Llama-3.2/dd1139d8-2b44-4516-b24a-1219826f5482.json deleted file mode 100644 index 2e0f2504c..000000000 --- a/data/hfopenllm_v2/ngxson/MiniThinky-1B-Llama-3.2/dd1139d8-2b44-4516-b24a-1219826f5482.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ngxson_MiniThinky-1B-Llama-3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniThinky-1B-Llama-3.2", - "id": "ngxson/MiniThinky-1B-Llama-3.2", - "developer": "ngxson", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2771 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ngxson/MiniThinky-v2-1B-Llama-3.2/e37e86f7-b67b-4f0a-b1bd-92f30842b303.json b/data/hfopenllm_v2/ngxson/MiniThinky-v2-1B-Llama-3.2/e37e86f7-b67b-4f0a-b1bd-92f30842b303.json deleted file mode 100644 index 34a2fa568..000000000 --- a/data/hfopenllm_v2/ngxson/MiniThinky-v2-1B-Llama-3.2/e37e86f7-b67b-4f0a-b1bd-92f30842b303.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ngxson_MiniThinky-v2-1B-Llama-3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniThinky-v2-1B-Llama-3.2", - "id": "ngxson/MiniThinky-v2-1B-Llama-3.2", - "developer": "ngxson", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2963 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nhyha/N3N_Delirium-v1_1030_0227/bc3b55d5-35ca-48b5-832e-8544e145b1b1.json b/data/hfopenllm_v2/nhyha/N3N_Delirium-v1_1030_0227/bc3b55d5-35ca-48b5-832e-8544e145b1b1.json deleted file mode 100644 index c0a263780..000000000 --- a/data/hfopenllm_v2/nhyha/N3N_Delirium-v1_1030_0227/bc3b55d5-35ca-48b5-832e-8544e145b1b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nhyha_N3N_Delirium-v1_1030_0227/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "N3N_Delirium-v1_1030_0227", - "id": "nhyha/N3N_Delirium-v1_1030_0227", - "developer": "nhyha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8023 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5891 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - 
} - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nhyha/N3N_Llama-3.1-8B-Instruct_1028_0216/5757cd3d-c64e-4743-8200-5e610e24bf95.json b/data/hfopenllm_v2/nhyha/N3N_Llama-3.1-8B-Instruct_1028_0216/5757cd3d-c64e-4743-8200-5e610e24bf95.json deleted file mode 100644 index 0ece90da0..000000000 --- a/data/hfopenllm_v2/nhyha/N3N_Llama-3.1-8B-Instruct_1028_0216/5757cd3d-c64e-4743-8200-5e610e24bf95.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nhyha_N3N_Llama-3.1-8B-Instruct_1028_0216/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "N3N_Llama-3.1-8B-Instruct_1028_0216", - "id": "nhyha/N3N_Llama-3.1-8B-Instruct_1028_0216", - "developer": "nhyha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3638 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241029_1532/ae8cd3ad-ce7b-41f4-8e4a-f11002af2e58.json b/data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241029_1532/ae8cd3ad-ce7b-41f4-8e4a-f11002af2e58.json deleted file mode 100644 index 9ba31eded..000000000 --- a/data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241029_1532/ae8cd3ad-ce7b-41f4-8e4a-f11002af2e58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nhyha_N3N_gemma-2-9b-it_20241029_1532/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "N3N_gemma-2-9b-it_20241029_1532", - "id": "nhyha/N3N_gemma-2-9b-it_20241029_1532", - "developer": "nhyha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6752 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5863 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2122 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241110_2026/bee54048-ebb2-4051-a18f-aa85b0f2ce27.json b/data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241110_2026/bee54048-ebb2-4051-a18f-aa85b0f2ce27.json deleted file mode 100644 index 3720f36fb..000000000 --- a/data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241110_2026/bee54048-ebb2-4051-a18f-aa85b0f2ce27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nhyha_N3N_gemma-2-9b-it_20241110_2026/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "N3N_gemma-2-9b-it_20241110_2026", - "id": "nhyha/N3N_gemma-2-9b-it_20241110_2026", - "developer": "nhyha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6283 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5867 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nhyha/merge_Qwen2.5-7B-Instruct_20241023_0314/2f98c85b-5a2e-467e-9626-b1bdefe7bdd7.json 
b/data/hfopenllm_v2/nhyha/merge_Qwen2.5-7B-Instruct_20241023_0314/2f98c85b-5a2e-467e-9626-b1bdefe7bdd7.json deleted file mode 100644 index f76d4c073..000000000 --- a/data/hfopenllm_v2/nhyha/merge_Qwen2.5-7B-Instruct_20241023_0314/2f98c85b-5a2e-467e-9626-b1bdefe7bdd7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nhyha_merge_Qwen2.5-7B-Instruct_20241023_0314/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merge_Qwen2.5-7B-Instruct_20241023_0314", - "id": "nhyha/merge_Qwen2.5-7B-Instruct_20241023_0314", - "developer": "nhyha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4542 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nidum/Nidum-Limitless-Gemma-2B/2c530a3b-888e-4a61-b97b-ea875b30ec9c.json b/data/hfopenllm_v2/nidum/Nidum-Limitless-Gemma-2B/2c530a3b-888e-4a61-b97b-ea875b30ec9c.json deleted file mode 100644 index e6ba40bf6..000000000 --- 
a/data/hfopenllm_v2/nidum/Nidum-Limitless-Gemma-2B/2c530a3b-888e-4a61-b97b-ea875b30ec9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nidum_Nidum-Limitless-Gemma-2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nidum-Limitless-Gemma-2B", - "id": "nidum/Nidum-Limitless-Gemma-2B", - "developer": "nidum", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1174 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nisten/franqwenstein-35b/4c9fb322-735e-4644-8121-088d00f78c5f.json b/data/hfopenllm_v2/nisten/franqwenstein-35b/4c9fb322-735e-4644-8121-088d00f78c5f.json deleted file mode 100644 index d683ac73a..000000000 --- a/data/hfopenllm_v2/nisten/franqwenstein-35b/4c9fb322-735e-4644-8121-088d00f78c5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nisten_franqwenstein-35b/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "franqwenstein-35b", - "id": "nisten/franqwenstein-35b", - "developer": "nisten", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 34.714 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6647 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4035 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5731 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nisten/franqwenstein-35b/e7e7733f-682b-4e68-8f07-85f3ba7a7ae1.json b/data/hfopenllm_v2/nisten/franqwenstein-35b/e7e7733f-682b-4e68-8f07-85f3ba7a7ae1.json deleted file mode 100644 index 12f77c7a5..000000000 --- a/data/hfopenllm_v2/nisten/franqwenstein-35b/e7e7733f-682b-4e68-8f07-85f3ba7a7ae1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nisten_franqwenstein-35b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "franqwenstein-35b", - "id": 
"nisten/franqwenstein-35b", - "developer": "nisten", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 34.714 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6591 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3044 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4681 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5611 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nisten/tqwendo-36b/e9a4e1e2-bd55-4c3d-99eb-8fafd8f6ec44.json b/data/hfopenllm_v2/nisten/tqwendo-36b/e9a4e1e2-bd55-4c3d-99eb-8fafd8f6ec44.json deleted file mode 100644 index 5712a649d..000000000 --- a/data/hfopenllm_v2/nisten/tqwendo-36b/e9a4e1e2-bd55-4c3d-99eb-8fafd8f6ec44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nisten_tqwendo-36b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tqwendo-36b", - "id": "nisten/tqwendo-36b", - "developer": "nisten", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 35.69 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6432 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.8/42ed92b3-63bc-4fa1-bc16-c19bfb73368f.json b/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.8/42ed92b3-63bc-4fa1-bc16-c19bfb73368f.json deleted file mode 100644 index b018debaf..000000000 --- a/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.8/42ed92b3-63bc-4fa1-bc16-c19bfb73368f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_Lion-Lamarck-v.1.0.8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lion-Lamarck-v.1.0.8", - "id": "nlpguy/Lion-Lamarck-v.1.0.8", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5869 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4643 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.9/915ae579-786a-4eb2-a1bb-107a12c9c40d.json b/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.9/915ae579-786a-4eb2-a1bb-107a12c9c40d.json deleted file mode 100644 index e62e07a5e..000000000 --- a/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.9/915ae579-786a-4eb2-a1bb-107a12c9c40d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_Lion-Lamarck-v.1.0.9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lion-Lamarck-v.1.0.9", - "id": "nlpguy/Lion-Lamarck-v.1.0.9", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5918 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4704 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.1.0/3489ffea-a607-4f3d-a0c2-bd17147f244f.json b/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.1.0/3489ffea-a607-4f3d-a0c2-bd17147f244f.json deleted file mode 100644 index 4a3058b4b..000000000 --- a/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.1.0/3489ffea-a607-4f3d-a0c2-bd17147f244f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_Lion-Lamarck-v.1.1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lion-Lamarck-v.1.1.0", - "id": "nlpguy/Lion-Lamarck-v.1.1.0", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5325 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4631 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/Miisce-one/7b5ba8a8-16c3-4169-b97d-13dd5d4f8395.json b/data/hfopenllm_v2/nlpguy/Miisce-one/7b5ba8a8-16c3-4169-b97d-13dd5d4f8395.json deleted file mode 100644 index 39e5855db..000000000 --- a/data/hfopenllm_v2/nlpguy/Miisce-one/7b5ba8a8-16c3-4169-b97d-13dd5d4f8395.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_Miisce-one/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Miisce-one", - "id": "nlpguy/Miisce-one", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6505 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v1/6411c44a-b2b3-4fe3-8ba4-9422a0a0b31e.json b/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v1/6411c44a-b2b3-4fe3-8ba4-9422a0a0b31e.json deleted file mode 100644 index 823d8c02e..000000000 --- a/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v1/6411c44a-b2b3-4fe3-8ba4-9422a0a0b31e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_Mistral-NeMo-Minitron-Upscale-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-NeMo-Minitron-Upscale-v1", - "id": "nlpguy/Mistral-NeMo-Minitron-Upscale-v1", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.451 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1648 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4468 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v2/fe344f84-7428-45af-940f-736275bc4d50.json b/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v2/fe344f84-7428-45af-940f-736275bc4d50.json deleted file mode 100644 index 5887f8230..000000000 --- a/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v2/fe344f84-7428-45af-940f-736275bc4d50.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_Mistral-NeMo-Minitron-Upscale-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-NeMo-Minitron-Upscale-v2", - "id": "nlpguy/Mistral-NeMo-Minitron-Upscale-v2", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.451 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1573 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1927 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v3/60956ea2-8b0b-4e4b-801a-d0689f9d46f4.json b/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v3/60956ea2-8b0b-4e4b-801a-d0689f9d46f4.json deleted file mode 100644 index f652b2940..000000000 --- a/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v3/60956ea2-8b0b-4e4b-801a-d0689f9d46f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_Mistral-NeMo-Minitron-Upscale-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-NeMo-Minitron-Upscale-v3", - "id": "nlpguy/Mistral-NeMo-Minitron-Upscale-v3", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.451 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3052 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/StableProse/1ad54bdc-419a-4dd9-9fbb-d7b7ee7038d1.json b/data/hfopenllm_v2/nlpguy/StableProse/1ad54bdc-419a-4dd9-9fbb-d7b7ee7038d1.json deleted file mode 100644 index 7c16d0d96..000000000 --- a/data/hfopenllm_v2/nlpguy/StableProse/1ad54bdc-419a-4dd9-9fbb-d7b7ee7038d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_StableProse/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StableProse", - "id": "nlpguy/StableProse", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1972 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5117 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4067 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3468 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/StarFusion-alpha1/2ab375f0-2477-48a5-a5d9-0b5d0d7d0a84.json b/data/hfopenllm_v2/nlpguy/StarFusion-alpha1/2ab375f0-2477-48a5-a5d9-0b5d0d7d0a84.json deleted file mode 100644 index 3dae9ae39..000000000 --- a/data/hfopenllm_v2/nlpguy/StarFusion-alpha1/2ab375f0-2477-48a5-a5d9-0b5d0d7d0a84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_StarFusion-alpha1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StarFusion-alpha1", - "id": "nlpguy/StarFusion-alpha1", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3191 - } - } - 
] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/Llama-3.2-4x3B-Instruct/e0525a52-d38c-4b2f-b59b-048b4bf71cb2.json b/data/hfopenllm_v2/noname0202/Llama-3.2-4x3B-Instruct/e0525a52-d38c-4b2f-b59b-048b4bf71cb2.json deleted file mode 100644 index d9ea08253..000000000 --- a/data/hfopenllm_v2/noname0202/Llama-3.2-4x3B-Instruct/e0525a52-d38c-4b2f-b59b-048b4bf71cb2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/noname0202_Llama-3.2-4x3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-4x3B-Instruct", - "id": "noname0202/Llama-3.2-4x3B-Instruct", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 9.949 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7067 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/gemma-2-2b-it-ties/01bc964f-552b-4cda-9ed0-cf720f0c8de4.json 
b/data/hfopenllm_v2/noname0202/gemma-2-2b-it-ties/01bc964f-552b-4cda-9ed0-cf720f0c8de4.json deleted file mode 100644 index 51a2199d0..000000000 --- a/data/hfopenllm_v2/noname0202/gemma-2-2b-it-ties/01bc964f-552b-4cda-9ed0-cf720f0c8de4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/noname0202_gemma-2-2b-it-ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-it-ties", - "id": "noname0202/gemma-2-2b-it-ties", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1266 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4206 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3929 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v1/c9e95c55-978e-485b-8a77-ab2e668e3254.json b/data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v1/c9e95c55-978e-485b-8a77-ab2e668e3254.json deleted file mode 100644 index 088f2166d..000000000 --- 
a/data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v1/c9e95c55-978e-485b-8a77-ab2e668e3254.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/noname0202_gemma-2-9b-sft-jp-en-zh-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-sft-jp-en-zh-v1", - "id": "noname0202/gemma-2-9b-sft-jp-en-zh-v1", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2988 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v2/c71c606b-ccb7-48e9-a6c8-b72205ec6c06.json b/data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v2/c71c606b-ccb7-48e9-a6c8-b72205ec6c06.json deleted file mode 100644 index 26c8c9012..000000000 --- a/data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v2/c71c606b-ccb7-48e9-a6c8-b72205ec6c06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/noname0202_gemma-2-9b-sft-jp-en-zh-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-sft-jp-en-zh-v2", - "id": "noname0202/gemma-2-9b-sft-jp-en-zh-v2", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3612 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3675 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/llama-math-1b-r16-0to512tokens-test/ae1801cb-d112-4d1a-895d-c6743779846a.json b/data/hfopenllm_v2/noname0202/llama-math-1b-r16-0to512tokens-test/ae1801cb-d112-4d1a-895d-c6743779846a.json deleted file mode 100644 index ff9746f84..000000000 --- a/data/hfopenllm_v2/noname0202/llama-math-1b-r16-0to512tokens-test/ae1801cb-d112-4d1a-895d-c6743779846a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/noname0202_llama-math-1b-r16-0to512tokens-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF 
Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-math-1b-r16-0to512tokens-test", - "id": "noname0202/llama-math-1b-r16-0to512tokens-test", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3143 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1728 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/llama-math-1b-r32-0to512tokens-test/008e3601-dfc4-4bc1-bf8b-f5cef43ae098.json b/data/hfopenllm_v2/noname0202/llama-math-1b-r32-0to512tokens-test/008e3601-dfc4-4bc1-bf8b-f5cef43ae098.json deleted file mode 100644 index d00eac878..000000000 --- a/data/hfopenllm_v2/noname0202/llama-math-1b-r32-0to512tokens-test/008e3601-dfc4-4bc1-bf8b-f5cef43ae098.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/noname0202_llama-math-1b-r32-0to512tokens-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "llama-math-1b-r32-0to512tokens-test", - "id": "noname0202/llama-math-1b-r32-0to512tokens-test", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5683 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/llama-math-1b-r32-test/379b315d-96fb-4edb-b2d6-3dc113a10c17.json b/data/hfopenllm_v2/noname0202/llama-math-1b-r32-test/379b315d-96fb-4edb-b2d6-3dc113a10c17.json deleted file mode 100644 index 442e75821..000000000 --- a/data/hfopenllm_v2/noname0202/llama-math-1b-r32-test/379b315d-96fb-4edb-b2d6-3dc113a10c17.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/noname0202_llama-math-1b-r32-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-math-1b-r32-test", - "id": "noname0202/llama-math-1b-r32-test", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5819 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3486 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1781 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/llama-math-1b-r8-512tokens-test/8cd36aa1-6f87-4d4d-a1bf-adc87e0a26c6.json b/data/hfopenllm_v2/noname0202/llama-math-1b-r8-512tokens-test/8cd36aa1-6f87-4d4d-a1bf-adc87e0a26c6.json deleted file mode 100644 index 982f70f06..000000000 --- a/data/hfopenllm_v2/noname0202/llama-math-1b-r8-512tokens-test/8cd36aa1-6f87-4d4d-a1bf-adc87e0a26c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/noname0202_llama-math-1b-r8-512tokens-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-math-1b-r8-512tokens-test", - "id": "noname0202/llama-math-1b-r8-512tokens-test", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3169 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1753 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/notbdq/Qwen2.5-14B-Instruct-1M-GRPO-Reasoning/f76ce244-29f7-44f0-9850-7291f8e4cbf1.json b/data/hfopenllm_v2/notbdq/Qwen2.5-14B-Instruct-1M-GRPO-Reasoning/f76ce244-29f7-44f0-9850-7291f8e4cbf1.json deleted file mode 100644 index 92f477fa6..000000000 --- a/data/hfopenllm_v2/notbdq/Qwen2.5-14B-Instruct-1M-GRPO-Reasoning/f76ce244-29f7-44f0-9850-7291f8e4cbf1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/notbdq_Qwen2.5-14B-Instruct-1M-GRPO-Reasoning/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Instruct-1M-GRPO-Reasoning", - "id": "notbdq/Qwen2.5-14B-Instruct-1M-GRPO-Reasoning", - "developer": "notbdq", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8414 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.485 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nothingiisreal/L3.1-8B-Celeste-V1.5/506871f1-0c87-4e8c-a270-eed7b5da2599.json b/data/hfopenllm_v2/nothingiisreal/L3.1-8B-Celeste-V1.5/506871f1-0c87-4e8c-a270-eed7b5da2599.json deleted file mode 100644 index 6a557a511..000000000 --- a/data/hfopenllm_v2/nothingiisreal/L3.1-8B-Celeste-V1.5/506871f1-0c87-4e8c-a270-eed7b5da2599.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nothingiisreal_L3.1-8B-Celeste-V1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-8B-Celeste-V1.5", - "id": "nothingiisreal/L3.1-8B-Celeste-V1.5", - "developer": "nothingiisreal", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7327 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5012 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1465 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3749 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3704 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v2/c20264fd-b1f9-4e0f-9f6e-1d58f1c18cda.json b/data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v2/c20264fd-b1f9-4e0f-9f6e-1d58f1c18cda.json deleted file mode 100644 index b14841c8a..000000000 --- a/data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v2/c20264fd-b1f9-4e0f-9f6e-1d58f1c18cda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nothingiisreal_MN-12B-Starcannon-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Starcannon-v2", - "id": "nothingiisreal/MN-12B-Starcannon-v2", - "developer": "nothingiisreal", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5004 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v3/59f14dca-923a-41f1-b443-cc3551063f45.json b/data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v3/59f14dca-923a-41f1-b443-cc3551063f45.json deleted file mode 100644 index 9f9ddf54f..000000000 --- a/data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v3/59f14dca-923a-41f1-b443-cc3551063f45.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nothingiisreal_MN-12B-Starcannon-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Starcannon-v3", - "id": "nothingiisreal/MN-12B-Starcannon-v3", - "developer": "nothingiisreal", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5171 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { 
- "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4046 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3265 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceInstruct-1.5B/a1ba054f-b0a1-4827-b7ea-3988aa4cf1f1.json b/data/hfopenllm_v2/nvidia/AceInstruct-1.5B/a1ba054f-b0a1-4827-b7ea-3988aa4cf1f1.json deleted file mode 100644 index e9741ba03..000000000 --- a/data/hfopenllm_v2/nvidia/AceInstruct-1.5B/a1ba054f-b0a1-4827-b7ea-3988aa4cf1f1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceInstruct-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceInstruct-1.5B", - "id": "nvidia/AceInstruct-1.5B", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3948 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3932 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceInstruct-72B/51d8f53f-ad7e-4dae-9e2a-0895729ff790.json b/data/hfopenllm_v2/nvidia/AceInstruct-72B/51d8f53f-ad7e-4dae-9e2a-0895729ff790.json deleted file mode 100644 index 589e694b8..000000000 --- a/data/hfopenllm_v2/nvidia/AceInstruct-72B/51d8f53f-ad7e-4dae-9e2a-0895729ff790.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceInstruct-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceInstruct-72B", - "id": "nvidia/AceInstruct-72B", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6139 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4206 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4874 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceInstruct-7B/421119ea-0da8-4b26-a335-f2e720618c44.json b/data/hfopenllm_v2/nvidia/AceInstruct-7B/421119ea-0da8-4b26-a335-f2e720618c44.json deleted file mode 100644 index 50b4c8bff..000000000 --- a/data/hfopenllm_v2/nvidia/AceInstruct-7B/421119ea-0da8-4b26-a335-f2e720618c44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceInstruct-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceInstruct-7B", - "id": "nvidia/AceInstruct-7B", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5501 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4255 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4177 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceMath-1.5B-Instruct/b0e6bfb2-a8d4-4b1d-859a-aa821f646e57.json b/data/hfopenllm_v2/nvidia/AceMath-1.5B-Instruct/b0e6bfb2-a8d4-4b1d-859a-aa821f646e57.json deleted file mode 100644 index cd6ae73f6..000000000 --- a/data/hfopenllm_v2/nvidia/AceMath-1.5B-Instruct/b0e6bfb2-a8d4-4b1d-859a-aa821f646e57.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceMath-1.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceMath-1.5B-Instruct", - "id": "nvidia/AceMath-1.5B-Instruct", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4024 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MMLU-PRO", 
- "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2064 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceMath-72B-Instruct/7c4c2ccf-7d7b-4d24-802e-20c182290d07.json b/data/hfopenllm_v2/nvidia/AceMath-72B-Instruct/7c4c2ccf-7d7b-4d24-802e-20c182290d07.json deleted file mode 100644 index 8c57ec762..000000000 --- a/data/hfopenllm_v2/nvidia/AceMath-72B-Instruct/7c4c2ccf-7d7b-4d24-802e-20c182290d07.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceMath-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceMath-72B-Instruct", - "id": "nvidia/AceMath-72B-Instruct", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.495 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6402 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4411 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceMath-72B-RM/95212a55-f382-4869-9e11-cfa201ba865b.json b/data/hfopenllm_v2/nvidia/AceMath-72B-RM/95212a55-f382-4869-9e11-cfa201ba865b.json deleted file mode 100644 index a94d6d7b0..000000000 --- a/data/hfopenllm_v2/nvidia/AceMath-72B-RM/95212a55-f382-4869-9e11-cfa201ba865b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceMath-72B-RM/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceMath-72B-RM", - "id": "nvidia/AceMath-72B-RM", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForSequenceClassification", - "params_billions": 71.461 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1413 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2717 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2341 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceMath-7B-Instruct/a7da2118-063c-489f-bb31-40f1b7beeefe.json 
b/data/hfopenllm_v2/nvidia/AceMath-7B-Instruct/a7da2118-063c-489f-bb31-40f1b7beeefe.json deleted file mode 100644 index 66e2a33e8..000000000 --- a/data/hfopenllm_v2/nvidia/AceMath-7B-Instruct/a7da2118-063c-489f-bb31-40f1b7beeefe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceMath-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceMath-7B-Instruct", - "id": "nvidia/AceMath-7B-Instruct", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4994 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4193 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceMath-7B-RM/9a75ae18-8f9a-40a5-8a7b-0c38df34e9dd.json b/data/hfopenllm_v2/nvidia/AceMath-7B-RM/9a75ae18-8f9a-40a5-8a7b-0c38df34e9dd.json deleted file mode 100644 index 9eb39d05c..000000000 --- a/data/hfopenllm_v2/nvidia/AceMath-7B-RM/9a75ae18-8f9a-40a5-8a7b-0c38df34e9dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceMath-7B-RM/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceMath-7B-RM", - "id": "nvidia/AceMath-7B-RM", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForSequenceClassification", - "params_billions": 7.071 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1494 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2423 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Hymba-1.5B-Base/a85d4a1f-fbd9-4d21-9700-9e55e30c1391.json b/data/hfopenllm_v2/nvidia/Hymba-1.5B-Base/a85d4a1f-fbd9-4d21-9700-9e55e30c1391.json deleted file mode 100644 index f9cc991e3..000000000 --- a/data/hfopenllm_v2/nvidia/Hymba-1.5B-Base/a85d4a1f-fbd9-4d21-9700-9e55e30c1391.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Hymba-1.5B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hymba-1.5B-Base", - "id": "nvidia/Hymba-1.5B-Base", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "HymbaForCausalLM", - "params_billions": 1.523 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2295 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1922 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Hymba-1.5B-Instruct/2fd1c45e-209c-43da-ae85-d60887513a96.json b/data/hfopenllm_v2/nvidia/Hymba-1.5B-Instruct/2fd1c45e-209c-43da-ae85-d60887513a96.json deleted file mode 100644 index 9057e3697..000000000 --- a/data/hfopenllm_v2/nvidia/Hymba-1.5B-Instruct/2fd1c45e-209c-43da-ae85-d60887513a96.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Hymba-1.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hymba-1.5B-Instruct", - "id": "nvidia/Hymba-1.5B-Instruct", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"HymbaForCausalLM", - "params_billions": 1.523 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6009 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3067 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.204 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Llama-3.1-Minitron-4B-Depth-Base/91e0e6aa-b933-4a02-a28d-8d69e698c60a.json b/data/hfopenllm_v2/nvidia/Llama-3.1-Minitron-4B-Depth-Base/91e0e6aa-b933-4a02-a28d-8d69e698c60a.json deleted file mode 100644 index cafbeee97..000000000 --- a/data/hfopenllm_v2/nvidia/Llama-3.1-Minitron-4B-Depth-Base/91e0e6aa-b933-4a02-a28d-8d69e698c60a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Llama-3.1-Minitron-4B-Depth-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Minitron-4B-Depth-Base", - "id": "nvidia/Llama-3.1-Minitron-4B-Depth-Base", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.02 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1607 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4011 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF/6f3f3d06-2937-4c55-9b95-a62ae5253571.json b/data/hfopenllm_v2/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF/6f3f3d06-2937-4c55-9b95-a62ae5253571.json deleted file mode 100644 index 666f668f2..000000000 --- a/data/hfopenllm_v2/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF/6f3f3d06-2937-4c55-9b95-a62ae5253571.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Llama-3.1-Nemotron-70B-Instruct-HF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Nemotron-70B-Instruct-HF", - "id": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4267 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4919 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Minitron-4B-Base/9b3ffdd3-ac18-4084-9e83-1bfc61db0ec2.json b/data/hfopenllm_v2/nvidia/Minitron-4B-Base/9b3ffdd3-ac18-4084-9e83-1bfc61db0ec2.json deleted file mode 100644 index 2e2aa65ef..000000000 --- a/data/hfopenllm_v2/nvidia/Minitron-4B-Base/9b3ffdd3-ac18-4084-9e83-1bfc61db0ec2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Minitron-4B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minitron-4B-Base", - "id": "nvidia/Minitron-4B-Base", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "NemotronForCausalLM", - "params_billions": 4.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2218 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4134 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.262 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Minitron-8B-Base/60077cbd-87af-4a00-a359-9235acb011ed.json b/data/hfopenllm_v2/nvidia/Minitron-8B-Base/60077cbd-87af-4a00-a359-9235acb011ed.json deleted file mode 100644 index e6699dedf..000000000 --- a/data/hfopenllm_v2/nvidia/Minitron-8B-Base/60077cbd-87af-4a00-a359-9235acb011ed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Minitron-8B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minitron-8B-Base", - "id": "nvidia/Minitron-8B-Base", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "NemotronForCausalLM", - "params_billions": 7.22 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH 
Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Base/577936a8-b450-4233-b633-064565b3d1a4.json b/data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Base/577936a8-b450-4233-b633-064565b3d1a4.json deleted file mode 100644 index b690a4bd6..000000000 --- a/data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Base/577936a8-b450-4233-b633-064565b3d1a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Mistral-NeMo-Minitron-8B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-NeMo-Minitron-8B-Base", - "id": "nvidia/Mistral-NeMo-Minitron-8B-Base", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.88 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1946 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3796 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Instruct/470b9413-2cc8-4bf4-9e7c-0b8e99929568.json b/data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Instruct/470b9413-2cc8-4bf4-9e7c-0b8e99929568.json deleted file mode 100644 index 964990218..000000000 --- a/data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Instruct/470b9413-2cc8-4bf4-9e7c-0b8e99929568.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Mistral-NeMo-Minitron-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-NeMo-Minitron-8B-Instruct", - "id": "nvidia/Mistral-NeMo-Minitron-8B-Instruct", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.414 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5004 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5321 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Nemotron-Mini-4B-Instruct/3cbf9c73-0dc8-402e-bc94-c6d52b9f1af7.json b/data/hfopenllm_v2/nvidia/Nemotron-Mini-4B-Instruct/3cbf9c73-0dc8-402e-bc94-c6d52b9f1af7.json deleted file mode 100644 index 95835d5c7..000000000 --- a/data/hfopenllm_v2/nvidia/Nemotron-Mini-4B-Instruct/3cbf9c73-0dc8-402e-bc94-c6d52b9f1af7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Nemotron-Mini-4B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemotron-Mini-4B-Instruct", - "id": "nvidia/Nemotron-Mini-4B-Instruct", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "NemotronForCausalLM", - "params_billions": 4.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6669 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/OpenMath2-Llama3.1-8B/3fccb1d0-5ae1-427a-adae-37004ecbacaa.json b/data/hfopenllm_v2/nvidia/OpenMath2-Llama3.1-8B/3fccb1d0-5ae1-427a-adae-37004ecbacaa.json deleted file mode 100644 index 2fec4ab85..000000000 --- a/data/hfopenllm_v2/nvidia/OpenMath2-Llama3.1-8B/3fccb1d0-5ae1-427a-adae-37004ecbacaa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_OpenMath2-Llama3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenMath2-Llama3.1-8B", - "id": "nvidia/OpenMath2-Llama3.1-8B", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2674 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1553 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nxmwxm/Beast-Soul-new/6463183f-4043-4b96-b4d1-0bd41b4d6876.json b/data/hfopenllm_v2/nxmwxm/Beast-Soul-new/6463183f-4043-4b96-b4d1-0bd41b4d6876.json deleted file mode 100644 index 63096f4f3..000000000 --- a/data/hfopenllm_v2/nxmwxm/Beast-Soul-new/6463183f-4043-4b96-b4d1-0bd41b4d6876.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nxmwxm_Beast-Soul-new/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Beast-Soul-new", - "id": "nxmwxm/Beast-Soul-new", - "developer": "nxmwxm", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4869 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5227 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3102 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/occiglot/occiglot-7b-es-en-instruct/0b102423-1a06-4e5b-a287-710695658b63.json b/data/hfopenllm_v2/occiglot/occiglot-7b-es-en-instruct/0b102423-1a06-4e5b-a287-710695658b63.json deleted file mode 100644 index 14b07b1d6..000000000 --- a/data/hfopenllm_v2/occiglot/occiglot-7b-es-en-instruct/0b102423-1a06-4e5b-a287-710695658b63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/occiglot_occiglot-7b-es-en-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "occiglot-7b-es-en-instruct", - "id": "occiglot/occiglot-7b-es-en-instruct", - "developer": "occiglot", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3485 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/odyssey-labs/Astral-1-10B/b7e4ffd8-2a5a-4364-844a-a308dd7c899c.json b/data/hfopenllm_v2/odyssey-labs/Astral-1-10B/b7e4ffd8-2a5a-4364-844a-a308dd7c899c.json deleted file mode 100644 index 81a00bf26..000000000 --- a/data/hfopenllm_v2/odyssey-labs/Astral-1-10B/b7e4ffd8-2a5a-4364-844a-a308dd7c899c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/odyssey-labs_Astral-1-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Astral-1-10B", - "id": "odyssey-labs/Astral-1-10B", - "developer": "odyssey-labs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3878 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4873 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2985 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/olabs-ai/reflection_model/3fa2e3ef-a375-4ca5-9f85-7cb986313d53.json 
b/data/hfopenllm_v2/olabs-ai/reflection_model/3fa2e3ef-a375-4ca5-9f85-7cb986313d53.json deleted file mode 100644 index 619a40e89..000000000 --- a/data/hfopenllm_v2/olabs-ai/reflection_model/3fa2e3ef-a375-4ca5-9f85-7cb986313d53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/olabs-ai_reflection_model/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "reflection_model", - "id": "olabs-ai/reflection_model", - "developer": "olabs-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 9.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4713 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3508 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/Llama_3.2_1b-autoredteam_helpfulness-train/abd48d9d-0443-40be-a23a-68922771e14f.json b/data/hfopenllm_v2/ontocord/Llama_3.2_1b-autoredteam_helpfulness-train/abd48d9d-0443-40be-a23a-68922771e14f.json deleted file mode 100644 index 03e54e74a..000000000 --- 
a/data/hfopenllm_v2/ontocord/Llama_3.2_1b-autoredteam_helpfulness-train/abd48d9d-0443-40be-a23a-68922771e14f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_Llama_3.2_1b-autoredteam_helpfulness-train/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b-autoredteam_helpfulness-train", - "id": "ontocord/Llama_3.2_1b-autoredteam_helpfulness-train", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2765 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3115 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam-Harmless-only/436ff0a4-9907-4e56-a5f2-c97f1b13f81a.json b/data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam-Harmless-only/436ff0a4-9907-4e56-a5f2-c97f1b13f81a.json deleted file mode 100644 index 070fd99d6..000000000 --- a/data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam-Harmless-only/436ff0a4-9907-4e56-a5f2-c97f1b13f81a.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_RedPajama-3B-v1-AutoRedteam-Harmless-only/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-3B-v1-AutoRedteam-Harmless-only", - "id": "ontocord/RedPajama-3B-v1-AutoRedteam-Harmless-only", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 2.776 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1525 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2315 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.11 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam/7a654100-b206-4011-828e-fb386df27d0c.json b/data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam/7a654100-b206-4011-828e-fb386df27d0c.json deleted file mode 100644 index 20317204f..000000000 --- a/data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam/7a654100-b206-4011-828e-fb386df27d0c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_RedPajama-3B-v1-AutoRedteam/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-3B-v1-AutoRedteam", - "id": "ontocord/RedPajama-3B-v1-AutoRedteam", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 2.776 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1343 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/RedPajama3b_v1-autoredteam_helpfulness-train/2f0e262c-a099-41f4-89f1-8b251708a960.json b/data/hfopenllm_v2/ontocord/RedPajama3b_v1-autoredteam_helpfulness-train/2f0e262c-a099-41f4-89f1-8b251708a960.json deleted file mode 100644 index b83d49b55..000000000 --- a/data/hfopenllm_v2/ontocord/RedPajama3b_v1-autoredteam_helpfulness-train/2f0e262c-a099-41f4-89f1-8b251708a960.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_RedPajama3b_v1-autoredteam_helpfulness-train/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", 
- "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama3b_v1-autoredteam_helpfulness-train", - "id": "ontocord/RedPajama3b_v1-autoredteam_helpfulness-train", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 2.776 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2848 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1107 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8-stack_2x/7bf3e9ca-7d6f-4d43-b8fe-aceb8d60c7c6.json b/data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8-stack_2x/7bf3e9ca-7d6f-4d43-b8fe-aceb8d60c7c6.json deleted file mode 100644 index 14485961f..000000000 --- a/data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8-stack_2x/7bf3e9ca-7d6f-4d43-b8fe-aceb8d60c7c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_merged_0.2_expert_0.8-stack_2x/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merged_0.2_expert_0.8-stack_2x", - 
"id": "ontocord/merged_0.2_expert_0.8-stack_2x", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 6.512 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3006 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8/8703dbdd-12ef-457b-8cda-f570c8f5c890.json b/data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8/8703dbdd-12ef-457b-8cda-f570c8f5c890.json deleted file mode 100644 index 4f9707321..000000000 --- a/data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8/8703dbdd-12ef-457b-8cda-f570c8f5c890.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_merged_0.2_expert_0.8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merged_0.2_expert_0.8", - "id": "ontocord/merged_0.2_expert_0.8", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1743 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3046 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/merged_0.5_expert_0.5/d77f3e8f-1eea-478e-babd-ba873d2d427c.json b/data/hfopenllm_v2/ontocord/merged_0.5_expert_0.5/d77f3e8f-1eea-478e-babd-ba873d2d427c.json deleted file mode 100644 index 3c97d6dc5..000000000 --- a/data/hfopenllm_v2/ontocord/merged_0.5_expert_0.5/d77f3e8f-1eea-478e-babd-ba873d2d427c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_merged_0.5_expert_0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merged_0.5_expert_0.5", - "id": "ontocord/merged_0.5_expert_0.5", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful/783a4385-c802-4bb3-9a21-90629d16efc7.json b/data/hfopenllm_v2/ontocord/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful/783a4385-c802-4bb3-9a21-90629d16efc7.json deleted file mode 100644 index b166ddfd7..000000000 --- a/data/hfopenllm_v2/ontocord/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful/783a4385-c802-4bb3-9a21-90629d16efc7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful", - "id": "ontocord/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1318 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1-instruct/bb4ff51e-ce3a-42f5-871e-3e5e8977bc42.json b/data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1-instruct/bb4ff51e-ce3a-42f5-871e-3e5e8977bc42.json deleted file mode 100644 index 3d829740d..000000000 --- a/data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1-instruct/bb4ff51e-ce3a-42f5-871e-3e5e8977bc42.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_ontocord_wide_7b-stacked-stage1-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ontocord_wide_7b-stacked-stage1-instruct", - "id": "ontocord/ontocord_wide_7b-stacked-stage1-instruct", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.888 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.153 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2854 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1/e80d25b5-3f4b-45a7-9472-09f98db03bf0.json b/data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1/e80d25b5-3f4b-45a7-9472-09f98db03bf0.json deleted file mode 100644 index 0a16462de..000000000 --- a/data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1/e80d25b5-3f4b-45a7-9472-09f98db03bf0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_ontocord_wide_7b-stacked-stage1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ontocord_wide_7b-stacked-stage1", - "id": "ontocord/ontocord_wide_7b-stacked-stage1", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.888 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.1485 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2897 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/starcoder2-29b-ls/7fed0b1d-0d79-4784-8fd6-42f8611b1751.json b/data/hfopenllm_v2/ontocord/starcoder2-29b-ls/7fed0b1d-0d79-4784-8fd6-42f8611b1751.json deleted file mode 100644 index 3cb91cf0f..000000000 --- a/data/hfopenllm_v2/ontocord/starcoder2-29b-ls/7fed0b1d-0d79-4784-8fd6-42f8611b1751.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_starcoder2-29b-ls/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "starcoder2-29b-ls", - "id": "ontocord/starcoder2-29b-ls", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Starcoder2ForCausalLM", - "params_billions": 29.009 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2149 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3735 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1869 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/starcoder2_3b-AutoRedteam/be534cd3-8245-4370-ba6c-9687b431ee8d.json b/data/hfopenllm_v2/ontocord/starcoder2_3b-AutoRedteam/be534cd3-8245-4370-ba6c-9687b431ee8d.json deleted file mode 100644 index ad59c53df..000000000 --- a/data/hfopenllm_v2/ontocord/starcoder2_3b-AutoRedteam/be534cd3-8245-4370-ba6c-9687b431ee8d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_starcoder2_3b-AutoRedteam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "starcoder2_3b-AutoRedteam", - "id": "ontocord/starcoder2_3b-AutoRedteam", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Starcoder2ForCausalLM", - "params_billions": 3.181 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1574 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3646 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1336 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b-merge_test/e98967b7-3aff-4baa-92eb-eff86bf09797.json b/data/hfopenllm_v2/ontocord/wide_3b-merge_test/e98967b7-3aff-4baa-92eb-eff86bf09797.json deleted file mode 100644 index e0c7c037c..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b-merge_test/e98967b7-3aff-4baa-92eb-eff86bf09797.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b-merge_test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b-merge_test", - "id": "ontocord/wide_3b-merge_test", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3011 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1066 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b-stage1_shuf_sample1_jsonl-pretrained/8736a22a-f980-4a01-953d-217f27050129.json b/data/hfopenllm_v2/ontocord/wide_3b-stage1_shuf_sample1_jsonl-pretrained/8736a22a-f980-4a01-953d-217f27050129.json deleted file mode 100644 index abe0c08b1..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b-stage1_shuf_sample1_jsonl-pretrained/8736a22a-f980-4a01-953d-217f27050129.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b-stage1_shuf_sample1_jsonl-pretrained", - "id": "ontocord/wide_3b-stage1_shuf_sample1_jsonl-pretrained", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3632 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge/75a2b5c9-7c73-4bb4-8e99-af4a3a27589d.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge/75a2b5c9-7c73-4bb4-8e99-af4a3a27589d.json deleted file mode 100644 index 129aad1c1..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge/75a2b5c9-7c73-4bb4-8e99-af4a3a27589d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge", - "id": "ontocord/wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1664 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3031 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3845 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge/0e0ebdc7-a5bd-4314-9bd7-fc8a11541a4e.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge/0e0ebdc7-a5bd-4314-9bd7-fc8a11541a4e.json deleted file mode 100644 index 2b49af2c2..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge/0e0ebdc7-a5bd-4314-9bd7-fc8a11541a4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge", - "id": "ontocord/wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2975 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue/f8579305-003b-4727-b904-bad4f363a616.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue/f8579305-003b-4727-b904-bad4f363a616.json deleted file mode 100644 index b29b83734..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue/f8579305-003b-4727-b904-bad4f363a616.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3095 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue/3103f36a-4a88-4a39-8261-0b597f8d6db4.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue/3103f36a-4a88-4a39-8261-0b597f8d6db4.json deleted file mode 100644 index e2c5d63f0..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue/3103f36a-4a88-4a39-8261-0b597f8d6db4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1237 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, 
- { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue/eda9de3b-ae53-4102-b203-eddadbc50464.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue/eda9de3b-ae53-4102-b203-eddadbc50464.json deleted file mode 100644 index d30f5ad9b..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue/eda9de3b-ae53-4102-b203-eddadbc50464.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1192 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2956 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3553 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1183 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/b7de4fa8-d97d-400f-bc3f-ecb1963a03ed.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/b7de4fa8-d97d-400f-bc3f-ecb1963a03ed.json deleted file mode 100644 index 9fef62a26..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/b7de4fa8-d97d-400f-bc3f-ecb1963a03ed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - 
{ - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/fa6ecaf9-457e-4135-ad25-4790ebc27737.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/fa6ecaf9-457e-4135-ad25-4790ebc27737.json deleted file mode 100644 index 053d679ba..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/fa6ecaf9-457e-4135-ad25-4790ebc27737.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3184 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue/ebaa99c4-ff66-421d-8ba7-dae2c5fa274c.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue/ebaa99c4-ff66-421d-8ba7-dae2c5fa274c.json deleted file mode 100644 index 4832c4eb6..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue/ebaa99c4-ff66-421d-8ba7-dae2c5fa274c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1317 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue/e388c707-8b35-49a4-94eb-f32e983fe33e.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue/e388c707-8b35-49a4-94eb-f32e983fe33e.json deleted file mode 100644 index b73dce637..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue/e388c707-8b35-49a4-94eb-f32e983fe33e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1182 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3567 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1162 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue/f6273192-31cf-4ee1-af45-c2f62de05330.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue/f6273192-31cf-4ee1-af45-c2f62de05330.json deleted file mode 100644 index f93ee1cb2..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue/f6273192-31cf-4ee1-af45-c2f62de05330.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_math.no_issue/105650e6-d9cf-4106-9d55-6f3c08f2f1cf.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_math.no_issue/105650e6-d9cf-4106-9d55-6f3c08f2f1cf.json deleted file mode 100644 index 635e7e0ec..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_math.no_issue/105650e6-d9cf-4106-9d55-6f3c08f2f1cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_math.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_math.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_math.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1298 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3052 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue/a1d23749-40c0-4ccb-a104-bf0de63bc2bd.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue/a1d23749-40c0-4ccb-a104-bf0de63bc2bd.json deleted file mode 100644 index 22c12a778..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue/a1d23749-40c0-4ccb-a104-bf0de63bc2bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2049 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical/4e4b4cf9-48d5-4ff6-92c0-1e9d7b874b6b.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical/4e4b4cf9-48d5-4ff6-92c0-1e9d7b874b6b.json deleted file mode 100644 index b7dea3b9d..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical/4e4b4cf9-48d5-4ff6-92c0-1e9d7b874b6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical", - "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1461 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2998 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1141 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_formatted_text/3c4713a3-3973-4a04-9c4a-a6782251734e.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_formatted_text/3c4713a3-3973-4a04-9c4a-a6782251734e.json deleted file mode 100644 index 7106f073b..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_formatted_text/3c4713a3-3973-4a04-9c4a-a6782251734e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.2-ss1-expert_formatted_text/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.2-ss1-expert_formatted_text", - "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_formatted_text", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3069 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1146 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_how-to/de70c700-a007-4e87-a3db-941ee285eb1f.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_how-to/de70c700-a007-4e87-a3db-941ee285eb1f.json deleted file mode 100644 index 0860e6dcd..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_how-to/de70c700-a007-4e87-a3db-941ee285eb1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.2-ss1-expert_how-to/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.2-ss1-expert_how-to", - "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_how-to", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1153 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_math/a1324a7f-1911-4fa9-8d83-be891f752a61.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_math/a1324a7f-1911-4fa9-8d83-be891f752a61.json deleted file mode 100644 index ffb80060b..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_math/a1324a7f-1911-4fa9-8d83-be891f752a61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.2-ss1-expert_math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.2-ss1-expert_math", - "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_math", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1915 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", 
- "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1092 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_news/9c4af0df-f538-4755-8cd0-eec6b2b26524.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_news/9c4af0df-f538-4755-8cd0-eec6b2b26524.json deleted file mode 100644 index 1ef6d19ad..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_news/9c4af0df-f538-4755-8cd0-eec6b2b26524.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.2-ss1-expert_news/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.2-ss1-expert_news", - "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_news", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1658 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2926 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_software/fde650a6-a5d1-4edc-bd64-8be806663263.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_software/fde650a6-a5d1-4edc-bd64-8be806663263.json deleted file mode 100644 index 511c00854..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_software/fde650a6-a5d1-4edc-bd64-8be806663263.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.2-ss1-expert_software/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.2-ss1-expert_software", - "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_software", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1734 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3569 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked/96dd1a08-b166-4d8e-ac31-5e948adf931b.json b/data/hfopenllm_v2/ontocord/wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked/96dd1a08-b166-4d8e-ac31-5e948adf931b.json deleted file mode 100644 index a1a9a9d89..000000000 --- a/data/hfopenllm_v2/ontocord/wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked/96dd1a08-b166-4d8e-ac31-5e948adf931b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked", - "id": "ontocord/wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.888 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1244 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oobabooga/CodeBooga-34B-v0.1/3b90b9db-a68e-4ee9-bd4d-a18cec357753.json b/data/hfopenllm_v2/oobabooga/CodeBooga-34B-v0.1/3b90b9db-a68e-4ee9-bd4d-a18cec357753.json deleted file mode 100644 index e386ca80a..000000000 --- a/data/hfopenllm_v2/oobabooga/CodeBooga-34B-v0.1/3b90b9db-a68e-4ee9-bd4d-a18cec357753.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oobabooga_CodeBooga-34B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CodeBooga-34B-v0.1", - "id": "oobabooga/CodeBooga-34B-v0.1", - "developer": "oobabooga", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 33.744 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3427 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { 
- "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.236 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/Llama-FinSent-S/444a6ace-77d4-4d93-b80b-ff5c7e2f6888.json b/data/hfopenllm_v2/oopere/Llama-FinSent-S/444a6ace-77d4-4d93-b80b-ff5c7e2f6888.json deleted file mode 100644 index 466bcc8a4..000000000 --- a/data/hfopenllm_v2/oopere/Llama-FinSent-S/444a6ace-77d4-4d93-b80b-ff5c7e2f6888.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_Llama-FinSent-S/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-FinSent-S", - "id": "oopere/Llama-FinSent-S", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.914 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3832 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/Llama-FinSent-S/7e11a778-fccf-4a91-81cf-c06f1a5c77c4.json b/data/hfopenllm_v2/oopere/Llama-FinSent-S/7e11a778-fccf-4a91-81cf-c06f1a5c77c4.json deleted file mode 100644 index d92a91932..000000000 --- a/data/hfopenllm_v2/oopere/Llama-FinSent-S/7e11a778-fccf-4a91-81cf-c06f1a5c77c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_Llama-FinSent-S/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-FinSent-S", - "id": "oopere/Llama-FinSent-S", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.914 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2164 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3169 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3832 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned10-llama-3.2-3B/e5d126d7-e0bf-43dc-95c0-184ea1d586ea.json 
b/data/hfopenllm_v2/oopere/pruned10-llama-3.2-3B/e5d126d7-e0bf-43dc-95c0-184ea1d586ea.json deleted file mode 100644 index afabeb373..000000000 --- a/data/hfopenllm_v2/oopere/pruned10-llama-3.2-3B/e5d126d7-e0bf-43dc-95c0-184ea1d586ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned10-llama-3.2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned10-llama-3.2-3B", - "id": "oopere/pruned10-llama-3.2-3B", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.001 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1776 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.334 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3722 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned20-llama-1b/d05b129c-6b9e-4e6b-80fc-af65db620c5d.json b/data/hfopenllm_v2/oopere/pruned20-llama-1b/d05b129c-6b9e-4e6b-80fc-af65db620c5d.json deleted file mode 100644 index d15232953..000000000 --- a/data/hfopenllm_v2/oopere/pruned20-llama-1b/d05b129c-6b9e-4e6b-80fc-af65db620c5d.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned20-llama-1b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned20-llama-1b", - "id": "oopere/pruned20-llama-1b", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.075 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1994 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3031 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned20-llama-3.2-3b/d9792fac-29c1-45b2-b649-cdebb6830e2f.json b/data/hfopenllm_v2/oopere/pruned20-llama-3.2-3b/d9792fac-29c1-45b2-b649-cdebb6830e2f.json deleted file mode 100644 index bc86d9513..000000000 --- a/data/hfopenllm_v2/oopere/pruned20-llama-3.2-3b/d9792fac-29c1-45b2-b649-cdebb6830e2f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned20-llama-3.2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned20-llama-3.2-3b", - "id": "oopere/pruned20-llama-3.2-3b", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.79 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1789 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned40-llama-1b/fcc2f06a-e6c8-4c28-bf22-4ee582392912.json b/data/hfopenllm_v2/oopere/pruned40-llama-1b/fcc2f06a-e6c8-4c28-bf22-4ee582392912.json deleted file mode 100644 index 2afb1508c..000000000 --- a/data/hfopenllm_v2/oopere/pruned40-llama-1b/fcc2f06a-e6c8-4c28-bf22-4ee582392912.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned40-llama-1b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned40-llama-1b", - "id": "oopere/pruned40-llama-1b", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.914 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2284 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1082 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned40-llama-3.2-1B/c6e13327-90b3-440d-9367-dbcec54dd6cc.json b/data/hfopenllm_v2/oopere/pruned40-llama-3.2-1B/c6e13327-90b3-440d-9367-dbcec54dd6cc.json deleted file mode 100644 index ee1ba1f96..000000000 --- a/data/hfopenllm_v2/oopere/pruned40-llama-3.2-1B/c6e13327-90b3-440d-9367-dbcec54dd6cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned40-llama-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned40-llama-3.2-1B", - "id": "oopere/pruned40-llama-3.2-1B", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.914 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2982 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4352 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned40-llama-3.2-3b/30b02429-350c-4d86-aded-ba8597bec4d5.json b/data/hfopenllm_v2/oopere/pruned40-llama-3.2-3b/30b02429-350c-4d86-aded-ba8597bec4d5.json deleted file mode 100644 index 343aa660b..000000000 --- a/data/hfopenllm_v2/oopere/pruned40-llama-3.2-3b/30b02429-350c-4d86-aded-ba8597bec4d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned40-llama-3.2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned40-llama-3.2-3b", - "id": "oopere/pruned40-llama-3.2-3b", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.367 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2299 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3539 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1177 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned60-llama-1b/7d1ee802-106e-4313-ba1d-72d5a0676c88.json b/data/hfopenllm_v2/oopere/pruned60-llama-1b/7d1ee802-106e-4313-ba1d-72d5a0676c88.json deleted file mode 100644 index 116617b5f..000000000 --- a/data/hfopenllm_v2/oopere/pruned60-llama-1b/7d1ee802-106e-4313-ba1d-72d5a0676c88.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned60-llama-1b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned60-llama-1b", - "id": "oopere/pruned60-llama-1b", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.753 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1829 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3016 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4088 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned60-llama-3.2-3b/1b3af020-f65e-44b8-a9a2-ad60fa686427.json b/data/hfopenllm_v2/oopere/pruned60-llama-3.2-3b/1b3af020-f65e-44b8-a9a2-ad60fa686427.json deleted file mode 100644 index 321e3d3d4..000000000 --- a/data/hfopenllm_v2/oopere/pruned60-llama-3.2-3b/1b3af020-f65e-44b8-a9a2-ad60fa686427.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned60-llama-3.2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned60-llama-3.2-3b", - "id": "oopere/pruned60-llama-3.2-3b", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.944 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3166 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - 
}, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/open-atlas/Atlas-Flash-1.5B-Preview/6e40871d-bc23-4f1c-a005-f5b8eb096f84.json b/data/hfopenllm_v2/open-atlas/Atlas-Flash-1.5B-Preview/6e40871d-bc23-4f1c-a005-f5b8eb096f84.json deleted file mode 100644 index 2ba09d639..000000000 --- a/data/hfopenllm_v2/open-atlas/Atlas-Flash-1.5B-Preview/6e40871d-bc23-4f1c-a005-f5b8eb096f84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/open-atlas_Atlas-Flash-1.5B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Atlas-Flash-1.5B-Preview", - "id": "open-atlas/Atlas-Flash-1.5B-Preview", - "developer": "open-atlas", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/open-atlas/Atlas-Flash-7B-Preview/1ab33ed2-ea3b-4c6f-a2ac-2465ddd844f4.json b/data/hfopenllm_v2/open-atlas/Atlas-Flash-7B-Preview/1ab33ed2-ea3b-4c6f-a2ac-2465ddd844f4.json deleted file mode 100644 index 5c4b96098..000000000 --- a/data/hfopenllm_v2/open-atlas/Atlas-Flash-7B-Preview/1ab33ed2-ea3b-4c6f-a2ac-2465ddd844f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/open-atlas_Atlas-Flash-7B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Atlas-Flash-7B-Preview", - "id": "open-atlas/Atlas-Flash-7B-Preview", - "developer": "open-atlas", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3908 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3836 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2784 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/open-neo/Kyro-n1-3B/ec601f5d-bf19-4407-ac41-6b9272d94735.json b/data/hfopenllm_v2/open-neo/Kyro-n1-3B/ec601f5d-bf19-4407-ac41-6b9272d94735.json deleted file mode 100644 index eab4e7f5e..000000000 --- a/data/hfopenllm_v2/open-neo/Kyro-n1-3B/ec601f5d-bf19-4407-ac41-6b9272d94735.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/open-neo_Kyro-n1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kyro-n1-3B", - "id": "open-neo/Kyro-n1-3B", - "developer": "open-neo", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4685 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2855 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4088 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/open-neo/Kyro-n1-7B/87e53761-e8b7-4032-ae7a-c3a91704d115.json b/data/hfopenllm_v2/open-neo/Kyro-n1-7B/87e53761-e8b7-4032-ae7a-c3a91704d115.json deleted file mode 100644 index 7c4002489..000000000 --- a/data/hfopenllm_v2/open-neo/Kyro-n1-7B/87e53761-e8b7-4032-ae7a-c3a91704d115.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/open-neo_Kyro-n1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kyro-n1-7B", - "id": "open-neo/Kyro-n1-7B", - "developer": "open-neo", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5573 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5387 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3897 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4333 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/open-thoughts/OpenThinker-7B/59492d86-4b85-4865-84e9-84ab4ace630c.json b/data/hfopenllm_v2/open-thoughts/OpenThinker-7B/59492d86-4b85-4865-84e9-84ab4ace630c.json deleted file mode 100644 index b02b2a84c..000000000 --- a/data/hfopenllm_v2/open-thoughts/OpenThinker-7B/59492d86-4b85-4865-84e9-84ab4ace630c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/open-thoughts_OpenThinker-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenThinker-7B", - "id": "open-thoughts/OpenThinker-7B", - "developer": "open-thoughts", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4089 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.4165 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openai-community/gpt2-large/cc082df2-259c-44c1-abe4-ef349056a2a9.json b/data/hfopenllm_v2/openai-community/gpt2-large/cc082df2-259c-44c1-abe4-ef349056a2a9.json deleted file mode 100644 index f23bbd9c9..000000000 --- a/data/hfopenllm_v2/openai-community/gpt2-large/cc082df2-259c-44c1-abe4-ef349056a2a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openai-community_gpt2-large/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt2-large", - "id": "openai-community/gpt2-large", - "developer": "openai-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.812 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2048 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3069 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openai-community/gpt2-medium/3f069053-b24e-4242-9302-d46b82e511aa.json 
b/data/hfopenllm_v2/openai-community/gpt2-medium/3f069053-b24e-4242-9302-d46b82e511aa.json deleted file mode 100644 index 3800ef51a..000000000 --- a/data/hfopenllm_v2/openai-community/gpt2-medium/3f069053-b24e-4242-9302-d46b82e511aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openai-community_gpt2-medium/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt2-medium", - "id": "openai-community/gpt2-medium", - "developer": "openai-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.38 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1182 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openai-community/gpt2-xl/62cd9bcb-a74c-40b9-be84-a0077235ae3c.json b/data/hfopenllm_v2/openai-community/gpt2-xl/62cd9bcb-a74c-40b9-be84-a0077235ae3c.json deleted file mode 100644 index 4094508d3..000000000 --- a/data/hfopenllm_v2/openai-community/gpt2-xl/62cd9bcb-a74c-40b9-be84-a0077235ae3c.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openai-community_gpt2-xl/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt2-xl", - "id": "openai-community/gpt2-xl", - "developer": "openai-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 1.608 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2039 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3009 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openai-community/gpt2/b4cd25f1-87d5-4173-a4d3-928444f6cb37.json b/data/hfopenllm_v2/openai-community/gpt2/b4cd25f1-87d5-4173-a4d3-928444f6cb37.json deleted file mode 100644 index 172402db8..000000000 --- a/data/hfopenllm_v2/openai-community/gpt2/b4cd25f1-87d5-4173-a4d3-928444f6cb37.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openai-community_gpt2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt2", - "id": "openai-community/gpt2", - "developer": "openai-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.137 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1793 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3036 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4471 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1159 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openai-community/gpt2/ddd4716e-d8ae-46a1-8fb4-c27e2da40e6e.json b/data/hfopenllm_v2/openai-community/gpt2/ddd4716e-d8ae-46a1-8fb4-c27e2da40e6e.json deleted file mode 100644 index 040f869b3..000000000 --- a/data/hfopenllm_v2/openai-community/gpt2/ddd4716e-d8ae-46a1-8fb4-c27e2da40e6e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openai-community_gpt2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt2", - "id": "openai-community/gpt2", - "developer": "openai-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPT2LMHeadModel", - 
"params_billions": 0.137 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.178 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1165 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openbmb/MiniCPM-S-1B-sft-llama-format/1e5b62a3-018b-429a-b2b4-325545ee99dc.json b/data/hfopenllm_v2/openbmb/MiniCPM-S-1B-sft-llama-format/1e5b62a3-018b-429a-b2b4-325545ee99dc.json deleted file mode 100644 index 404c67964..000000000 --- a/data/hfopenllm_v2/openbmb/MiniCPM-S-1B-sft-llama-format/1e5b62a3-018b-429a-b2b4-325545ee99dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openbmb_MiniCPM-S-1B-sft-llama-format/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniCPM-S-1B-sft-llama-format", - "id": "openbmb/MiniCPM-S-1B-sft-llama-format", - "developer": "openbmb", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3329 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3049 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openchat/openchat-3.5-0106/958d410e-ce43-44c0-8a56-685c0a618408.json b/data/hfopenllm_v2/openchat/openchat-3.5-0106/958d410e-ce43-44c0-8a56-685c0a618408.json deleted file mode 100644 index c3931b7fa..000000000 --- a/data/hfopenllm_v2/openchat/openchat-3.5-0106/958d410e-ce43-44c0-8a56-685c0a618408.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openchat_openchat-3.5-0106/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openchat-3.5-0106", - "id": "openchat/openchat-3.5-0106", - "developer": "openchat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5967 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4617 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3291 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openchat/openchat-3.5-1210/57c53f20-aa32-49fd-926a-f26c9d0759d4.json b/data/hfopenllm_v2/openchat/openchat-3.5-1210/57c53f20-aa32-49fd-926a-f26c9d0759d4.json deleted file mode 100644 index 2d6ea71b7..000000000 --- a/data/hfopenllm_v2/openchat/openchat-3.5-1210/57c53f20-aa32-49fd-926a-f26c9d0759d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openchat_openchat-3.5-1210/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openchat-3.5-1210", - "id": "openchat/openchat-3.5-1210", - "developer": "openchat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6037 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4535 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4414 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openchat/openchat-3.6-8b-20240522/76def522-6fe1-458f-bfbf-99b50ece3367.json b/data/hfopenllm_v2/openchat/openchat-3.6-8b-20240522/76def522-6fe1-458f-bfbf-99b50ece3367.json deleted file mode 100644 index 162c9b8e8..000000000 --- a/data/hfopenllm_v2/openchat/openchat-3.6-8b-20240522/76def522-6fe1-458f-bfbf-99b50ece3367.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openchat_openchat-3.6-8b-20240522/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openchat-3.6-8b-20240522", - "id": "openchat/openchat-3.6-8b-20240522", - "developer": "openchat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5338 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3999 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3229 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openchat/openchat_3.5/c467bc88-6769-48ac-abd4-867ee38bbe57.json b/data/hfopenllm_v2/openchat/openchat_3.5/c467bc88-6769-48ac-abd4-867ee38bbe57.json deleted file mode 100644 index 652c57fc3..000000000 --- a/data/hfopenllm_v2/openchat/openchat_3.5/c467bc88-6769-48ac-abd4-867ee38bbe57.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openchat_openchat_3.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openchat_3.5", - "id": "openchat/openchat_3.5", - "developer": "openchat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5931 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4426 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4229 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3153 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openchat/openchat_v3.2/801681eb-66f4-46e0-bb2b-7ba4b46679af.json b/data/hfopenllm_v2/openchat/openchat_v3.2/801681eb-66f4-46e0-bb2b-7ba4b46679af.json deleted file mode 100644 index 781672e47..000000000 --- a/data/hfopenllm_v2/openchat/openchat_v3.2/801681eb-66f4-46e0-bb2b-7ba4b46679af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openchat_openchat_v3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openchat_v3.2", - "id": "openchat/openchat_v3.2", - "developer": "openchat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2981 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4331 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4336 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2422 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openchat/openchat_v3.2_super/cdd0ea1c-b17a-4816-953c-1d7164c64114.json b/data/hfopenllm_v2/openchat/openchat_v3.2_super/cdd0ea1c-b17a-4816-953c-1d7164c64114.json deleted file mode 100644 index a8f2b40b0..000000000 --- a/data/hfopenllm_v2/openchat/openchat_v3.2_super/cdd0ea1c-b17a-4816-953c-1d7164c64114.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openchat_openchat_v3.2_super/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openchat_v3.2_super", - "id": "openchat/openchat_v3.2_super", - "developer": "openchat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2862 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4221 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2425 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/orai-nlp/Llama-eus-8B/b2060893-1f7d-4e7a-a458-3623147ac118.json b/data/hfopenllm_v2/orai-nlp/Llama-eus-8B/b2060893-1f7d-4e7a-a458-3623147ac118.json deleted file mode 100644 index 5964aa7f2..000000000 --- a/data/hfopenllm_v2/orai-nlp/Llama-eus-8B/b2060893-1f7d-4e7a-a458-3623147ac118.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/orai-nlp_Llama-eus-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-eus-8B", - "id": "orai-nlp/Llama-eus-8B", - "developer": "orai-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2161 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4418 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3919 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3058 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oxyapi/oxy-1-small/cf8aac35-679a-4ebb-bca8-6e0f2d42e71b.json b/data/hfopenllm_v2/oxyapi/oxy-1-small/cf8aac35-679a-4ebb-bca8-6e0f2d42e71b.json deleted file mode 100644 index eae996b8a..000000000 --- a/data/hfopenllm_v2/oxyapi/oxy-1-small/cf8aac35-679a-4ebb-bca8-6e0f2d42e71b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oxyapi_oxy-1-small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "oxy-1-small", - "id": "oxyapi/oxy-1-small", - "developer": "oxyapi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5001 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/ozone-ai/0x-lite/34bfe887-5a3a-4626-997e-c35d3a0ec341.json b/data/hfopenllm_v2/ozone-ai/0x-lite/34bfe887-5a3a-4626-997e-c35d3a0ec341.json deleted file mode 100644 index b6787efe4..000000000 --- a/data/hfopenllm_v2/ozone-ai/0x-lite/34bfe887-5a3a-4626-997e-c35d3a0ec341.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ozone-ai_0x-lite/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "0x-lite", - "id": "ozone-ai/0x-lite", - "developer": "ozone-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.774 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6341 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4221 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ozone-research/Chirp-01/b81acc47-6fd5-4f89-8c70-f8f14b677e04.json b/data/hfopenllm_v2/ozone-research/Chirp-01/b81acc47-6fd5-4f89-8c70-f8f14b677e04.json deleted file mode 100644 index 92d7d8fbf..000000000 --- a/data/hfopenllm_v2/ozone-research/Chirp-01/b81acc47-6fd5-4f89-8c70-f8f14b677e04.json +++ /dev/null 
@@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ozone-research_Chirp-01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chirp-01", - "id": "ozone-research/Chirp-01", - "developer": "ozone-research", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3467 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3508 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V1/30b977a8-7882-49be-8621-9ee3fce270ec.json b/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V1/30b977a8-7882-49be-8621-9ee3fce270ec.json deleted file mode 100644 index fa0ba291d..000000000 --- a/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V1/30b977a8-7882-49be-8621-9ee3fce270ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/paloalma_ECE-TW3-JRGL-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-TW3-JRGL-V1", - "id": "paloalma/ECE-TW3-JRGL-V1", - "developer": "paloalma", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 68.977 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5535 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4221 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V2/3367fd79-713c-4691-80cd-4abb6b2818ef.json b/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V2/3367fd79-713c-4691-80cd-4abb6b2818ef.json deleted file mode 100644 index 503c32c60..000000000 --- a/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V2/3367fd79-713c-4691-80cd-4abb6b2818ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/paloalma_ECE-TW3-JRGL-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-TW3-JRGL-V2", - "id": "paloalma/ECE-TW3-JRGL-V2", - "developer": "paloalma", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.288 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2255 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6031 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.185 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4588 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V5/add899b8-f3e6-4d87-8846-8254f4dfbd5f.json b/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V5/add899b8-f3e6-4d87-8846-8254f4dfbd5f.json deleted file mode 100644 index 9ec62858b..000000000 --- a/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V5/add899b8-f3e6-4d87-8846-8254f4dfbd5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/paloalma_ECE-TW3-JRGL-V5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-TW3-JRGL-V5", - "id": "paloalma/ECE-TW3-JRGL-V5", - "developer": "paloalma", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 72.289 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4553 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6025 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1835 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4648 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/paloalma/Le_Triomphant-ECE-TW3/53829ec0-f233-4b61-a672-6a467823caaa.json b/data/hfopenllm_v2/paloalma/Le_Triomphant-ECE-TW3/53829ec0-f233-4b61-a672-6a467823caaa.json deleted file mode 100644 index bd04d24ff..000000000 --- a/data/hfopenllm_v2/paloalma/Le_Triomphant-ECE-TW3/53829ec0-f233-4b61-a672-6a467823caaa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/paloalma_Le_Triomphant-ECE-TW3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Le_Triomphant-ECE-TW3", - "id": "paloalma/Le_Triomphant-ECE-TW3", - "developer": "paloalma", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 72.289 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5402 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4763 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/paloalma/TW3-JRGL-v2/e2b41200-bff2-4835-a0ea-27ff56937570.json b/data/hfopenllm_v2/paloalma/TW3-JRGL-v2/e2b41200-bff2-4835-a0ea-27ff56937570.json deleted file mode 100644 index f94c8540b..000000000 --- a/data/hfopenllm_v2/paloalma/TW3-JRGL-v2/e2b41200-bff2-4835-a0ea-27ff56937570.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/paloalma_TW3-JRGL-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TW3-JRGL-v2", - "id": "paloalma/TW3-JRGL-v2", - "developer": "paloalma", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 72.289 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.6138 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.179 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4858 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4858 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/Al_Dente_v1_8b/3d33f26d-72be-451e-bcf0-501e0bc2f1db.json b/data/hfopenllm_v2/pankajmathur/Al_Dente_v1_8b/3d33f26d-72be-451e-bcf0-501e0bc2f1db.json deleted file mode 100644 index 4ce9999c5..000000000 --- a/data/hfopenllm_v2/pankajmathur/Al_Dente_v1_8b/3d33f26d-72be-451e-bcf0-501e0bc2f1db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_Al_Dente_v1_8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Al_Dente_v1_8b", - "id": "pankajmathur/Al_Dente_v1_8b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4835 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3987 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.286 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/model_007_13b_v2/3b4c05fc-2ccf-46db-8d64-045508f6614b.json b/data/hfopenllm_v2/pankajmathur/model_007_13b_v2/3b4c05fc-2ccf-46db-8d64-045508f6614b.json deleted file mode 100644 index 8fcac09dd..000000000 --- a/data/hfopenllm_v2/pankajmathur/model_007_13b_v2/3b4c05fc-2ccf-46db-8d64-045508f6614b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_model_007_13b_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "model_007_13b_v2", - "id": "pankajmathur/model_007_13b_v2", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3056 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4702 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2461 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_3b/af83a91c-3b07-48c6-9726-5bd77347f810.json b/data/hfopenllm_v2/pankajmathur/orca_mini_3b/af83a91c-3b07-48c6-9726-5bd77347f810.json deleted file mode 100644 index ba7ea5916..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_3b/af83a91c-3b07-48c6-9726-5bd77347f810.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_3b", - "id": "pankajmathur/orca_mini_3b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.426 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0742 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3349 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_7b/48759b07-9aea-42bd-8d73-9c4208d2789f.json b/data/hfopenllm_v2/pankajmathur/orca_mini_7b/48759b07-9aea-42bd-8d73-9c4208d2789f.json deleted file mode 100644 index 9f6551021..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_7b/48759b07-9aea-42bd-8d73-9c4208d2789f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_7b", - "id": "pankajmathur/orca_mini_7b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3332 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_phi-4/68820679-55f4-494d-91a0-0db1bccb8983.json b/data/hfopenllm_v2/pankajmathur/orca_mini_phi-4/68820679-55f4-494d-91a0-0db1bccb8983.json deleted file mode 100644 index cf14929d8..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_phi-4/68820679-55f4-494d-91a0-0db1bccb8983.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_phi-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_phi-4", - "id": "pankajmathur/orca_mini_phi-4", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7781 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6856 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4703 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5255 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v2_7b/029774ac-a63d-4acc-a37c-4194e4afdecc.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v2_7b/029774ac-a63d-4acc-a37c-4194e4afdecc.json deleted file mode 100644 index 37a85929b..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v2_7b/029774ac-a63d-4acc-a37c-4194e4afdecc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v2_7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v2_7b", - "id": "pankajmathur/orca_mini_v2_7b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1358 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1542 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v3_13b/146df856-e2c8-41eb-b860-ceb78c126e55.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v3_13b/146df856-e2c8-41eb-b860-ceb78c126e55.json deleted file mode 100644 index d255ddd83..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v3_13b/146df856-e2c8-41eb-b860-ceb78c126e55.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v3_13b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v3_13b", - "id": "pankajmathur/orca_mini_v3_13b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2897 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4711 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4598 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2305 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v3_70b/74c6bea7-ad16-4f08-a2b7-9c894b9ce207.json 
b/data/hfopenllm_v2/pankajmathur/orca_mini_v3_70b/74c6bea7-ad16-4f08-a2b7-9c894b9ce207.json deleted file mode 100644 index 39384845f..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v3_70b/74c6bea7-ad16-4f08-a2b7-9c894b9ce207.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v3_70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v3_70b", - "id": "pankajmathur/orca_mini_v3_70b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5949 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5079 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3757 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v3_7b/b5e97b2d-d8a2-485a-8b0a-71590e4a376e.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v3_7b/b5e97b2d-d8a2-485a-8b0a-71590e4a376e.json deleted file mode 100644 index 33c79ebe0..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v3_7b/b5e97b2d-d8a2-485a-8b0a-71590e4a376e.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v3_7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v3_7b", - "id": "pankajmathur/orca_mini_v3_7b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2821 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4095 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4982 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2084 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b/e79d0a8c-caec-4dec-b119-3229ffa69a73.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b/e79d0a8c-caec-4dec-b119-3229ffa69a73.json deleted file mode 100644 index 73bf37a29..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b/e79d0a8c-caec-4dec-b119-3229ffa69a73.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v5_8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v5_8b", - "id": "pankajmathur/orca_mini_v5_8b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4806 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3076 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_dpo/2c760893-b52a-40a9-9420-fb193a62a5c3.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_dpo/2c760893-b52a-40a9-9420-fb193a62a5c3.json deleted file mode 100644 index 1fb80ed95..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_dpo/2c760893-b52a-40a9-9420-fb193a62a5c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v5_8b_dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v5_8b_dpo", - "id": "pankajmathur/orca_mini_v5_8b_dpo", - "developer": 
"pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4896 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3894 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_orpo/ef9b84e0-68b0-4caa-9980-96ea5e7f440b.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_orpo/ef9b84e0-68b0-4caa-9980-96ea5e7f440b.json deleted file mode 100644 index b10b829f3..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_orpo/ef9b84e0-68b0-4caa-9980-96ea5e7f440b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v5_8b_orpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v5_8b_orpo", - "id": "pankajmathur/orca_mini_v5_8b_orpo", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0824 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4964 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2947 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b/fb48aff8-3f6b-4934-9fb8-d72bf8614d6f.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b/fb48aff8-3f6b-4934-9fb8-d72bf8614d6f.json deleted file mode 100644 index 0c0674af7..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b/fb48aff8-3f6b-4934-9fb8-d72bf8614d6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v6_8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v6_8b", - "id": "pankajmathur/orca_mini_v6_8b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0111 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b_dpo/9450acd9-16b6-49a2-9b73-cf1161b96df3.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b_dpo/9450acd9-16b6-49a2-9b73-cf1161b96df3.json deleted file mode 100644 index 92b4ebda8..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b_dpo/9450acd9-16b6-49a2-9b73-cf1161b96df3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v6_8b_dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v6_8b_dpo", - "id": "pankajmathur/orca_mini_v6_8b_dpo", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v7_72b/0d50ec2d-5dd4-487e-80cb-9533246a9876.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v7_72b/0d50ec2d-5dd4-487e-80cb-9533246a9876.json deleted file mode 100644 index 475ec2f73..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v7_72b/0d50ec2d-5dd4-487e-80cb-9533246a9876.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v7_72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v7_72b", - "id": "pankajmathur/orca_mini_v7_72b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6842 - } - 
}, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v7_7b/f6e6827d-fbf8-49cd-bdad-e8c7ea87550a.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v7_7b/f6e6827d-fbf8-49cd-bdad-e8c7ea87550a.json deleted file mode 100644 index 7c8b93f98..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v7_7b/f6e6827d-fbf8-49cd-bdad-e8c7ea87550a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v7_7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v7_7b", - "id": "pankajmathur/orca_mini_v7_7b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH 
Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v8_1_70b/c5e48fd8-0eea-46a9-8790-1745923561d3.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v8_1_70b/c5e48fd8-0eea-46a9-8790-1745923561d3.json deleted file mode 100644 index 2d067df48..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v8_1_70b/c5e48fd8-0eea-46a9-8790-1745923561d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v8_1_70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v8_1_70b", - "id": "pankajmathur/orca_mini_v8_1_70b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8571 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6781 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4329 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_0_3B-Instruct/870c7739-8886-47df-8e20-09bfae03b9c5.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_0_3B-Instruct/870c7739-8886-47df-8e20-09bfae03b9c5.json deleted file mode 100644 index 1af69e1fb..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_0_3B-Instruct/870c7739-8886-47df-8e20-09bfae03b9c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_0_3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_0_3B-Instruct", - "id": "pankajmathur/orca_mini_v9_0_3B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1465 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3659 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2603 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_1_1B-Instruct/d8eb5fd1-f1d4-481d-85af-88a11d7b6f6f.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_1_1B-Instruct/d8eb5fd1-f1d4-481d-85af-88a11d7b6f6f.json deleted file mode 100644 index fd6a47751..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_1_1B-Instruct/d8eb5fd1-f1d4-481d-85af-88a11d7b6f6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_1_1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_1_1B-Instruct", - "id": "pankajmathur/orca_mini_v9_1_1B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3629 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_14B/6625b2e0-1f65-4dc5-9913-ceb0e82e6439.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_14B/6625b2e0-1f65-4dc5-9913-ceb0e82e6439.json deleted file mode 100644 index a42075775..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_14B/6625b2e0-1f65-4dc5-9913-ceb0e82e6439.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_2_14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_2_14B", - "id": "pankajmathur/orca_mini_v9_2_14B", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7781 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6856 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4703 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5255 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_70b/24e7df20-e046-48f7-909e-502d0c70216a.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_70b/24e7df20-e046-48f7-909e-502d0c70216a.json deleted file mode 100644 index 7a7d9e13a..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_70b/24e7df20-e046-48f7-909e-502d0c70216a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_2_70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_2_70b", - "id": "pankajmathur/orca_mini_v9_2_70b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6745 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2938 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.471 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4821 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_4_70B/7920f562-9e7f-4a64-85f4-584b13af44de.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_4_70B/7920f562-9e7f-4a64-85f4-584b13af44de.json deleted file mode 100644 index 2d1a1c562..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_4_70B/7920f562-9e7f-4a64-85f4-584b13af44de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_4_70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_4_70B", - "id": "pankajmathur/orca_mini_v9_4_70B", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8015 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - } - } - ] -} \ No newline at end of file 
diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct/c6620817-69fe-40e2-bb0a-1e9c739ab65d.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct/c6620817-69fe-40e2-bb0a-1e9c739ab65d.json deleted file mode 100644 index e8976fd6e..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct/c6620817-69fe-40e2-bb0a-1e9c739ab65d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_5_1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_5_1B-Instruct", - "id": "pankajmathur/orca_mini_v9_5_1B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4638 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3337 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct_preview/520e2d66-4143-493b-8533-64f86c6d676e.json 
b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct_preview/520e2d66-4143-493b-8533-64f86c6d676e.json deleted file mode 100644 index ede7abcd5..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct_preview/520e2d66-4143-493b-8533-64f86c6d676e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_5_1B-Instruct_preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_5_1B-Instruct_preview", - "id": "pankajmathur/orca_mini_v9_5_1B-Instruct_preview", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3936 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1327 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_3B-Instruct/993bdfd2-3a88-4de3-9ed9-9b7b63c0f4f5.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_3B-Instruct/993bdfd2-3a88-4de3-9ed9-9b7b63c0f4f5.json deleted file mode 100644 index 
067bf5772..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_3B-Instruct/993bdfd2-3a88-4de3-9ed9-9b7b63c0f4f5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_5_3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_5_3B-Instruct", - "id": "pankajmathur/orca_mini_v9_5_3B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4496 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.427 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2882 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_1B-Instruct/4e1be694-cc4d-4943-a8e4-74913cfb2ebe.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_1B-Instruct/4e1be694-cc4d-4943-a8e4-74913cfb2ebe.json deleted file mode 100644 index 6331cdee7..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_1B-Instruct/4e1be694-cc4d-4943-a8e4-74913cfb2ebe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_6_1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_6_1B-Instruct", - "id": "pankajmathur/orca_mini_v9_6_1B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6086 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3561 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_3B-Instruct/42c174d1-6211-4438-bb9a-24f3cf386a6d.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_3B-Instruct/42c174d1-6211-4438-bb9a-24f3cf386a6d.json deleted file mode 100644 index 4727738e9..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_3B-Instruct/42c174d1-6211-4438-bb9a-24f3cf386a6d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_6_3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": 
"HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_6_3B-Instruct", - "id": "pankajmathur/orca_mini_v9_6_3B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7316 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4568 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2851 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_1B-Instruct/625bf39b-a118-4ec6-82d0-5405cf70ba53.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_1B-Instruct/625bf39b-a118-4ec6-82d0-5405cf70ba53.json deleted file mode 100644 index ad5c217d2..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_1B-Instruct/625bf39b-a118-4ec6-82d0-5405cf70ba53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_7_1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"orca_mini_v9_7_1B-Instruct", - "id": "pankajmathur/orca_mini_v9_7_1B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_3B-Instruct/e09cb198-d259-42ea-a356-6efe61b1e12b.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_3B-Instruct/e09cb198-d259-42ea-a356-6efe61b1e12b.json deleted file mode 100644 index 57b5c8eb9..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_3B-Instruct/e09cb198-d259-42ea-a356-6efe61b1e12b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_7_3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_7_3B-Instruct", - "id": "pankajmathur/orca_mini_v9_7_3B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5618 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/paulml/ECE-ILAB-Q1/5838b130-c2e6-400c-80b7-6822efb5db2c.json b/data/hfopenllm_v2/paulml/ECE-ILAB-Q1/5838b130-c2e6-400c-80b7-6822efb5db2c.json deleted file mode 100644 index 4666ebf13..000000000 --- a/data/hfopenllm_v2/paulml/ECE-ILAB-Q1/5838b130-c2e6-400c-80b7-6822efb5db2c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/paulml_ECE-ILAB-Q1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-ILAB-Q1", - "id": "paulml/ECE-ILAB-Q1", - "developer": "paulml", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7865 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6718 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4614 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pints-ai/1.5-Pints-16K-v0.1/52b51638-64cd-4b19-8fc7-c223d50bc549.json b/data/hfopenllm_v2/pints-ai/1.5-Pints-16K-v0.1/52b51638-64cd-4b19-8fc7-c223d50bc549.json deleted file mode 100644 index e89688fb3..000000000 --- a/data/hfopenllm_v2/pints-ai/1.5-Pints-16K-v0.1/52b51638-64cd-4b19-8fc7-c223d50bc549.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pints-ai_1.5-Pints-16K-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "1.5-Pints-16K-v0.1", - "id": "pints-ai/1.5-Pints-16K-v0.1", - "developer": "pints-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.566 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3133 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2357 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1119 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pints-ai/1.5-Pints-2K-v0.1/28b3178b-c963-4267-9649-3f7fc10fba3c.json b/data/hfopenllm_v2/pints-ai/1.5-Pints-2K-v0.1/28b3178b-c963-4267-9649-3f7fc10fba3c.json deleted file mode 100644 index 9f6e80b6f..000000000 --- a/data/hfopenllm_v2/pints-ai/1.5-Pints-2K-v0.1/28b3178b-c963-4267-9649-3f7fc10fba3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pints-ai_1.5-Pints-2K-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "1.5-Pints-2K-v0.1", - "id": "pints-ai/1.5-Pints-2K-v0.1", - "developer": "pints-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.566 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1104 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/piotr25691/thea-3b-25r/748298a2-5042-4636-ac7e-051c28916f3a.json b/data/hfopenllm_v2/piotr25691/thea-3b-25r/748298a2-5042-4636-ac7e-051c28916f3a.json deleted file mode 100644 index b4477c2ac..000000000 --- a/data/hfopenllm_v2/piotr25691/thea-3b-25r/748298a2-5042-4636-ac7e-051c28916f3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/piotr25691_thea-3b-25r/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "thea-3b-25r", - "id": "piotr25691/thea-3b-25r", - "developer": "piotr25691", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7344 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/piotr25691/thea-c-3b-25r/03bcd4e6-1620-424a-9200-c0cf4b73bbd2.json b/data/hfopenllm_v2/piotr25691/thea-c-3b-25r/03bcd4e6-1620-424a-9200-c0cf4b73bbd2.json deleted file mode 100644 index add3fcd04..000000000 --- a/data/hfopenllm_v2/piotr25691/thea-c-3b-25r/03bcd4e6-1620-424a-9200-c0cf4b73bbd2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/piotr25691_thea-c-3b-25r/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "thea-c-3b-25r", - "id": "piotr25691/thea-c-3b-25r", - "developer": "piotr25691", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7402 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1526 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3178 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/piotr25691/thea-rp-3b-25r/c7fba530-63cc-4ece-a171-4a2919aa8057.json b/data/hfopenllm_v2/piotr25691/thea-rp-3b-25r/c7fba530-63cc-4ece-a171-4a2919aa8057.json deleted file mode 100644 index 2c66084d4..000000000 --- a/data/hfopenllm_v2/piotr25691/thea-rp-3b-25r/c7fba530-63cc-4ece-a171-4a2919aa8057.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/piotr25691_thea-rp-3b-25r/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "thea-rp-3b-25r", - "id": "piotr25691/thea-rp-3b-25r", - "developer": "piotr25691", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6578 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": 
"MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/postbot/gpt2-medium-emailgen/c25c1046-a8d5-4f4b-9a72-c4591cfb4023.json b/data/hfopenllm_v2/postbot/gpt2-medium-emailgen/c25c1046-a8d5-4f4b-9a72-c4591cfb4023.json deleted file mode 100644 index e0f2e52de..000000000 --- a/data/hfopenllm_v2/postbot/gpt2-medium-emailgen/c25c1046-a8d5-4f4b-9a72-c4591cfb4023.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/postbot_gpt2-medium-emailgen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt2-medium-emailgen", - "id": "postbot/gpt2-medium-emailgen", - "developer": "postbot", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.38 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1492 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.313 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prince-canuma/Ministral-8B-Instruct-2410-HF/c3800a5c-310b-41cb-9b07-cfc1f1b13256.json b/data/hfopenllm_v2/prince-canuma/Ministral-8B-Instruct-2410-HF/c3800a5c-310b-41cb-9b07-cfc1f1b13256.json deleted file mode 100644 index 6b36039e0..000000000 --- a/data/hfopenllm_v2/prince-canuma/Ministral-8B-Instruct-2410-HF/c3800a5c-310b-41cb-9b07-cfc1f1b13256.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prince-canuma_Ministral-8B-Instruct-2410-HF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ministral-8B-Instruct-2410-HF", - "id": "prince-canuma/Ministral-8B-Instruct-2410-HF", - "developer": "prince-canuma", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.02 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5912 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4586 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", 
- "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3298 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Base/e8e2b99f-cf83-4776-9117-aa2b5d9c8068.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Base/e8e2b99f-cf83-4776-9117-aa2b5d9c8068.json deleted file mode 100644 index f4645a2cd..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Base/e8e2b99f-cf83-4776-9117-aa2b5d9c8068.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-8B-ProLong-512k-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-ProLong-512k-Base", - "id": "princeton-nlp/Llama-3-8B-ProLong-512k-Base", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5322 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5033 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4223 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/2da19e45-117f-446b-b956-b35a20bb7411.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/2da19e45-117f-446b-b956-b35a20bb7411.json deleted file mode 100644 index f3f5626a4..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/2da19e45-117f-446b-b956-b35a20bb7411.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-8B-ProLong-512k-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-ProLong-512k-Instruct", - "id": "princeton-nlp/Llama-3-8B-ProLong-512k-Instruct", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5508 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3231 - } - } - ] -} \ No newline at 
end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/9e982a33-19cb-4381-8560-884bc8946a2b.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/9e982a33-19cb-4381-8560-884bc8946a2b.json deleted file mode 100644 index 62d796011..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/9e982a33-19cb-4381-8560-884bc8946a2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-8B-ProLong-512k-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-ProLong-512k-Instruct", - "id": "princeton-nlp/Llama-3-8B-ProLong-512k-Instruct", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3246 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Base/9130a862-cfd7-47ce-a92a-f60438739491.json 
b/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Base/9130a862-cfd7-47ce-a92a-f60438739491.json deleted file mode 100644 index 41ea67f01..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Base/9130a862-cfd7-47ce-a92a-f60438739491.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-8B-ProLong-64k-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-ProLong-64k-Base", - "id": "princeton-nlp/Llama-3-8B-ProLong-64k-Base", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5201 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4927 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3348 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Instruct/858d3717-fcb2-45d9-8eaa-1b00ae0ca918.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Instruct/858d3717-fcb2-45d9-8eaa-1b00ae0ca918.json deleted file mode 100644 index b83c173d0..000000000 --- 
a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Instruct/858d3717-fcb2-45d9-8eaa-1b00ae0ca918.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-8B-ProLong-64k-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-ProLong-64k-Instruct", - "id": "princeton-nlp/Llama-3-8B-ProLong-64k-Instruct", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5563 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5083 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4397 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-CPO/5f1f137b-cb2f-4ee6-8bc9-5e0b94939f35.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-CPO/5f1f137b-cb2f-4ee6-8bc9-5e0b94939f35.json deleted file mode 100644 index 25691f45b..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-CPO/5f1f137b-cb2f-4ee6-8bc9-5e0b94939f35.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-CPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-CPO", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-CPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3703 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3609 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2976 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-DPO/6feca911-7a6e-43a2-b59d-7cb48070fe8e.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-DPO/6feca911-7a6e-43a2-b59d-7cb48070fe8e.json deleted file mode 100644 index 84c86e76e..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-DPO/6feca911-7a6e-43a2-b59d-7cb48070fe8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-DPO", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-DPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4666 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3078 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-IPO/d3ad9813-273e-47de-be16-312cc67ac64f.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-IPO/d3ad9813-273e-47de-be16-312cc67ac64f.json deleted file mode 100644 index 95f0c784c..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-IPO/d3ad9813-273e-47de-be16-312cc67ac64f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-IPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-IPO", - "id": 
"princeton-nlp/Llama-3-Base-8B-SFT-IPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.469 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3919 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-KTO/317205ee-2cc6-4523-9662-be6508314b08.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-KTO/317205ee-2cc6-4523-9662-be6508314b08.json deleted file mode 100644 index 1afb80262..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-KTO/317205ee-2cc6-4523-9662-be6508314b08.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-KTO", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-KTO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4523 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4693 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-ORPO/3b5fe65a-50a1-4036-b81a-86117356cab9.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-ORPO/3b5fe65a-50a1-4036-b81a-86117356cab9.json deleted file mode 100644 index 4c1233c1f..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-ORPO/3b5fe65a-50a1-4036-b81a-86117356cab9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-ORPO", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-ORPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4734 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3707 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3083 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RDPO/812ac262-97f4-485e-93de-f8d420b8658e.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RDPO/812ac262-97f4-485e-93de-f8d420b8658e.json deleted file mode 100644 index 21691d320..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RDPO/812ac262-97f4-485e-93de-f8d420b8658e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-RDPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-RDPO", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-RDPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.448 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4662 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RRHF/39cd7eb0-781e-47b6-8eaa-c72e702f778f.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RRHF/39cd7eb0-781e-47b6-8eaa-c72e702f778f.json deleted file mode 100644 index 564369fd0..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RRHF/39cd7eb0-781e-47b6-8eaa-c72e702f778f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-RRHF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-RRHF", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-RRHF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3357 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3722 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2889 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SLiC-HF/9411a8a4-306e-43da-96d7-c93eb3aac398.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SLiC-HF/9411a8a4-306e-43da-96d7-c93eb3aac398.json deleted file mode 100644 index 3220421bd..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SLiC-HF/9411a8a4-306e-43da-96d7-c93eb3aac398.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-SLiC-HF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-SLiC-HF", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-SLiC-HF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.489 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4704 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SimPO/c93feb32-0526-44ac-b3ed-95f08c37cc9f.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SimPO/c93feb32-0526-44ac-b3ed-95f08c37cc9f.json deleted file mode 100644 index 5ed4540df..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SimPO/c93feb32-0526-44ac-b3ed-95f08c37cc9f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-SimPO", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-SimPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4741 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4127 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT/1a3b0f7a-afb6-4002-9321-23a86f000c5c.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT/1a3b0f7a-afb6-4002-9321-23a86f000c5c.json deleted file mode 100644 index d4c8acc9b..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT/1a3b0f7a-afb6-4002-9321-23a86f000c5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT", - "id": "princeton-nlp/Llama-3-Base-8B-SFT", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO-v0.2/8d29363d-3096-4c54-a40e-acf4a7318a04.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO-v0.2/8d29363d-3096-4c54-a40e-acf4a7318a04.json deleted file mode 100644 index 6799069cb..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO-v0.2/8d29363d-3096-4c54-a40e-acf4a7318a04.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-CPO-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-CPO-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-CPO-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7506 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5027 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO/8cea452d-63b8-4e82-9511-64c94f8e140d.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO/8cea452d-63b8-4e82-9511-64c94f8e140d.json deleted file mode 100644 index acd408cf8..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO/8cea452d-63b8-4e82-9511-64c94f8e140d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-CPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-CPO", - "id": "princeton-nlp/Llama-3-Instruct-8B-CPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7293 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4999 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3652 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO-v0.2/5e5b5424-1d48-4a5e-8775-52c75609c338.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO-v0.2/5e5b5424-1d48-4a5e-8775-52c75609c338.json deleted file mode 100644 index 68024676a..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO-v0.2/5e5b5424-1d48-4a5e-8775-52c75609c338.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-DPO-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-DPO-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-DPO-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5056 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO/73787033-ed1d-4d2e-b7b2-e886ef6f1036.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO/73787033-ed1d-4d2e-b7b2-e886ef6f1036.json deleted file mode 100644 index 9471420c1..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO/73787033-ed1d-4d2e-b7b2-e886ef6f1036.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-DPO", - "id": "princeton-nlp/Llama-3-Instruct-8B-DPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3665 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO-v0.2/54c9403f-2525-45c0-a585-9ff598f95f6b.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO-v0.2/54c9403f-2525-45c0-a585-9ff598f95f6b.json deleted file mode 100644 index 8762059d7..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO-v0.2/54c9403f-2525-45c0-a585-9ff598f95f6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-KTO-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-KTO-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-KTO-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3668 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO/77d0d88d-7ca8-4f3e-8b79-295f53140635.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO/77d0d88d-7ca8-4f3e-8b79-295f53140635.json deleted file mode 100644 index 0d6769c0d..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO/77d0d88d-7ca8-4f3e-8b79-295f53140635.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-KTO", - "id": "princeton-nlp/Llama-3-Instruct-8B-KTO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6864 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4982 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO-v0.2/727f27e3-2a3f-4572-8db5-87e498c4b6ca.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO-v0.2/727f27e3-2a3f-4572-8db5-87e498c4b6ca.json deleted file mode 100644 index 26a0731e0..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO-v0.2/727f27e3-2a3f-4572-8db5-87e498c4b6ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-ORPO-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-ORPO-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-ORPO-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7633 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5078 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3731 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO/b6e0cc97-27cf-4082-a908-95d5c39014b8.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO/b6e0cc97-27cf-4082-a908-95d5c39014b8.json deleted file mode 100644 index 64f178681..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO/b6e0cc97-27cf-4082-a908-95d5c39014b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-ORPO", - "id": "princeton-nlp/Llama-3-Instruct-8B-ORPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7128 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5001 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3646 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO-v0.2/3b77ec51-fd47-4bc7-9e96-ed46202fef7c.json 
b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO-v0.2/3b77ec51-fd47-4bc7-9e96-ed46202fef7c.json deleted file mode 100644 index bac88e0ae..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO-v0.2/3b77ec51-fd47-4bc7-9e96-ed46202fef7c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-RDPO-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-RDPO-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-RDPO-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5049 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3774 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO/b24cdd3f-3e44-4ebe-b2b4-209ee0bbfbd3.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO/b24cdd3f-3e44-4ebe-b2b4-209ee0bbfbd3.json deleted file mode 100644 index 03be08868..000000000 --- 
a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO/b24cdd3f-3e44-4ebe-b2b4-209ee0bbfbd3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-RDPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-RDPO", - "id": "princeton-nlp/Llama-3-Instruct-8B-RDPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.666 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF-v0.2/e47a3cab-dfef-47f6-9377-9ee32489bab6.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF-v0.2/e47a3cab-dfef-47f6-9377-9ee32489bab6.json deleted file mode 100644 index d4d35dc63..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF-v0.2/e47a3cab-dfef-47f6-9377-9ee32489bab6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-RRHF-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-RRHF-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-RRHF-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7125 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF/1e4481fe-458b-4c23-8a6c-55439fb8b4fd.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF/1e4481fe-458b-4c23-8a6c-55439fb8b4fd.json deleted file mode 100644 index e44593e69..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF/1e4481fe-458b-4c23-8a6c-55439fb8b4fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-RRHF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF 
Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-RRHF", - "id": "princeton-nlp/Llama-3-Instruct-8B-RRHF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4911 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3476 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF-v0.2/6421e9dc-e7ca-4e1c-9f4f-1d1ac409c4d1.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF-v0.2/6421e9dc-e7ca-4e1c-9f4f-1d1ac409c4d1.json deleted file mode 100644 index 7c04e216a..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF-v0.2/6421e9dc-e7ca-4e1c-9f4f-1d1ac409c4d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-SLiC-HF-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "Llama-3-Instruct-8B-SLiC-HF-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-SLiC-HF-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF/55f43b53-6ed9-4c16-bf75-c968999a6f36.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF/55f43b53-6ed9-4c16-bf75-c968999a6f36.json deleted file mode 100644 index abb7ea88f..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF/55f43b53-6ed9-4c16-bf75-c968999a6f36.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-SLiC-HF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SLiC-HF", - "id": "princeton-nlp/Llama-3-Instruct-8B-SLiC-HF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5029 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3585 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO-v0.2/6ce93e70-04b1-46b8-b3e3-7eb0df35e1c1.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO-v0.2/6ce93e70-04b1-46b8-b3e3-7eb0df35e1c1.json deleted file mode 100644 index ffe9fbf5e..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO-v0.2/6ce93e70-04b1-46b8-b3e3-7eb0df35e1c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-SimPO-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SimPO-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-SimPO-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6809 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5038 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO/95096a89-2baf-4b14-bc6e-1f30e920c086.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO/95096a89-2baf-4b14-bc6e-1f30e920c086.json deleted file mode 100644 index c46b79969..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO/95096a89-2baf-4b14-bc6e-1f30e920c086.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SimPO", - "id": "princeton-nlp/Llama-3-Instruct-8B-SimPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6504 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4845 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3948 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3489 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-CPO/f1651632-2787-47cf-b471-89d1b89a6b01.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-CPO/f1651632-2787-47cf-b471-89d1b89a6b01.json deleted file mode 100644 index 625ad8634..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-CPO/f1651632-2787-47cf-b471-89d1b89a6b01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-CPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-CPO", - "id": "princeton-nlp/Mistral-7B-Base-SFT-CPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4655 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-DPO/e1fb2ac9-8f60-4dc1-9e0d-99fcb91a53a9.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-DPO/e1fb2ac9-8f60-4dc1-9e0d-99fcb91a53a9.json deleted file mode 100644 index a960ca923..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-DPO/e1fb2ac9-8f60-4dc1-9e0d-99fcb91a53a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-DPO", - "id": "princeton-nlp/Mistral-7B-Base-SFT-DPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4122 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2645 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-IPO/d3accbc1-d698-4357-ab08-0b98fb49b4ed.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-IPO/d3accbc1-d698-4357-ab08-0b98fb49b4ed.json deleted file mode 100644 index 908f14ac8..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-IPO/d3accbc1-d698-4357-ab08-0b98fb49b4ed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-IPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-IPO", - "id": "princeton-nlp/Mistral-7B-Base-SFT-IPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.483 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4458 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2792 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-KTO/5388a25a-5780-4ae1-999f-172b558a7b52.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-KTO/5388a25a-5780-4ae1-999f-172b558a7b52.json deleted file mode 100644 index 31e5cf5b4..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-KTO/5388a25a-5780-4ae1-999f-172b558a7b52.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-KTO", - "id": "princeton-nlp/Mistral-7B-Base-SFT-KTO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4785 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2872 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RDPO/9e4143ff-d461-4fdb-8bc7-86f959f69e68.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RDPO/9e4143ff-d461-4fdb-8bc7-86f959f69e68.json deleted file mode 100644 index 56f6acce0..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RDPO/9e4143ff-d461-4fdb-8bc7-86f959f69e68.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-RDPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-RDPO", - "id": "princeton-nlp/Mistral-7B-Base-SFT-RDPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RRHF/5d843bd7-b34b-41d4-92ff-c25a709b4930.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RRHF/5d843bd7-b34b-41d4-92ff-c25a709b4930.json deleted file mode 100644 index 8699b54e7..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RRHF/5d843bd7-b34b-41d4-92ff-c25a709b4930.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-RRHF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-RRHF", - "id": "princeton-nlp/Mistral-7B-Base-SFT-RRHF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4281 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2398 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SLiC-HF/87975b2f-298b-4297-8f4d-e5bb1bf5d113.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SLiC-HF/87975b2f-298b-4297-8f4d-e5bb1bf5d113.json deleted file mode 100644 index 09f698c9e..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SLiC-HF/87975b2f-298b-4297-8f4d-e5bb1bf5d113.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-SLiC-HF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-SLiC-HF", - "id": "princeton-nlp/Mistral-7B-Base-SFT-SLiC-HF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5127 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4422 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { 
- "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4261 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2781 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SimPO/41bb8174-f3d6-4862-b892-dbc9f6e2e696.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SimPO/41bb8174-f3d6-4862-b892-dbc9f6e2e696.json deleted file mode 100644 index 5f30fecdb..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SimPO/41bb8174-f3d6-4862-b892-dbc9f6e2e696.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-SimPO", - "id": "princeton-nlp/Mistral-7B-Base-SFT-SimPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3971 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2702 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-CPO/683ad2cd-5e39-4088-b98b-94d89dda7b88.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-CPO/683ad2cd-5e39-4088-b98b-94d89dda7b88.json deleted file mode 100644 index 256520f89..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-CPO/683ad2cd-5e39-4088-b98b-94d89dda7b88.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-CPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-CPO", - "id": "princeton-nlp/Mistral-7B-Instruct-CPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - 
{ - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-DPO/08ffd7ab-ccca-4258-be6d-cbc151cc43aa.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-DPO/08ffd7ab-ccca-4258-be6d-cbc151cc43aa.json deleted file mode 100644 index c489fb059..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-DPO/08ffd7ab-ccca-4258-be6d-cbc151cc43aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-DPO", - "id": "princeton-nlp/Mistral-7B-Instruct-DPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5176 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3833 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-IPO/4b6efad4-c697-4f0a-8d24-75dc49d8ec06.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-IPO/4b6efad4-c697-4f0a-8d24-75dc49d8ec06.json deleted file mode 100644 index 2ed1a6808..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-IPO/4b6efad4-c697-4f0a-8d24-75dc49d8ec06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-IPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-IPO", - "id": "princeton-nlp/Mistral-7B-Instruct-IPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4929 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4322 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4324 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2708 - } - 
} - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-KTO/4986c30a-85b0-4263-9be4-d69c9b067e0c.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-KTO/4986c30a-85b0-4263-9be4-d69c9b067e0c.json deleted file mode 100644 index 9d84a30d3..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-KTO/4986c30a-85b0-4263-9be4-d69c9b067e0c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-KTO", - "id": "princeton-nlp/Mistral-7B-Instruct-KTO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4908 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3953 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-ORPO/47b5a878-1a4a-425f-ae6f-ac286f681cca.json 
b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-ORPO/47b5a878-1a4a-425f-ae6f-ac286f681cca.json deleted file mode 100644 index ef91d0d7f..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-ORPO/47b5a878-1a4a-425f-ae6f-ac286f681cca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-ORPO", - "id": "princeton-nlp/Mistral-7B-Instruct-ORPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.472 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4104 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3912 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2662 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RDPO/992a6862-46b9-415e-858f-2eff8709ca81.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RDPO/992a6862-46b9-415e-858f-2eff8709ca81.json deleted file mode 100644 index 99e611d5b..000000000 --- 
a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RDPO/992a6862-46b9-415e-858f-2eff8709ca81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-RDPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-RDPO", - "id": "princeton-nlp/Mistral-7B-Instruct-RDPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4887 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RRHF/c6391381-c973-4068-b72c-af08762d9e5c.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RRHF/c6391381-c973-4068-b72c-af08762d9e5c.json deleted file mode 100644 index d0f2bc97e..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RRHF/c6391381-c973-4068-b72c-af08762d9e5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-RRHF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-RRHF", - "id": "princeton-nlp/Mistral-7B-Instruct-RRHF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SLiC-HF/0f6e18e6-1b0f-43f4-a9af-6632f6ce63cc.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SLiC-HF/0f6e18e6-1b0f-43f4-a9af-6632f6ce63cc.json deleted file mode 100644 index 43e0e4f1d..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SLiC-HF/0f6e18e6-1b0f-43f4-a9af-6632f6ce63cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-SLiC-HF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-SLiC-HF", - "id": "princeton-nlp/Mistral-7B-Instruct-SLiC-HF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5115 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.404 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3913 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2715 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SimPO/56d9ee92-6774-4c9b-9861-c5f0a9945e7c.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SimPO/56d9ee92-6774-4c9b-9861-c5f0a9945e7c.json deleted file mode 100644 index b5dab8271..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SimPO/56d9ee92-6774-4c9b-9861-c5f0a9945e7c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-SimPO", 
- "id": "princeton-nlp/Mistral-7B-Instruct-SimPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4687 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2797 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-1.3B/d3e753cc-37fc-4d77-8b2d-da90a7843d60.json b/data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-1.3B/d3e753cc-37fc-4d77-8b2d-da90a7843d60.json deleted file mode 100644 index e3ac0d5ad..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-1.3B/d3e753cc-37fc-4d77-8b2d-da90a7843d60.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Sheared-LLaMA-1.3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sheared-LLaMA-1.3B", - "id": "princeton-nlp/Sheared-LLaMA-1.3B", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 1.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3197 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-2.7B/eb08ef6f-6631-47c4-8f52-bf9454ad34b6.json b/data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-2.7B/eb08ef6f-6631-47c4-8f52-bf9454ad34b6.json deleted file mode 100644 index 8203b5fb5..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-2.7B/eb08ef6f-6631-47c4-8f52-bf9454ad34b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Sheared-LLaMA-2.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sheared-LLaMA-2.7B", - "id": "princeton-nlp/Sheared-LLaMA-2.7B", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.7 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2417 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3259 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3567 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1187 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-DPO/2207b154-c5d4-4e5a-ade0-271e62d6345f.json b/data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-DPO/2207b154-c5d4-4e5a-ade0-271e62d6345f.json deleted file mode 100644 index fd91acee8..000000000 --- a/data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-DPO/2207b154-c5d4-4e5a-ade0-271e62d6345f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_gemma-2-9b-it-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it-DPO", - "id": "princeton-nlp/gemma-2-9b-it-DPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2769 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5941 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-SimPO/f4161154-7777-4261-9275-a3002a1305d8.json b/data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-SimPO/f4161154-7777-4261-9275-a3002a1305d8.json deleted file mode 100644 index 0d667a77a..000000000 --- a/data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-SimPO/f4161154-7777-4261-9275-a3002a1305d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_gemma-2-9b-it-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it-SimPO", - "id": "princeton-nlp/gemma-2-9b-it-SimPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5839 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Bellatrix-1.5B-xElite/8523812d-1db6-4a9d-b06b-ac904191789d.json b/data/hfopenllm_v2/prithivMLmods/Bellatrix-1.5B-xElite/8523812d-1db6-4a9d-b06b-ac904191789d.json deleted file mode 100644 index a73f99ed2..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Bellatrix-1.5B-xElite/8523812d-1db6-4a9d-b06b-ac904191789d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Bellatrix-1.5B-xElite/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bellatrix-1.5B-xElite", - "id": "prithivMLmods/Bellatrix-1.5B-xElite", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1657 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1.5B-R1/6cd9ea81-618d-444e-a892-d4f9819daa67.json b/data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1.5B-R1/6cd9ea81-618d-444e-a892-d4f9819daa67.json deleted file mode 100644 index ed7c9d8c6..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1.5B-R1/6cd9ea81-618d-444e-a892-d4f9819daa67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Bellatrix-Tiny-1.5B-R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bellatrix-Tiny-1.5B-R1", - "id": "prithivMLmods/Bellatrix-Tiny-1.5B-R1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2751 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1B-v2/2217326d-377a-4503-8180-206c12c87436.json b/data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1B-v2/2217326d-377a-4503-8180-206c12c87436.json deleted file mode 100644 index e1aaf8729..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1B-v2/2217326d-377a-4503-8180-206c12c87436.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Bellatrix-Tiny-1B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bellatrix-Tiny-1B-v2", - "id": "prithivMLmods/Bellatrix-Tiny-1B-v2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.151 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3268 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1493 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Blaze-14B-xElite/3bbb10fc-e3b9-4c6a-ac35-ee5de9ecd330.json b/data/hfopenllm_v2/prithivMLmods/Blaze-14B-xElite/3bbb10fc-e3b9-4c6a-ac35-ee5de9ecd330.json deleted file mode 100644 index b456222e3..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Blaze-14B-xElite/3bbb10fc-e3b9-4c6a-ac35-ee5de9ecd330.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Blaze-14B-xElite/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Blaze-14B-xElite", - "id": "prithivMLmods/Blaze-14B-xElite", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6628 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4625 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/COCO-7B-Instruct-1M/01124f11-b739-422b-97f7-062074b8d0fb.json b/data/hfopenllm_v2/prithivMLmods/COCO-7B-Instruct-1M/01124f11-b739-422b-97f7-062074b8d0fb.json deleted file mode 100644 index 7b3bedda6..000000000 --- a/data/hfopenllm_v2/prithivMLmods/COCO-7B-Instruct-1M/01124f11-b739-422b-97f7-062074b8d0fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_COCO-7B-Instruct-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "COCO-7B-Instruct-1M", - "id": "prithivMLmods/COCO-7B-Instruct-1M", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4743 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.541 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-1M/7cc4c93b-7c43-4bed-84a3-fa1cd9130abb.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-1M/7cc4c93b-7c43-4bed-84a3-fa1cd9130abb.json deleted file mode 100644 index f174ff65b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-1M/7cc4c93b-7c43-4bed-84a3-fa1cd9130abb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite-1M", - "id": "prithivMLmods/Calcium-Opus-14B-Elite-1M", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5613 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6329 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4676 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-Stock/bf3aa551-f9c6-4203-b2d4-55cf9e6e2872.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-Stock/bf3aa551-f9c6-4203-b2d4-55cf9e6e2872.json deleted file mode 100644 index 5f39e061c..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-Stock/bf3aa551-f9c6-4203-b2d4-55cf9e6e2872.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite-Stock", - "id": "prithivMLmods/Calcium-Opus-14B-Elite-Stock", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6143 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6329 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4668 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5284 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/2eae8905-5338-4a78-86e7-d354d06efa23.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/2eae8905-5338-4a78-86e7-d354d06efa23.json deleted file mode 100644 index ead3156a2..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/2eae8905-5338-4a78-86e7-d354d06efa23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite", - "id": "prithivMLmods/Calcium-Opus-14B-Elite", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6296 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5307 - } - } - 
] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/9dcc4121-e046-49c7-969e-7255b0c32d3d.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/9dcc4121-e046-49c7-969e-7255b0c32d3d.json deleted file mode 100644 index 4683966d3..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/9dcc4121-e046-49c7-969e-7255b0c32d3d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite", - "id": "prithivMLmods/Calcium-Opus-14B-Elite", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6052 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6317 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4789 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2-R1/dd7d4acd-549a-467b-b461-0eba5b019122.json 
b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2-R1/dd7d4acd-549a-467b-b461-0eba5b019122.json deleted file mode 100644 index 4c691d1f5..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2-R1/dd7d4acd-549a-467b-b461-0eba5b019122.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite2-R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite2-R1", - "id": "prithivMLmods/Calcium-Opus-14B-Elite2-R1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6326 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6362 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3338 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5248 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2/159969cc-32c5-4f6f-b586-8e6d44180b44.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2/159969cc-32c5-4f6f-b586-8e6d44180b44.json deleted file mode 100644 index 48a8d72bd..000000000 --- 
a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2/159969cc-32c5-4f6f-b586-8e6d44180b44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite2", - "id": "prithivMLmods/Calcium-Opus-14B-Elite2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6176 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.469 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5301 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite3/b80e559d-e519-4678-8abc-ee5591b81fac.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite3/b80e559d-e519-4678-8abc-ee5591b81fac.json deleted file mode 100644 index 25816e9c4..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite3/b80e559d-e519-4678-8abc-ee5591b81fac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite3", - "id": "prithivMLmods/Calcium-Opus-14B-Elite3", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5428 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4705 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5335 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite4/90c137c9-939d-4e77-9fcc-9e33551a6121.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite4/90c137c9-939d-4e77-9fcc-9e33551a6121.json deleted file mode 100644 index 2a82505c9..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite4/90c137c9-939d-4e77-9fcc-9e33551a6121.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite4", - "id": "prithivMLmods/Calcium-Opus-14B-Elite4", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6112 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6195 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5149 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Merge/f25d6fef-d337-4cf7-ba05-ca6ff5eccd52.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Merge/f25d6fef-d337-4cf7-ba05-ca6ff5eccd52.json deleted file mode 100644 index 7ea9119d9..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Merge/f25d6fef-d337-4cf7-ba05-ca6ff5eccd52.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Merge", - "id": 
"prithivMLmods/Calcium-Opus-14B-Merge", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4861 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5356 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-20B-v1/c6f92306-dcdc-4549-bfc2-feb62a3a6ef6.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-20B-v1/c6f92306-dcdc-4549-bfc2-feb62a3a6ef6.json deleted file mode 100644 index 84bf99e76..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-20B-v1/c6f92306-dcdc-4549-bfc2-feb62a3a6ef6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-20B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-20B-v1", - "id": "prithivMLmods/Calcium-Opus-20B-v1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 
19.173 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4943 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Codepy-Deepthink-3B/96c64d23-d23d-486c-83a4-4c0ab4f09d60.json b/data/hfopenllm_v2/prithivMLmods/Codepy-Deepthink-3B/96c64d23-d23d-486c-83a4-4c0ab4f09d60.json deleted file mode 100644 index 8b565a25a..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Codepy-Deepthink-3B/96c64d23-d23d-486c-83a4-4c0ab4f09d60.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Codepy-Deepthink-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Codepy-Deepthink-3B", - "id": "prithivMLmods/Codepy-Deepthink-3B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4327 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.309 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Coma-II-14B/243abf0b-0f88-4b4f-ab51-6c8aebaf19be.json b/data/hfopenllm_v2/prithivMLmods/Coma-II-14B/243abf0b-0f88-4b4f-ab51-6c8aebaf19be.json deleted file mode 100644 index 8f5675e29..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Coma-II-14B/243abf0b-0f88-4b4f-ab51-6c8aebaf19be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Coma-II-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Coma-II-14B", - "id": "prithivMLmods/Coma-II-14B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4168 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": 
"BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6321 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4002 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.504 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Condor-Opus-14B-Exp/438fb728-d6ad-4c28-a43c-ff82d522cd50.json b/data/hfopenllm_v2/prithivMLmods/Condor-Opus-14B-Exp/438fb728-d6ad-4c28-a43c-ff82d522cd50.json deleted file mode 100644 index 8ad6643ba..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Condor-Opus-14B-Exp/438fb728-d6ad-4c28-a43c-ff82d522cd50.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Condor-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Condor-Opus-14B-Exp", - "id": "prithivMLmods/Condor-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.6154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5014 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Cygnus-II-14B/94b45b8d-b754-4fb4-843d-b7ffeafc4f1b.json b/data/hfopenllm_v2/prithivMLmods/Cygnus-II-14B/94b45b8d-b754-4fb4-843d-b7ffeafc4f1b.json deleted file mode 100644 index e0650e76a..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Cygnus-II-14B/94b45b8d-b754-4fb4-843d-b7ffeafc4f1b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Cygnus-II-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cygnus-II-14B", - "id": "prithivMLmods/Cygnus-II-14B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6184 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6661 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Deepthink-Llama-3-8B-Preview/5618fc82-d455-4261-8e34-1190d70fd3f3.json b/data/hfopenllm_v2/prithivMLmods/Deepthink-Llama-3-8B-Preview/5618fc82-d455-4261-8e34-1190d70fd3f3.json deleted file mode 100644 index 78544d542..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Deepthink-Llama-3-8B-Preview/5618fc82-d455-4261-8e34-1190d70fd3f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Deepthink-Llama-3-8B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Deepthink-Llama-3-8B-Preview", - "id": "prithivMLmods/Deepthink-Llama-3-8B-Preview", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2955 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4665 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3707 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2739 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-14B/395f6339-3fca-4f4d-befc-2d231008efdd.json b/data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-14B/395f6339-3fca-4f4d-befc-2d231008efdd.json deleted file mode 100644 index 3bf404b51..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-14B/395f6339-3fca-4f4d-befc-2d231008efdd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Deepthink-Reasoning-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Deepthink-Reasoning-14B", - "id": "prithivMLmods/Deepthink-Reasoning-14B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6334 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4732 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5296 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-7B/b22696ac-7074-44f2-b72f-c59ca0a41ce6.json b/data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-7B/b22696ac-7074-44f2-b72f-c59ca0a41ce6.json deleted file mode 100644 index 4a2741947..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-7B/b22696ac-7074-44f2-b72f-c59ca0a41ce6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Deepthink-Reasoning-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Deepthink-Reasoning-7B", - "id": "prithivMLmods/Deepthink-Reasoning-7B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.484 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5505 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3346 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4349 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Dinobot-Opus-14B-Exp/6856f8b6-a719-4f69-be71-4df582015f28.json b/data/hfopenllm_v2/prithivMLmods/Dinobot-Opus-14B-Exp/6856f8b6-a719-4f69-be71-4df582015f28.json deleted file mode 100644 index e70ce3f68..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Dinobot-Opus-14B-Exp/6856f8b6-a719-4f69-be71-4df582015f28.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Dinobot-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dinobot-Opus-14B-Exp", - "id": "prithivMLmods/Dinobot-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Elita-0.1-Distilled-R1-abliterated/f2c0ea2b-76ae-4469-832e-84c0b79fa283.json b/data/hfopenllm_v2/prithivMLmods/Elita-0.1-Distilled-R1-abliterated/f2c0ea2b-76ae-4469-832e-84c0b79fa283.json deleted file mode 100644 index 588c887dc..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Elita-0.1-Distilled-R1-abliterated/f2c0ea2b-76ae-4469-832e-84c0b79fa283.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Elita-0.1-Distilled-R1-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Elita-0.1-Distilled-R1-abliterated", - "id": "prithivMLmods/Elita-0.1-Distilled-R1-abliterated", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3828 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2758 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Elita-1/5619e3cb-eb3e-4420-a156-6f7b2a5d372d.json b/data/hfopenllm_v2/prithivMLmods/Elita-1/5619e3cb-eb3e-4420-a156-6f7b2a5d372d.json deleted file mode 100644 index 54138e141..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Elita-1/5619e3cb-eb3e-4420-a156-6f7b2a5d372d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Elita-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Elita-1", - "id": "prithivMLmods/Elita-1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4906 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3429 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Epimetheus-14B-Axo/9d5e329f-491a-4608-bcac-1ee63046b34a.json b/data/hfopenllm_v2/prithivMLmods/Epimetheus-14B-Axo/9d5e329f-491a-4608-bcac-1ee63046b34a.json deleted file mode 100644 index 188b84ba8..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Epimetheus-14B-Axo/9d5e329f-491a-4608-bcac-1ee63046b34a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Epimetheus-14B-Axo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Epimetheus-14B-Axo", - "id": "prithivMLmods/Epimetheus-14B-Axo", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5546 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6613 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5304 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/prithivMLmods/Equuleus-Opus-14B-Exp/80953f08-6530-4bab-a375-cc542081aabb.json b/data/hfopenllm_v2/prithivMLmods/Equuleus-Opus-14B-Exp/80953f08-6530-4bab-a375-cc542081aabb.json deleted file mode 100644 index 31a9c3f6f..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Equuleus-Opus-14B-Exp/80953f08-6530-4bab-a375-cc542081aabb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Equuleus-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Equuleus-Opus-14B-Exp", - "id": "prithivMLmods/Equuleus-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7001 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6434 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4952 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Eridanus-Opus-14B-r999/0b8691a8-f394-4da3-a67b-faa1af9b42c9.json b/data/hfopenllm_v2/prithivMLmods/Eridanus-Opus-14B-r999/0b8691a8-f394-4da3-a67b-faa1af9b42c9.json deleted file 
mode 100644 index 4e0e78a3e..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Eridanus-Opus-14B-r999/0b8691a8-f394-4da3-a67b-faa1af9b42c9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Eridanus-Opus-14B-r999/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Eridanus-Opus-14B-r999", - "id": "prithivMLmods/Eridanus-Opus-14B-r999", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6386 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6584 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4769 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5362 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Evac-Opus-14B-Exp/fb541a2b-d9bd-4aa2-8b83-da62a3b77731.json b/data/hfopenllm_v2/prithivMLmods/Evac-Opus-14B-Exp/fb541a2b-d9bd-4aa2-8b83-da62a3b77731.json deleted file mode 100644 index 29b96ac56..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Evac-Opus-14B-Exp/fb541a2b-d9bd-4aa2-8b83-da62a3b77731.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/prithivMLmods_Evac-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Evac-Opus-14B-Exp", - "id": "prithivMLmods/Evac-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5916 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6475 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4215 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/FastThink-0.5B-Tiny/c20d1c62-d3e0-4e30-b0d3-4c62a6585d23.json b/data/hfopenllm_v2/prithivMLmods/FastThink-0.5B-Tiny/c20d1c62-d3e0-4e30-b0d3-4c62a6585d23.json deleted file mode 100644 index 6a08d3a8b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/FastThink-0.5B-Tiny/c20d1c62-d3e0-4e30-b0d3-4c62a6585d23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_FastThink-0.5B-Tiny/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FastThink-0.5B-Tiny", - "id": "prithivMLmods/FastThink-0.5B-Tiny", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.258 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3206 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview/8a10eeb6-7178-4c78-8940-68fad78e389b.json b/data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview/8a10eeb6-7178-4c78-8940-68fad78e389b.json deleted file mode 100644 index 5fd930e8e..000000000 --- a/data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview/8a10eeb6-7178-4c78-8940-68fad78e389b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_GWQ-9B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GWQ-9B-Preview", - "id": "prithivMLmods/GWQ-9B-Preview", - "developer": "prithivMLmods", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5806 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4951 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3984 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview2/f0bb774c-a842-4261-b817-b169ce65a493.json b/data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview2/f0bb774c-a842-4261-b817-b169ce65a493.json deleted file mode 100644 index 99ceb5f7b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview2/f0bb774c-a842-4261-b817-b169ce65a493.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_GWQ-9B-Preview2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GWQ-9B-Preview2", - "id": "prithivMLmods/GWQ-9B-Preview2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5209 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5797 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2372 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3997 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/GWQ2b/59afe234-3a7f-49bb-873c-df6cf793e5e5.json b/data/hfopenllm_v2/prithivMLmods/GWQ2b/59afe234-3a7f-49bb-873c-df6cf793e5e5.json deleted file mode 100644 index 814634ab3..000000000 --- a/data/hfopenllm_v2/prithivMLmods/GWQ2b/59afe234-3a7f-49bb-873c-df6cf793e5e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_GWQ2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GWQ2b", - "id": "prithivMLmods/GWQ2b", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4115 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4311 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Gaea-Opus-14B-Exp/4074081a-66a6-42e4-994f-72541f90888b.json b/data/hfopenllm_v2/prithivMLmods/Gaea-Opus-14B-Exp/4074081a-66a6-42e4-994f-72541f90888b.json deleted file mode 100644 index 72bbcb358..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Gaea-Opus-14B-Exp/4074081a-66a6-42e4-994f-72541f90888b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Gaea-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gaea-Opus-14B-Exp", - "id": "prithivMLmods/Gaea-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5956 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4859 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp1/6a618ec8-c029-49ec-9ea5-da52b5231280.json b/data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp1/6a618ec8-c029-49ec-9ea5-da52b5231280.json deleted file mode 100644 index 9d9a90116..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp1/6a618ec8-c029-49ec-9ea5-da52b5231280.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Galactic-Qwen-14B-Exp1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Galactic-Qwen-14B-Exp1", - "id": "prithivMLmods/Galactic-Qwen-14B-Exp1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5832 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6582 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp2/edc8f510-c961-4c1f-9757-e80c4247f275.json b/data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp2/edc8f510-c961-4c1f-9757-e80c4247f275.json deleted file mode 100644 index 26d244447..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp2/edc8f510-c961-4c1f-9757-e80c4247f275.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Galactic-Qwen-14B-Exp2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Galactic-Qwen-14B-Exp2", - "id": "prithivMLmods/Galactic-Qwen-14B-Exp2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3474 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5691 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Gauss-Opus-14B-R999/aaa5d1e6-5aca-4471-87ea-7195610a6c1d.json b/data/hfopenllm_v2/prithivMLmods/Gauss-Opus-14B-R999/aaa5d1e6-5aca-4471-87ea-7195610a6c1d.json deleted file mode 100644 index 7057ef00e..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Gauss-Opus-14B-R999/aaa5d1e6-5aca-4471-87ea-7195610a6c1d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Gauss-Opus-14B-R999/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gauss-Opus-14B-R999", - "id": "prithivMLmods/Gauss-Opus-14B-R999", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3907 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6228 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5007 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Jolt-v0.1/89b45e8b-9979-4c7f-8aa6-c6ab7009cab0.json b/data/hfopenllm_v2/prithivMLmods/Jolt-v0.1/89b45e8b-9979-4c7f-8aa6-c6ab7009cab0.json deleted file mode 100644 index 96783fe07..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Jolt-v0.1/89b45e8b-9979-4c7f-8aa6-c6ab7009cab0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Jolt-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jolt-v0.1", - "id": "prithivMLmods/Jolt-v0.1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5092 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6521 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Lacerta-Opus-14B-Elite8/41000c74-8b29-4369-996f-cf3a2fd09f63.json b/data/hfopenllm_v2/prithivMLmods/Lacerta-Opus-14B-Elite8/41000c74-8b29-4369-996f-cf3a2fd09f63.json deleted file mode 100644 index 4a4f258fd..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Lacerta-Opus-14B-Elite8/41000c74-8b29-4369-996f-cf3a2fd09f63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Lacerta-Opus-14B-Elite8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lacerta-Opus-14B-Elite8", - "id": "prithivMLmods/Lacerta-Opus-14B-Elite8", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6141 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6401 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4635 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5322 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Llama-3.1-5B-Instruct/a1765846-74e1-440a-8851-12a571444059.json b/data/hfopenllm_v2/prithivMLmods/Llama-3.1-5B-Instruct/a1765846-74e1-440a-8851-12a571444059.json deleted file mode 100644 index ae7d80588..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-3.1-5B-Instruct/a1765846-74e1-440a-8851-12a571444059.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Llama-3.1-5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-5B-Instruct", - "id": "prithivMLmods/Llama-3.1-5B-Instruct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 5.413 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { 
- "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Llama-3.1-8B-Open-SFT/9c6b594f-387a-42a3-9e40-3b26363e6071.json b/data/hfopenllm_v2/prithivMLmods/Llama-3.1-8B-Open-SFT/9c6b594f-387a-42a3-9e40-3b26363e6071.json deleted file mode 100644 index 41ba66370..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-3.1-8B-Open-SFT/9c6b594f-387a-42a3-9e40-3b26363e6071.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Llama-3.1-8B-Open-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Open-SFT", - "id": "prithivMLmods/Llama-3.1-8B-Open-SFT", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3904 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3522 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Llama-3.2-3B-Math-Oct/2b910401-457a-45dd-920a-559f4595897b.json b/data/hfopenllm_v2/prithivMLmods/Llama-3.2-3B-Math-Oct/2b910401-457a-45dd-920a-559f4595897b.json deleted file mode 100644 index 7b7a839ae..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-3.2-3B-Math-Oct/2b910401-457a-45dd-920a-559f4595897b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Llama-3.2-3B-Math-Oct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Math-Oct", - "id": "prithivMLmods/Llama-3.2-3B-Math-Oct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.347 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/prithivMLmods/Llama-3.2-6B-AlgoCode/90b7be49-53a0-4d7f-8995-cbc52fe3a70f.json b/data/hfopenllm_v2/prithivMLmods/Llama-3.2-6B-AlgoCode/90b7be49-53a0-4d7f-8995-cbc52fe3a70f.json deleted file mode 100644 index 8570bf857..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-3.2-6B-AlgoCode/90b7be49-53a0-4d7f-8995-cbc52fe3a70f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Llama-3.2-6B-AlgoCode/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-6B-AlgoCode", - "id": "prithivMLmods/Llama-3.2-6B-AlgoCode", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.339 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3748 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4013 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Llama-8B-Distill-CoT/5e8854ba-7147-4fdd-a568-1ea58e79e7d8.json b/data/hfopenllm_v2/prithivMLmods/Llama-8B-Distill-CoT/5e8854ba-7147-4fdd-a568-1ea58e79e7d8.json deleted file mode 
100644 index 8bc00a642..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-8B-Distill-CoT/5e8854ba-7147-4fdd-a568-1ea58e79e7d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Llama-8B-Distill-CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-8B-Distill-CoT", - "id": "prithivMLmods/Llama-8B-Distill-CoT", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2732 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-1B/df6e0cfb-d720-428a-a5ad-b1529faa07c0.json b/data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-1B/df6e0cfb-d720-428a-a5ad-b1529faa07c0.json deleted file mode 100644 index 193336b1d..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-1B/df6e0cfb-d720-428a-a5ad-b1529faa07c0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/prithivMLmods_Llama-Deepsync-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Deepsync-1B", - "id": "prithivMLmods/Llama-Deepsync-1B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.357 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3386 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-3B/a88a6e6f-2253-4b67-9527-55ab6153e40f.json b/data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-3B/a88a6e6f-2253-4b67-9527-55ab6153e40f.json deleted file mode 100644 index 702c1642b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-3B/a88a6e6f-2253-4b67-9527-55ab6153e40f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Llama-Deepsync-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Deepsync-3B", - "id": "prithivMLmods/Llama-Deepsync-3B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3324 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3031 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Llama-Express.1-Math/00c66a37-b46b-47e8-a098-ce12433c1135.json b/data/hfopenllm_v2/prithivMLmods/Llama-Express.1-Math/00c66a37-b46b-47e8-a098-ce12433c1135.json deleted file mode 100644 index 3d5e5f05f..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-Express.1-Math/00c66a37-b46b-47e8-a098-ce12433c1135.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Llama-Express.1-Math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Express.1-Math", - "id": "prithivMLmods/Llama-Express.1-Math", - "developer": "prithivMLmods", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5084 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3143 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.161 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/LwQ-10B-Instruct/6ad5483c-13dc-4e79-a719-66af383d195a.json b/data/hfopenllm_v2/prithivMLmods/LwQ-10B-Instruct/6ad5483c-13dc-4e79-a719-66af383d195a.json deleted file mode 100644 index d8310afc7..000000000 --- a/data/hfopenllm_v2/prithivMLmods/LwQ-10B-Instruct/6ad5483c-13dc-4e79-a719-66af383d195a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_LwQ-10B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LwQ-10B-Instruct", - "id": "prithivMLmods/LwQ-10B-Instruct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5122 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4544 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3318 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/LwQ-Reasoner-10B/9fa6813a-7acb-4c08-9912-6dc0d356a7e2.json b/data/hfopenllm_v2/prithivMLmods/LwQ-Reasoner-10B/9fa6813a-7acb-4c08-9912-6dc0d356a7e2.json deleted file mode 100644 index e4c1d9c13..000000000 --- a/data/hfopenllm_v2/prithivMLmods/LwQ-Reasoner-10B/9fa6813a-7acb-4c08-9912-6dc0d356a7e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_LwQ-Reasoner-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LwQ-Reasoner-10B", - "id": "prithivMLmods/LwQ-Reasoner-10B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.2941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5866 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Magellanic-Opus-14B-Exp/3880e3bf-6ff0-4eef-a519-2649014254e1.json b/data/hfopenllm_v2/prithivMLmods/Magellanic-Opus-14B-Exp/3880e3bf-6ff0-4eef-a519-2649014254e1.json deleted file mode 100644 index 227d602c1..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Magellanic-Opus-14B-Exp/3880e3bf-6ff0-4eef-a519-2649014254e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Magellanic-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magellanic-Opus-14B-Exp", - "id": "prithivMLmods/Magellanic-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6866 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6383 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5273 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Magellanic-Qwen-25B-R999/e77efb9d-b1fc-4833-8e7f-8da683019018.json b/data/hfopenllm_v2/prithivMLmods/Magellanic-Qwen-25B-R999/e77efb9d-b1fc-4833-8e7f-8da683019018.json deleted file mode 100644 index e8a7e420c..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Magellanic-Qwen-25B-R999/e77efb9d-b1fc-4833-8e7f-8da683019018.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Magellanic-Qwen-25B-R999/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magellanic-Qwen-25B-R999", - "id": "prithivMLmods/Magellanic-Qwen-25B-R999", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 24.962 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2608 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.13 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp.v2/2bcc02df-8d27-412a-8b58-c331df98e4d4.json b/data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp.v2/2bcc02df-8d27-412a-8b58-c331df98e4d4.json deleted file mode 100644 index 1ccbd4ee2..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp.v2/2bcc02df-8d27-412a-8b58-c331df98e4d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Megatron-Corpus-14B-Exp.v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Megatron-Corpus-14B-Exp.v2", - "id": "prithivMLmods/Megatron-Corpus-14B-Exp.v2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6321 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2591 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.481 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp/622531d5-03f8-42cf-974e-94291aa1e515.json b/data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp/622531d5-03f8-42cf-974e-94291aa1e515.json deleted file mode 100644 index 7f5ceece4..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp/622531d5-03f8-42cf-974e-94291aa1e515.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Megatron-Corpus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Megatron-Corpus-14B-Exp", - "id": "prithivMLmods/Megatron-Corpus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6355 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3429 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.526 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.0/b772f20f-afbd-496c-9f94-e5fd30d54466.json b/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.0/b772f20f-afbd-496c-9f94-e5fd30d54466.json deleted file mode 100644 index 05306c382..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.0/b772f20f-afbd-496c-9f94-e5fd30d54466.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Megatron-Opus-14B-2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Megatron-Opus-14B-2.0", - "id": "prithivMLmods/Megatron-Opus-14B-2.0", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6871 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2779 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.1/169d5ad3-ae4a-42de-b951-f264d85bf623.json b/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.1/169d5ad3-ae4a-42de-b951-f264d85bf623.json deleted file mode 100644 index 0def4383d..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.1/169d5ad3-ae4a-42de-b951-f264d85bf623.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Megatron-Opus-14B-2.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Megatron-Opus-14B-2.1", - "id": "prithivMLmods/Megatron-Opus-14B-2.1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0246 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6727 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2998 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - 
} - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5174 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Exp/e84c3b50-4ea9-4f41-be11-50c6aa3d4656.json b/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Exp/e84c3b50-4ea9-4f41-be11-50c6aa3d4656.json deleted file mode 100644 index 6ccd36216..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Exp/e84c3b50-4ea9-4f41-be11-50c6aa3d4656.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Megatron-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Megatron-Opus-14B-Exp", - "id": "prithivMLmods/Megatron-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4887 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Stock/594780dc-d969-4a6b-b90b-1cc32f40c452.json b/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Stock/594780dc-d969-4a6b-b90b-1cc32f40c452.json deleted file mode 100644 index 5f0f9b228..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Stock/594780dc-d969-4a6b-b90b-1cc32f40c452.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Megatron-Opus-14B-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Megatron-Opus-14B-Stock", - "id": "prithivMLmods/Megatron-Opus-14B-Stock", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6412 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3346 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5293 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-7B-Exp/4ff7c238-d69c-4b92-83d0-69cacdfa0fe6.json b/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-7B-Exp/4ff7c238-d69c-4b92-83d0-69cacdfa0fe6.json deleted file mode 100644 index 781a02264..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-7B-Exp/4ff7c238-d69c-4b92-83d0-69cacdfa0fe6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Megatron-Opus-7B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Megatron-Opus-7B-Exp", - "id": "prithivMLmods/Megatron-Opus-7B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1971 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Messier-Opus-14B-Elite7/bb576dc9-eede-48d6-b438-732da91a4d29.json b/data/hfopenllm_v2/prithivMLmods/Messier-Opus-14B-Elite7/bb576dc9-eede-48d6-b438-732da91a4d29.json deleted file mode 100644 index a719fbf21..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Messier-Opus-14B-Elite7/bb576dc9-eede-48d6-b438-732da91a4d29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Messier-Opus-14B-Elite7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Messier-Opus-14B-Elite7", - "id": "prithivMLmods/Messier-Opus-14B-Elite7", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6499 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5404 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/prithivMLmods/Omni-Reasoner-Merged/0fb2fe17-b55d-4802-ad48-bd4d711e1e0f.json b/data/hfopenllm_v2/prithivMLmods/Omni-Reasoner-Merged/0fb2fe17-b55d-4802-ad48-bd4d711e1e0f.json deleted file mode 100644 index 86bda99c0..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Omni-Reasoner-Merged/0fb2fe17-b55d-4802-ad48-bd4d711e1e0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Omni-Reasoner-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Omni-Reasoner-Merged", - "id": "prithivMLmods/Omni-Reasoner-Merged", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5508 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4616 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Omni-Reasoner3-Merged/03d59002-dc98-467f-b2a9-605ef8d9b763.json b/data/hfopenllm_v2/prithivMLmods/Omni-Reasoner3-Merged/03d59002-dc98-467f-b2a9-605ef8d9b763.json deleted file mode 
100644 index 12f798f0d..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Omni-Reasoner3-Merged/03d59002-dc98-467f-b2a9-605ef8d9b763.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Omni-Reasoner3-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Omni-Reasoner3-Merged", - "id": "prithivMLmods/Omni-Reasoner3-Merged", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4935 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Pegasus-Opus-14B-Exp/8a7034fd-7027-4a87-9cac-c95b745935d0.json b/data/hfopenllm_v2/prithivMLmods/Pegasus-Opus-14B-Exp/8a7034fd-7027-4a87-9cac-c95b745935d0.json deleted file mode 100644 index 8f2a0236b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Pegasus-Opus-14B-Exp/8a7034fd-7027-4a87-9cac-c95b745935d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/prithivMLmods_Pegasus-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pegasus-Opus-14B-Exp", - "id": "prithivMLmods/Pegasus-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6982 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi-4-Empathetic/717f745f-1eae-4277-8a31-dbed140ef3e8.json b/data/hfopenllm_v2/prithivMLmods/Phi-4-Empathetic/717f745f-1eae-4277-8a31-dbed140ef3e8.json deleted file mode 100644 index 6b7f41a83..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi-4-Empathetic/717f745f-1eae-4277-8a31-dbed140ef3e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi-4-Empathetic/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Empathetic", - "id": "prithivMLmods/Phi-4-Empathetic", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0497 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6727 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2621 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi-4-Math-IO/2dc78735-c0c3-4dd7-8e97-52c92785e623.json b/data/hfopenllm_v2/prithivMLmods/Phi-4-Math-IO/2dc78735-c0c3-4dd7-8e97-52c92785e623.json deleted file mode 100644 index 6dc544cef..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi-4-Math-IO/2dc78735-c0c3-4dd7-8e97-52c92785e623.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi-4-Math-IO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Math-IO", - "id": "prithivMLmods/Phi-4-Math-IO", - "developer": "prithivMLmods", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.059 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6668 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4577 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3985 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi-4-QwQ/e9ab98ff-5cf0-4437-9cf3-c77ecb546c84.json b/data/hfopenllm_v2/prithivMLmods/Phi-4-QwQ/e9ab98ff-5cf0-4437-9cf3-c77ecb546c84.json deleted file mode 100644 index 6915cdbb1..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi-4-QwQ/e9ab98ff-5cf0-4437-9cf3-c77ecb546c84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi-4-QwQ/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-QwQ", - "id": "prithivMLmods/Phi-4-QwQ", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6696 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4577 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4651 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi-4-Super-1/6303d73e-4129-472a-a6fd-c64cb3de7204.json b/data/hfopenllm_v2/prithivMLmods/Phi-4-Super-1/6303d73e-4129-472a-a6fd-c64cb3de7204.json deleted file mode 100644 index 6500288f7..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi-4-Super-1/6303d73e-4129-472a-a6fd-c64cb3de7204.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi-4-Super-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Super-1", - "id": "prithivMLmods/Phi-4-Super-1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0418 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6729 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi-4-Super-o1/8a689e8f-19cc-45b7-80be-ce861a549af7.json b/data/hfopenllm_v2/prithivMLmods/Phi-4-Super-o1/8a689e8f-19cc-45b7-80be-ce861a549af7.json deleted file mode 100644 index 6a3f46611..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi-4-Super-o1/8a689e8f-19cc-45b7-80be-ce861a549af7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi-4-Super-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Super-o1", - "id": "prithivMLmods/Phi-4-Super-o1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0418 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6729 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi-4-Super/84881315-55a4-4f05-a115-cf82f850090d.json b/data/hfopenllm_v2/prithivMLmods/Phi-4-Super/84881315-55a4-4f05-a115-cf82f850090d.json deleted file mode 100644 index 659daae2b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi-4-Super/84881315-55a4-4f05-a115-cf82f850090d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi-4-Super/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Super", - "id": "prithivMLmods/Phi-4-Super", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3489 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi-4-o1/970dc71c-42be-4d50-86ac-f7301ec969ca.json b/data/hfopenllm_v2/prithivMLmods/Phi-4-o1/970dc71c-42be-4d50-86ac-f7301ec969ca.json deleted file mode 100644 index f575a083a..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi-4-o1/970dc71c-42be-4d50-86ac-f7301ec969ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi-4-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-o1", - "id": "prithivMLmods/Phi-4-o1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.029 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6689 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5174 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi4-Super/c02e1fcf-a837-4b8a-a42d-63837c56128d.json b/data/hfopenllm_v2/prithivMLmods/Phi4-Super/c02e1fcf-a837-4b8a-a42d-63837c56128d.json deleted file mode 100644 index af8b3653b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi4-Super/c02e1fcf-a837-4b8a-a42d-63837c56128d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi4-Super/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4-Super", - "id": "prithivMLmods/Phi4-Super", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3489 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Porpoise-Opus-14B-Exp/37280340-5b9a-47d9-aa37-9299d9025518.json b/data/hfopenllm_v2/prithivMLmods/Porpoise-Opus-14B-Exp/37280340-5b9a-47d9-aa37-9299d9025518.json deleted file mode 100644 index 71bc4e12e..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Porpoise-Opus-14B-Exp/37280340-5b9a-47d9-aa37-9299d9025518.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Porpoise-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Porpoise-Opus-14B-Exp", - "id": "prithivMLmods/Porpoise-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7098 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4041 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v1/46e7ad9b-b774-46b9-933c-913d1b307f7a.json b/data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v1/46e7ad9b-b774-46b9-933c-913d1b307f7a.json deleted file mode 100644 index bb915353c..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v1/46e7ad9b-b774-46b9-933c-913d1b307f7a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Primal-Opus-14B-Optimus-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Primal-Opus-14B-Optimus-v1", - "id": "prithivMLmods/Primal-Opus-14B-Optimus-v1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5013 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3384 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5259 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v2/c154d3f5-39dc-43c0-85ea-2e43b08494b4.json b/data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v2/c154d3f5-39dc-43c0-85ea-2e43b08494b4.json deleted file mode 100644 index ff98c5def..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v2/c154d3f5-39dc-43c0-85ea-2e43b08494b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Primal-Opus-14B-Optimus-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Primal-Opus-14B-Optimus-v2", - "id": "prithivMLmods/Primal-Opus-14B-Optimus-v2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-14B-Conversational/abd830e4-2b7f-4895-8262-75926edbafd9.json b/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-14B-Conversational/abd830e4-2b7f-4895-8262-75926edbafd9.json deleted file mode 100644 index 370d5eafa..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-14B-Conversational/abd830e4-2b7f-4895-8262-75926edbafd9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_QwQ-LCoT-14B-Conversational/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-LCoT-14B-Conversational", - "id": "prithivMLmods/QwQ-LCoT-14B-Conversational", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4047 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4653 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-3B-Instruct/2c945021-72e3-4e7a-9c6f-81efb27b2206.json b/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-3B-Instruct/2c945021-72e3-4e7a-9c6f-81efb27b2206.json deleted file mode 100644 index 183db0968..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-3B-Instruct/2c945021-72e3-4e7a-9c6f-81efb27b2206.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_QwQ-LCoT-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-LCoT-3B-Instruct", - "id": "prithivMLmods/QwQ-LCoT-3B-Instruct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4763 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2825 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-7B-Instruct/5f0ea694-7f73-45fa-b54f-49fc06d1a6d9.json b/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-7B-Instruct/5f0ea694-7f73-45fa-b54f-49fc06d1a6d9.json deleted file mode 100644 index 97c5d8499..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-7B-Instruct/5f0ea694-7f73-45fa-b54f-49fc06d1a6d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_QwQ-LCoT-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-LCoT-7B-Instruct", - "id": "prithivMLmods/QwQ-LCoT-7B-Instruct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4987 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4802 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4334 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT1-Merged/6c73f6ae-8ffd-4948-8071-33eab07437a6.json b/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT1-Merged/6c73f6ae-8ffd-4948-8071-33eab07437a6.json deleted file mode 100644 index 
5f5612428..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT1-Merged/6c73f6ae-8ffd-4948-8071-33eab07437a6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_QwQ-LCoT1-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-LCoT1-Merged", - "id": "prithivMLmods/QwQ-LCoT1-Merged", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4751 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3731 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4696 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT2-7B-Instruct/fbf71df3-b9c3-4f9c-b538-e4ccf097e81c.json b/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT2-7B-Instruct/fbf71df3-b9c3-4f9c-b538-e4ccf097e81c.json deleted file mode 100644 index 32ba1bb53..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT2-7B-Instruct/fbf71df3-b9c3-4f9c-b538-e4ccf097e81c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/prithivMLmods_QwQ-LCoT2-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-LCoT2-7B-Instruct", - "id": "prithivMLmods/QwQ-LCoT2-7B-Instruct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5425 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4564 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4342 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/QwQ-MathOct-7B/e3dcfd94-ca04-4cd3-ada5-e701a8b776da.json b/data/hfopenllm_v2/prithivMLmods/QwQ-MathOct-7B/e3dcfd94-ca04-4cd3-ada5-e701a8b776da.json deleted file mode 100644 index 380be752b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-MathOct-7B/e3dcfd94-ca04-4cd3-ada5-e701a8b776da.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_QwQ-MathOct-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-MathOct-7B", - "id": "prithivMLmods/QwQ-MathOct-7B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4684 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5486 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4601 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.433 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-1.5B-CoT/9278bcf2-bfab-437f-bd64-7496b24fb8cf.json b/data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-1.5B-CoT/9278bcf2-bfab-437f-bd64-7496b24fb8cf.json deleted file mode 100644 index cd87f5d54..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-1.5B-CoT/9278bcf2-bfab-437f-bd64-7496b24fb8cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_QwQ-R1-Distill-1.5B-CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-R1-Distill-1.5B-CoT", - "id": "prithivMLmods/QwQ-R1-Distill-1.5B-CoT", - "developer": "prithivMLmods", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2194 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3346 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1913 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-7B-CoT/633aa068-5613-41d8-a194-aebc9ce1586f.json b/data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-7B-CoT/633aa068-5613-41d8-a194-aebc9ce1586f.json deleted file mode 100644 index 3e840aa4b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-7B-CoT/633aa068-5613-41d8-a194-aebc9ce1586f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_QwQ-R1-Distill-7B-CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-R1-Distill-7B-CoT", - "id": "prithivMLmods/QwQ-R1-Distill-7B-CoT", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4683 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2804 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Qwen-7B-Distill-Reasoner/d3c1a922-a453-4c7b-b33b-52934e7bf72b.json b/data/hfopenllm_v2/prithivMLmods/Qwen-7B-Distill-Reasoner/d3c1a922-a453-4c7b-b33b-52934e7bf72b.json deleted file mode 100644 index d23b96a42..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Qwen-7B-Distill-Reasoner/d3c1a922-a453-4c7b-b33b-52934e7bf72b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Qwen-7B-Distill-Reasoner/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-7B-Distill-Reasoner", - "id": "prithivMLmods/Qwen-7B-Distill-Reasoner", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4409 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2818 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Qwen2.5-1.5B-DeepSeek-R1-Instruct/3a27b2a6-5eea-450b-91c7-1dc006229985.json b/data/hfopenllm_v2/prithivMLmods/Qwen2.5-1.5B-DeepSeek-R1-Instruct/3a27b2a6-5eea-450b-91c7-1dc006229985.json deleted file mode 100644 index 5d9c9b69d..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Qwen2.5-1.5B-DeepSeek-R1-Instruct/3a27b2a6-5eea-450b-91c7-1dc006229985.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Qwen2.5-1.5B-DeepSeek-R1-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1.5B-DeepSeek-R1-Instruct", - "id": "prithivMLmods/Qwen2.5-1.5B-DeepSeek-R1-Instruct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - 
} - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2824 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Qwen2.5-14B-DeepSeek-R1-1M/395e37ae-005d-47c0-9cf5-919460e34350.json b/data/hfopenllm_v2/prithivMLmods/Qwen2.5-14B-DeepSeek-R1-1M/395e37ae-005d-47c0-9cf5-919460e34350.json deleted file mode 100644 index d92c0c284..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Qwen2.5-14B-DeepSeek-R1-1M/395e37ae-005d-47c0-9cf5-919460e34350.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Qwen2.5-14B-DeepSeek-R1-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-DeepSeek-R1-1M", - "id": "prithivMLmods/Qwen2.5-14B-DeepSeek-R1-1M", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4193 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4606 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4899 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Qwen2.5-7B-DeepSeek-R1-1M/b03b7c7a-f263-4712-bcf4-2e32ca4bd237.json b/data/hfopenllm_v2/prithivMLmods/Qwen2.5-7B-DeepSeek-R1-1M/b03b7c7a-f263-4712-bcf4-2e32ca4bd237.json deleted file mode 100644 index 5a7f8d146..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Qwen2.5-7B-DeepSeek-R1-1M/b03b7c7a-f263-4712-bcf4-2e32ca4bd237.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Qwen2.5-7B-DeepSeek-R1-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-DeepSeek-R1-1M", - "id": "prithivMLmods/Qwen2.5-7B-DeepSeek-R1-1M", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1861 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - 
} - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/SmolLM2-CoT-360M/452ab810-6921-4922-9446-f2a5c081dc61.json b/data/hfopenllm_v2/prithivMLmods/SmolLM2-CoT-360M/452ab810-6921-4922-9446-f2a5c081dc61.json deleted file mode 100644 index b9e15197c..000000000 --- a/data/hfopenllm_v2/prithivMLmods/SmolLM2-CoT-360M/452ab810-6921-4922-9446-f2a5c081dc61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_SmolLM2-CoT-360M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-CoT-360M", - "id": "prithivMLmods/SmolLM2-CoT-360M", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2216 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3135 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1085 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite5/1abba5a0-f1a3-4f39-a81c-f4cd641d33ac.json b/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite5/1abba5a0-f1a3-4f39-a81c-f4cd641d33ac.json deleted file mode 100644 index 29c5fd248..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite5/1abba5a0-f1a3-4f39-a81c-f4cd641d33ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Sombrero-Opus-14B-Elite5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sombrero-Opus-14B-Elite5", - "id": "prithivMLmods/Sombrero-Opus-14B-Elite5", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7881 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6502 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5355 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite6/b2eefd3a-795c-4dc0-a10e-924bece05ea5.json b/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite6/b2eefd3a-795c-4dc0-a10e-924bece05ea5.json deleted file mode 100644 index 625a33f9d..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite6/b2eefd3a-795c-4dc0-a10e-924bece05ea5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Sombrero-Opus-14B-Elite6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sombrero-Opus-14B-Elite6", - "id": "prithivMLmods/Sombrero-Opus-14B-Elite6", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7226 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm1/008cc919-f156-4a2e-af4b-eed015ca91f6.json b/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm1/008cc919-f156-4a2e-af4b-eed015ca91f6.json deleted file mode 100644 index f425d01b1..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm1/008cc919-f156-4a2e-af4b-eed015ca91f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Sombrero-Opus-14B-Sm1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sombrero-Opus-14B-Sm1", - "id": "prithivMLmods/Sombrero-Opus-14B-Sm1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6355 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4035 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5299 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm2/9d56082f-5e46-4d7a-8f06-cb44fc983b3f.json b/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm2/9d56082f-5e46-4d7a-8f06-cb44fc983b3f.json deleted file mode 100644 index 333fc6de0..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm2/9d56082f-5e46-4d7a-8f06-cb44fc983b3f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Sombrero-Opus-14B-Sm2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sombrero-Opus-14B-Sm2", - "id": "prithivMLmods/Sombrero-Opus-14B-Sm2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4272 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6609 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5088 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm4/7ea26e73-a501-40bf-8f01-81ab8e850a91.json b/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm4/7ea26e73-a501-40bf-8f01-81ab8e850a91.json deleted file mode 100644 index 9a2d75354..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm4/7ea26e73-a501-40bf-8f01-81ab8e850a91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Sombrero-Opus-14B-Sm4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sombrero-Opus-14B-Sm4", - "id": "prithivMLmods/Sombrero-Opus-14B-Sm4", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4347 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6613 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4879 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5192 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm5/e3343130-cf4f-4e5c-b2d3-5dda13d575b9.json b/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm5/e3343130-cf4f-4e5c-b2d3-5dda13d575b9.json deleted file mode 100644 index 2d767adc0..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm5/e3343130-cf4f-4e5c-b2d3-5dda13d575b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Sombrero-Opus-14B-Sm5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sombrero-Opus-14B-Sm5", - "id": "prithivMLmods/Sombrero-Opus-14B-Sm5", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6852 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6564 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4094 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4806 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Sqweeks-7B-Instruct/ba1965f8-b59f-4d71-920c-e3b401ca0534.json b/data/hfopenllm_v2/prithivMLmods/Sqweeks-7B-Instruct/ba1965f8-b59f-4d71-920c-e3b401ca0534.json deleted file mode 100644 index 55de0790e..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Sqweeks-7B-Instruct/ba1965f8-b59f-4d71-920c-e3b401ca0534.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Sqweeks-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sqweeks-7B-Instruct", - "id": "prithivMLmods/Sqweeks-7B-Instruct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2158 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4667 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3133 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/prithivMLmods/Tadpole-Opus-14B-Exp/6dc87410-a39e-41b1-8759-68c1556c8419.json b/data/hfopenllm_v2/prithivMLmods/Tadpole-Opus-14B-Exp/6dc87410-a39e-41b1-8759-68c1556c8419.json deleted file mode 100644 index dc0362947..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Tadpole-Opus-14B-Exp/6dc87410-a39e-41b1-8759-68c1556c8419.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Tadpole-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tadpole-Opus-14B-Exp", - "id": "prithivMLmods/Tadpole-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6369 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3134 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5322 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Taurus-Opus-7B/c4ebe788-fb60-453b-914b-56bf87dd6374.json b/data/hfopenllm_v2/prithivMLmods/Taurus-Opus-7B/c4ebe788-fb60-453b-914b-56bf87dd6374.json deleted file mode 100644 index 
02432070f..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Taurus-Opus-7B/c4ebe788-fb60-453b-914b-56bf87dd6374.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Taurus-Opus-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Taurus-Opus-7B", - "id": "prithivMLmods/Taurus-Opus-7B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4223 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2168 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4399 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Triangulum-10B/45a44cc8-a550-4d2f-b0f4-37b4aac6a2b5.json b/data/hfopenllm_v2/prithivMLmods/Triangulum-10B/45a44cc8-a550-4d2f-b0f4-37b4aac6a2b5.json deleted file mode 100644 index 880fb2e12..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Triangulum-10B/45a44cc8-a550-4d2f-b0f4-37b4aac6a2b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/prithivMLmods_Triangulum-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Triangulum-10B", - "id": "prithivMLmods/Triangulum-10B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3229 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Triangulum-5B/10593c13-3b30-4605-8063-c6a6526fc9d9.json b/data/hfopenllm_v2/prithivMLmods/Triangulum-5B/10593c13-3b30-4605-8063-c6a6526fc9d9.json deleted file mode 100644 index 6aff0ab5e..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Triangulum-5B/10593c13-3b30-4605-8063-c6a6526fc9d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Triangulum-5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Triangulum-5B", - "id": "prithivMLmods/Triangulum-5B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 5.413 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1283 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1223 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Triangulum-v2-10B/12b8f4d7-2ae8-492c-8756-f7cb21a58c76.json b/data/hfopenllm_v2/prithivMLmods/Triangulum-v2-10B/12b8f4d7-2ae8-492c-8756-f7cb21a58c76.json deleted file mode 100644 index dc5fbfdfd..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Triangulum-v2-10B/12b8f4d7-2ae8-492c-8756-f7cb21a58c76.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Triangulum-v2-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Triangulum-v2-10B", - "id": "prithivMLmods/Triangulum-v2-10B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6705 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6065 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4281 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4466 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Tucana-Opus-14B-r999/96d9b675-c299-4138-a381-fb4de36287e5.json b/data/hfopenllm_v2/prithivMLmods/Tucana-Opus-14B-r999/96d9b675-c299-4138-a381-fb4de36287e5.json deleted file mode 100644 index 044957de8..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Tucana-Opus-14B-r999/96d9b675-c299-4138-a381-fb4de36287e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Tucana-Opus-14B-r999/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tucana-Opus-14B-r999", - "id": "prithivMLmods/Tucana-Opus-14B-r999", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6067 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4063 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Tulu-MathLingo-8B/17fffa9b-8ed4-44c7-87ea-7ee2c1f28e6a.json b/data/hfopenllm_v2/prithivMLmods/Tulu-MathLingo-8B/17fffa9b-8ed4-44c7-87ea-7ee2c1f28e6a.json deleted file mode 100644 index 075ca9a4b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Tulu-MathLingo-8B/17fffa9b-8ed4-44c7-87ea-7ee2c1f28e6a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Tulu-MathLingo-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tulu-MathLingo-8B", - "id": "prithivMLmods/Tulu-MathLingo-8B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5589 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4659 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3864 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3044 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-7B-Elite14/8999a5f3-f421-4663-835e-7626cebd2282.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-7B-Elite14/8999a5f3-f421-4663-835e-7626cebd2282.json deleted file mode 100644 index df24efe05..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-7B-Elite14/8999a5f3-f421-4663-835e-7626cebd2282.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-7B-Elite14/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-7B-Elite14", - "id": "prithivMLmods/Viper-Coder-7B-Elite14", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2829 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1089 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.2/951e1a4f-ed6c-49ca-b648-6086989e333f.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.2/951e1a4f-ed6c-49ca-b648-6086989e333f.json deleted file mode 100644 index 82f43e5e6..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.2/951e1a4f-ed6c-49ca-b648-6086989e333f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-Hybrid-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-Hybrid-v1.2", - "id": "prithivMLmods/Viper-Coder-Hybrid-v1.2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6736 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.6391 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4822 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.3/2acc0666-e0ff-4760-a74a-227a02775344.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.3/2acc0666-e0ff-4760-a74a-227a02775344.json deleted file mode 100644 index 0c7fa3197..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.3/2acc0666-e0ff-4760-a74a-227a02775344.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-Hybrid-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-Hybrid-v1.3", - "id": "prithivMLmods/Viper-Coder-Hybrid-v1.3", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6471 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4403 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-HybridMini-v1.3/3196c71d-0e0a-4d29-8bca-c31ba3d99dfd.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-HybridMini-v1.3/3196c71d-0e0a-4d29-8bca-c31ba3d99dfd.json deleted file mode 100644 index 37eb769a1..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-HybridMini-v1.3/3196c71d-0e0a-4d29-8bca-c31ba3d99dfd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-HybridMini-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-HybridMini-v1.3", - "id": "prithivMLmods/Viper-Coder-HybridMini-v1.3", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6104 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v0.1/e858aa6c-c424-447e-b512-7dcf794f9f0f.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v0.1/e858aa6c-c424-447e-b512-7dcf794f9f0f.json deleted file mode 100644 index f8a0e8b52..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v0.1/e858aa6c-c424-447e-b512-7dcf794f9f0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-v0.1", - "id": "prithivMLmods/Viper-Coder-v0.1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5521 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.1/8773eac5-205e-4264-981b-58f1a25f872a.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.1/8773eac5-205e-4264-981b-58f1a25f872a.json deleted file mode 100644 index 781f52345..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.1/8773eac5-205e-4264-981b-58f1a25f872a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-v1.1", - "id": "prithivMLmods/Viper-Coder-v1.1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.401 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5232 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.6-r999/c26ae286-a9b8-499f-b886-4b75be0cf2da.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.6-r999/c26ae286-a9b8-499f-b886-4b75be0cf2da.json deleted file mode 100644 index 852cfb91f..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.6-r999/c26ae286-a9b8-499f-b886-4b75be0cf2da.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-v1.6-r999/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-v1.6-r999", - "id": "prithivMLmods/Viper-Coder-v1.6-r999", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4433 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.401 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5232 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.7-Vsm6/d3a61998-2d41-4349-bd15-ce29143cc910.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.7-Vsm6/d3a61998-2d41-4349-bd15-ce29143cc910.json deleted file mode 100644 index 0353bf927..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.7-Vsm6/d3a61998-2d41-4349-bd15-ce29143cc910.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-v1.7-Vsm6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-v1.7-Vsm6", - "id": "prithivMLmods/Viper-Coder-v1.7-Vsm6", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5004 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6502 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-OneCoder-UIGEN/56b66428-2751-4c62-b98c-6c60e58c45ca.json b/data/hfopenllm_v2/prithivMLmods/Viper-OneCoder-UIGEN/56b66428-2751-4c62-b98c-6c60e58c45ca.json deleted file mode 100644 index 022eaef25..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-OneCoder-UIGEN/56b66428-2751-4c62-b98c-6c60e58c45ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-OneCoder-UIGEN/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-OneCoder-UIGEN", - "id": "prithivMLmods/Viper-OneCoder-UIGEN", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4692 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3904 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Volans-Opus-14B-Exp/9b2ec4af-4a7c-4cf7-8b7d-79b6cc219880.json b/data/hfopenllm_v2/prithivMLmods/Volans-Opus-14B-Exp/9b2ec4af-4a7c-4cf7-8b7d-79b6cc219880.json deleted file mode 100644 index ee55e09c5..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Volans-Opus-14B-Exp/9b2ec4af-4a7c-4cf7-8b7d-79b6cc219880.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Volans-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Volans-Opus-14B-Exp", - "id": "prithivMLmods/Volans-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5868 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6521 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4252 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5385 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/WebMind-7B-v0.1/5855a920-428f-4699-becc-73d4422f706f.json 
b/data/hfopenllm_v2/prithivMLmods/WebMind-7B-v0.1/5855a920-428f-4699-becc-73d4422f706f.json deleted file mode 100644 index 5e3113aff..000000000 --- a/data/hfopenllm_v2/prithivMLmods/WebMind-7B-v0.1/5855a920-428f-4699-becc-73d4422f706f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_WebMind-7B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WebMind-7B-v0.1", - "id": "prithivMLmods/WebMind-7B-v0.1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5434 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4537 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pszemraj/Llama-3-6.3b-v0.1/f1004f08-7f46-4eb1-8f60-66893fca7180.json b/data/hfopenllm_v2/pszemraj/Llama-3-6.3b-v0.1/f1004f08-7f46-4eb1-8f60-66893fca7180.json deleted file mode 100644 index 00c890719..000000000 --- a/data/hfopenllm_v2/pszemraj/Llama-3-6.3b-v0.1/f1004f08-7f46-4eb1-8f60-66893fca7180.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pszemraj_Llama-3-6.3b-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-6.3b-v0.1", - "id": "pszemraj/Llama-3-6.3b-v0.1", - "developer": "pszemraj", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1044 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3908 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.284 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pszemraj/Mistral-v0.3-6B/97db158a-3035-45d3-8d92-a08c9e605493.json b/data/hfopenllm_v2/pszemraj/Mistral-v0.3-6B/97db158a-3035-45d3-8d92-a08c9e605493.json deleted file mode 100644 index 73d3cc391..000000000 --- a/data/hfopenllm_v2/pszemraj/Mistral-v0.3-6B/97db158a-3035-45d3-8d92-a08c9e605493.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pszemraj_Mistral-v0.3-6B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-v0.3-6B", - "id": "pszemraj/Mistral-v0.3-6B", - "developer": "pszemraj", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 5.939 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3774 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3908 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/LLaMa_3.2_3B_Catalysts/0d81b928-2a24-4eb4-93d5-224e3c505532.json b/data/hfopenllm_v2/qingy2019/LLaMa_3.2_3B_Catalysts/0d81b928-2a24-4eb4-93d5-224e3c505532.json deleted file mode 100644 index 077f5175f..000000000 --- a/data/hfopenllm_v2/qingy2019/LLaMa_3.2_3B_Catalysts/0d81b928-2a24-4eb4-93d5-224e3c505532.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_LLaMa_3.2_3B_Catalysts/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa_3.2_3B_Catalysts", - "id": "qingy2019/LLaMa_3.2_3B_Catalysts", - "developer": "qingy2019", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4468 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3788 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3008 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/OpenMath2-Llama3.1-8B/bf4cc7ee-cad4-42af-8638-6b371577ec68.json b/data/hfopenllm_v2/qingy2019/OpenMath2-Llama3.1-8B/bf4cc7ee-cad4-42af-8638-6b371577ec68.json deleted file mode 100644 index 8d52de0fa..000000000 --- a/data/hfopenllm_v2/qingy2019/OpenMath2-Llama3.1-8B/bf4cc7ee-cad4-42af-8638-6b371577ec68.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_OpenMath2-Llama3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenMath2-Llama3.1-8B", - "id": "qingy2019/OpenMath2-Llama3.1-8B", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2674 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1553 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/Oracle-14B/5b574dda-0d85-47aa-9ebc-7f8581d402ca.json b/data/hfopenllm_v2/qingy2019/Oracle-14B/5b574dda-0d85-47aa-9ebc-7f8581d402ca.json deleted file mode 100644 index 8c2258503..000000000 --- a/data/hfopenllm_v2/qingy2019/Oracle-14B/5b574dda-0d85-47aa-9ebc-7f8581d402ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_Oracle-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Oracle-14B", - "id": "qingy2019/Oracle-14B", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 13.668 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2401 - } - }, - 
{ - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3703 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/Oracle-14B/6043830f-8a9d-4a03-9de5-4805724a9ae8.json b/data/hfopenllm_v2/qingy2019/Oracle-14B/6043830f-8a9d-4a03-9de5-4805724a9ae8.json deleted file mode 100644 index 3296a5d2c..000000000 --- a/data/hfopenllm_v2/qingy2019/Oracle-14B/6043830f-8a9d-4a03-9de5-4805724a9ae8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_Oracle-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Oracle-14B", - "id": "qingy2019/Oracle-14B", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 13.668 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2358 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4612 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3717 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Alpha/9d5fdb25-0d6a-4d5c-bcfb-0903504e620a.json b/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Alpha/9d5fdb25-0d6a-4d5c-bcfb-0903504e620a.json deleted file mode 100644 index 7da78d24a..000000000 --- a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Alpha/9d5fdb25-0d6a-4d5c-bcfb-0903504e620a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_Qwen2.5-Math-14B-Instruct-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-14B-Instruct-Alpha", - "id": "qingy2019/Qwen2.5-Math-14B-Instruct-Alpha", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5981 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6375 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Pro/217819b0-2c4b-4c26-823b-1ea14f893e01.json b/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Pro/217819b0-2c4b-4c26-823b-1ea14f893e01.json deleted file mode 100644 index 302221eda..000000000 --- a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Pro/217819b0-2c4b-4c26-823b-1ea14f893e01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_Qwen2.5-Math-14B-Instruct-Pro/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-14B-Instruct-Pro", - "id": "qingy2019/Qwen2.5-Math-14B-Instruct-Pro", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3558 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/0f844855-fb46-4b53-82c2-f36e5721c385.json b/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/0f844855-fb46-4b53-82c2-f36e5721c385.json deleted file mode 100644 index 792719c86..000000000 --- a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/0f844855-fb46-4b53-82c2-f36e5721c385.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_Qwen2.5-Math-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-14B-Instruct", - "id": "qingy2019/Qwen2.5-Math-14B-Instruct", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6005 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6356 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3691 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/59aaa7ed-27d4-4765-b115-90570ad86c77.json b/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/59aaa7ed-27d4-4765-b115-90570ad86c77.json deleted file mode 100644 index 5ff1d5b79..000000000 --- a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/59aaa7ed-27d4-4765-b115-90570ad86c77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_Qwen2.5-Math-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-14B-Instruct", - "id": "qingy2019/Qwen2.5-Math-14B-Instruct", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/Qwen2.5-Ultimate-14B-Instruct/4478c5ff-3b51-4be2-abce-3fb6a951b6e7.json b/data/hfopenllm_v2/qingy2019/Qwen2.5-Ultimate-14B-Instruct/4478c5ff-3b51-4be2-abce-3fb6a951b6e7.json deleted file mode 100644 index 12db4718f..000000000 --- a/data/hfopenllm_v2/qingy2019/Qwen2.5-Ultimate-14B-Instruct/4478c5ff-3b51-4be2-abce-3fb6a951b6e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_Qwen2.5-Ultimate-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Ultimate-14B-Instruct", - "id": "qingy2019/Qwen2.5-Ultimate-14B-Instruct", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5842 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4929 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Benchmaxx-Llama-3.2-1B-Instruct/9202146d-5889-49fd-9025-e03153ba9093.json b/data/hfopenllm_v2/qingy2024/Benchmaxx-Llama-3.2-1B-Instruct/9202146d-5889-49fd-9025-e03153ba9093.json deleted file mode 100644 index e323ed5cd..000000000 --- a/data/hfopenllm_v2/qingy2024/Benchmaxx-Llama-3.2-1B-Instruct/9202146d-5889-49fd-9025-e03153ba9093.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Benchmaxx-Llama-3.2-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Benchmaxx-Llama-3.2-1B-Instruct", - "id": "qingy2024/Benchmaxx-Llama-3.2-1B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2014 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8269 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4804 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Eyas-17B-Instruct/94257d3e-2b1e-47a1-bbd1-7fc696a574b3.json b/data/hfopenllm_v2/qingy2024/Eyas-17B-Instruct/94257d3e-2b1e-47a1-bbd1-7fc696a574b3.json deleted file mode 100644 index a3811ca19..000000000 --- a/data/hfopenllm_v2/qingy2024/Eyas-17B-Instruct/94257d3e-2b1e-47a1-bbd1-7fc696a574b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Eyas-17B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Eyas-17B-Instruct", - "id": "qingy2024/Eyas-17B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 17.431 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6085 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.247 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Falcon3-2x10B-MoE-Instruct/2245cf71-fb8d-44ca-b58d-06608312ee8c.json b/data/hfopenllm_v2/qingy2024/Falcon3-2x10B-MoE-Instruct/2245cf71-fb8d-44ca-b58d-06608312ee8c.json deleted file mode 100644 index 97d1ad31b..000000000 --- a/data/hfopenllm_v2/qingy2024/Falcon3-2x10B-MoE-Instruct/2245cf71-fb8d-44ca-b58d-06608312ee8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Falcon3-2x10B-MoE-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-2x10B-MoE-Instruct", - "id": "qingy2024/Falcon3-2x10B-MoE-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 18.799 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2795 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4423 - } - } - ] 
-} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Fusion-14B-Instruct/9a823fde-7802-4876-b72c-d8f73cd17236.json b/data/hfopenllm_v2/qingy2024/Fusion-14B-Instruct/9a823fde-7802-4876-b72c-d8f73cd17236.json deleted file mode 100644 index 7fb915e5f..000000000 --- a/data/hfopenllm_v2/qingy2024/Fusion-14B-Instruct/9a823fde-7802-4876-b72c-d8f73cd17236.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Fusion-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fusion-14B-Instruct", - "id": "qingy2024/Fusion-14B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6396 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Fusion2-14B-Instruct/ede99239-ef8f-49eb-a48b-0ec2553c99e5.json b/data/hfopenllm_v2/qingy2024/Fusion2-14B-Instruct/ede99239-ef8f-49eb-a48b-0ec2553c99e5.json deleted file mode 
100644 index 130eb0036..000000000 --- a/data/hfopenllm_v2/qingy2024/Fusion2-14B-Instruct/ede99239-ef8f-49eb-a48b-0ec2553c99e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Fusion2-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fusion2-14B-Instruct", - "id": "qingy2024/Fusion2-14B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6119 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4634 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5051 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Fusion4-14B-Instruct/4a307570-994f-491c-87a7-ad90b7965b8b.json b/data/hfopenllm_v2/qingy2024/Fusion4-14B-Instruct/4a307570-994f-491c-87a7-ad90b7965b8b.json deleted file mode 100644 index 144b32ba3..000000000 --- a/data/hfopenllm_v2/qingy2024/Fusion4-14B-Instruct/4a307570-994f-491c-87a7-ad90b7965b8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/qingy2024_Fusion4-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fusion4-14B-Instruct", - "id": "qingy2024/Fusion4-14B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6543 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/OwO-14B-Instruct/eb448d78-6417-4533-8458-99c1869a74ae.json b/data/hfopenllm_v2/qingy2024/OwO-14B-Instruct/eb448d78-6417-4533-8458-99c1869a74ae.json deleted file mode 100644 index 2f6ab406d..000000000 --- a/data/hfopenllm_v2/qingy2024/OwO-14B-Instruct/eb448d78-6417-4533-8458-99c1869a74ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_OwO-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OwO-14B-Instruct", - "id": "qingy2024/OwO-14B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6165 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5181 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/QwEnlarge-16B-Instruct/e1b8e4ad-4327-46b9-b957-fbd02e57c87e.json b/data/hfopenllm_v2/qingy2024/QwEnlarge-16B-Instruct/e1b8e4ad-4327-46b9-b957-fbd02e57c87e.json deleted file mode 100644 index 64c399092..000000000 --- a/data/hfopenllm_v2/qingy2024/QwEnlarge-16B-Instruct/e1b8e4ad-4327-46b9-b957-fbd02e57c87e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_QwEnlarge-16B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwEnlarge-16B-Instruct", - "id": "qingy2024/QwEnlarge-16B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 15.871 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7802 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5949 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/QwQ-14B-Math-v0.2/aab6b224-b948-4fb1-84b7-0dbe5c46d527.json b/data/hfopenllm_v2/qingy2024/QwQ-14B-Math-v0.2/aab6b224-b948-4fb1-84b7-0dbe5c46d527.json deleted file mode 100644 index 1d8168654..000000000 --- a/data/hfopenllm_v2/qingy2024/QwQ-14B-Math-v0.2/aab6b224-b948-4fb1-84b7-0dbe5c46d527.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_QwQ-14B-Math-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-14B-Math-v0.2", - "id": "qingy2024/QwQ-14B-Math-v0.2", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3391 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5731 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4811 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwarkstar-4B-Instruct-Preview/2e5cd1de-6109-4f76-b722-abbd4b207f4d.json b/data/hfopenllm_v2/qingy2024/Qwarkstar-4B-Instruct-Preview/2e5cd1de-6109-4f76-b722-abbd4b207f4d.json deleted file mode 100644 index dd72818b2..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwarkstar-4B-Instruct-Preview/2e5cd1de-6109-4f76-b722-abbd4b207f4d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwarkstar-4B-Instruct-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwarkstar-4B-Instruct-Preview", - "id": "qingy2024/Qwarkstar-4B-Instruct-Preview", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 4.473 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.5324 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2502 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwarkstar-4B/767d1296-4971-478f-8d78-1d63d162ae5b.json b/data/hfopenllm_v2/qingy2024/Qwarkstar-4B/767d1296-4971-478f-8d78-1d63d162ae5b.json deleted file mode 100644 index 4fa365663..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwarkstar-4B/767d1296-4971-478f-8d78-1d63d162ae5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwarkstar-4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwarkstar-4B", - "id": "qingy2024/Qwarkstar-4B", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 4.473 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1994 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2425 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwen2.5-4B/eab74e3b-de61-4fa9-87c2-56e69b70349a.json b/data/hfopenllm_v2/qingy2024/Qwen2.5-4B/eab74e3b-de61-4fa9-87c2-56e69b70349a.json deleted file mode 100644 index 9a1241837..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwen2.5-4B/eab74e3b-de61-4fa9-87c2-56e69b70349a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwen2.5-4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-4B", - "id": "qingy2024/Qwen2.5-4B", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 4.168 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2158 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4269 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwen2.5-Coder-Draft-1.5B-Instruct/3219d563-3bfb-4618-8cb3-e9b198d5b11f.json b/data/hfopenllm_v2/qingy2024/Qwen2.5-Coder-Draft-1.5B-Instruct/3219d563-3bfb-4618-8cb3-e9b198d5b11f.json deleted file mode 100644 index 56cfd6f64..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwen2.5-Coder-Draft-1.5B-Instruct/3219d563-3bfb-4618-8cb3-e9b198d5b11f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwen2.5-Coder-Draft-1.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-Draft-1.5B-Instruct", - "id": "qingy2024/Qwen2.5-Coder-Draft-1.5B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3837 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1579 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2244 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Alpha/233fd27c-561e-4c9e-a917-cbc5b08c055a.json b/data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Alpha/233fd27c-561e-4c9e-a917-cbc5b08c055a.json deleted file mode 100644 index 140c278d3..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Alpha/233fd27c-561e-4c9e-a917-cbc5b08c055a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwen2.5-Math-14B-Instruct-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-14B-Instruct-Alpha", - "id": "qingy2024/Qwen2.5-Math-14B-Instruct-Alpha", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4966 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Preview/a875e8f7-a4e6-4c17-abbc-b8d4b73b7501.json b/data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Preview/a875e8f7-a4e6-4c17-abbc-b8d4b73b7501.json deleted file mode 100644 index f9376abc8..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Preview/a875e8f7-a4e6-4c17-abbc-b8d4b73b7501.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwen2.5-Math-14B-Instruct-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-14B-Instruct-Preview", - "id": "qingy2024/Qwen2.5-Math-14B-Instruct-Preview", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7826 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6294 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4758 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4115 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4993 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwen2.6-14B-Instruct/4b68ba49-6681-4add-9197-2cd711701e15.json b/data/hfopenllm_v2/qingy2024/Qwen2.6-14B-Instruct/4b68ba49-6681-4add-9197-2cd711701e15.json deleted file mode 100644 index c2c1c3606..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwen2.6-14B-Instruct/4b68ba49-6681-4add-9197-2cd711701e15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwen2.6-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.6-14B-Instruct", - "id": "qingy2024/Qwen2.6-14B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5811 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6394 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5285 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwen2.6-Math-14B-Instruct/5679ca73-3d5f-4bc7-bea2-5e9e713db0cc.json b/data/hfopenllm_v2/qingy2024/Qwen2.6-Math-14B-Instruct/5679ca73-3d5f-4bc7-bea2-5e9e713db0cc.json deleted file mode 100644 index ee424cd53..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwen2.6-Math-14B-Instruct/5679ca73-3d5f-4bc7-bea2-5e9e713db0cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwen2.6-Math-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.6-Math-14B-Instruct", - "id": "qingy2024/Qwen2.6-Math-14B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3862 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6324 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4759 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5241 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qq8933/OpenLongCoT-Base-Gemma2-2B/a6c631f6-890c-4199-abee-18b012bc48df.json b/data/hfopenllm_v2/qq8933/OpenLongCoT-Base-Gemma2-2B/a6c631f6-890c-4199-abee-18b012bc48df.json deleted file mode 100644 index 62c5fe7c9..000000000 --- a/data/hfopenllm_v2/qq8933/OpenLongCoT-Base-Gemma2-2B/a6c631f6-890c-4199-abee-18b012bc48df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qq8933_OpenLongCoT-Base-Gemma2-2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenLongCoT-Base-Gemma2-2B", - "id": "qq8933/OpenLongCoT-Base-Gemma2-2B", - "developer": "qq8933", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 3.204 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1965 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3106 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/raphgg/test-2.5-72B/1edc3610-40fc-467d-8410-26d4b6adebce.json b/data/hfopenllm_v2/raphgg/test-2.5-72B/1edc3610-40fc-467d-8410-26d4b6adebce.json deleted file mode 100644 index f9c3617d5..000000000 --- a/data/hfopenllm_v2/raphgg/test-2.5-72B/1edc3610-40fc-467d-8410-26d4b6adebce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/raphgg_test-2.5-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-2.5-72B", - "id": "raphgg/test-2.5-72B", - "developer": "raphgg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8437 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4109 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5837 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/rasyosef/Mistral-NeMo-Minitron-8B-Chat/42c773ba-8fb4-4b3c-8ac7-0688519bb55c.json b/data/hfopenllm_v2/rasyosef/Mistral-NeMo-Minitron-8B-Chat/42c773ba-8fb4-4b3c-8ac7-0688519bb55c.json deleted file mode 100644 index 33c5db79a..000000000 --- a/data/hfopenllm_v2/rasyosef/Mistral-NeMo-Minitron-8B-Chat/42c773ba-8fb4-4b3c-8ac7-0688519bb55c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rasyosef_Mistral-NeMo-Minitron-8B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-NeMo-Minitron-8B-Chat", - "id": "rasyosef/Mistral-NeMo-Minitron-8B-Chat", - "developer": "rasyosef", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.414 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4759 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4304 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2404 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rasyosef/Phi-1_5-Instruct-v0.1/1a371df5-447f-4fd8-8fe8-dbf9a1dc079a.json b/data/hfopenllm_v2/rasyosef/Phi-1_5-Instruct-v0.1/1a371df5-447f-4fd8-8fe8-dbf9a1dc079a.json 
deleted file mode 100644 index a55a5da1a..000000000 --- a/data/hfopenllm_v2/rasyosef/Phi-1_5-Instruct-v0.1/1a371df5-447f-4fd8-8fe8-dbf9a1dc079a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rasyosef_Phi-1_5-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-1_5-Instruct-v0.1", - "id": "rasyosef/Phi-1_5-Instruct-v0.1", - "developer": "rasyosef", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "PhiForCausalLM", - "params_billions": 1.415 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2402 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3118 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rasyosef/phi-2-instruct-apo/821a21a0-6fd7-438a-933d-5e31b2dd2adc.json b/data/hfopenllm_v2/rasyosef/phi-2-instruct-apo/821a21a0-6fd7-438a-933d-5e31b2dd2adc.json deleted file mode 100644 index 02df295de..000000000 --- a/data/hfopenllm_v2/rasyosef/phi-2-instruct-apo/821a21a0-6fd7-438a-933d-5e31b2dd2adc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/rasyosef_phi-2-instruct-apo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-2-instruct-apo", - "id": "rasyosef/phi-2-instruct-apo", - "developer": "rasyosef", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.775 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4445 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2155 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rasyosef/phi-2-instruct-v0.1/781a4cc6-a69d-4106-81aa-06e114f7c897.json b/data/hfopenllm_v2/rasyosef/phi-2-instruct-v0.1/781a4cc6-a69d-4106-81aa-06e114f7c897.json deleted file mode 100644 index 16d481683..000000000 --- a/data/hfopenllm_v2/rasyosef/phi-2-instruct-v0.1/781a4cc6-a69d-4106-81aa-06e114f7c897.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rasyosef_phi-2-instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-2-instruct-v0.1", - "id": "rasyosef/phi-2-instruct-v0.1", - "developer": "rasyosef", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.775 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3681 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4726 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2247 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/realtreetune/rho-1b-sft-MATH/e49c98b4-46f4-406e-9eeb-7072bf72b9a3.json b/data/hfopenllm_v2/realtreetune/rho-1b-sft-MATH/e49c98b4-46f4-406e-9eeb-7072bf72b9a3.json deleted file mode 100644 index 56ba23460..000000000 --- a/data/hfopenllm_v2/realtreetune/rho-1b-sft-MATH/e49c98b4-46f4-406e-9eeb-7072bf72b9a3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/realtreetune_rho-1b-sft-MATH/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "rho-1b-sft-MATH", - "id": "realtreetune/rho-1b-sft-MATH", - "developer": "realtreetune", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2121 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3458 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/3b7524a8-d17b-4788-93f2-11076df464a7.json b/data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/3b7524a8-d17b-4788-93f2-11076df464a7.json deleted file mode 100644 index b8063cb4c..000000000 --- a/data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/3b7524a8-d17b-4788-93f2-11076df464a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_Gemma-2-Ataraxy-Gemmasutra-9B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-Gemmasutra-9B-slerp", - "id": "recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2854 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1005 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4607 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/6188a57f-4bc3-42a5-ad18-c59774e40407.json b/data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/6188a57f-4bc3-42a5-ad18-c59774e40407.json deleted file mode 100644 index eda35f4dd..000000000 --- a/data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/6188a57f-4bc3-42a5-ad18-c59774e40407.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_Gemma-2-Ataraxy-Gemmasutra-9B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-Gemmasutra-9B-slerp", - "id": "recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5974 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.1/28689805-7c4c-438e-8431-f4a6aceb5e94.json b/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.1/28689805-7c4c-438e-8431-f4a6aceb5e94.json deleted file mode 100644 index fb6c7b792..000000000 --- a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.1/28689805-7c4c-438e-8431-f4a6aceb5e94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_recoilme-gemma-2-9B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-9B-v0.1", - "id": "recoilme/recoilme-gemma-2-9B-v0.1", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5995 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2039 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4159 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7c156689-9668-4ded-bacc-c88a03ad1526.json b/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7c156689-9668-4ded-bacc-c88a03ad1526.json deleted file mode 100644 index 8be1d1216..000000000 --- a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7c156689-9668-4ded-bacc-c88a03ad1526.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_recoilme-gemma-2-9B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-9B-v0.2", - "id": "recoilme/recoilme-gemma-2-9B-v0.2", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7592 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4099 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4163 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7e43f187-1959-4dfe-802f-094ba88f3b0d.json b/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7e43f187-1959-4dfe-802f-094ba88f3b0d.json deleted file mode 100644 index 2194c56b4..000000000 --- a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7e43f187-1959-4dfe-802f-094ba88f3b0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_recoilme-gemma-2-9B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-9B-v0.2", - "id": "recoilme/recoilme-gemma-2-9B-v0.2", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2747 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6031 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/a6170173-ef17-4cfa-a76e-8e51cb8cb970.json b/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/a6170173-ef17-4cfa-a76e-8e51cb8cb970.json deleted file mode 100644 index b07a872e1..000000000 --- a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/a6170173-ef17-4cfa-a76e-8e51cb8cb970.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_recoilme-gemma-2-9B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-9B-v0.3", - "id": "recoilme/recoilme-gemma-2-9B-v0.3", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7439 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5993 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4204 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4072 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/e998d52b-dd94-4ef2-9cfc-5034ded0105a.json b/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/e998d52b-dd94-4ef2-9cfc-5034ded0105a.json deleted file mode 100644 index f0948a794..000000000 --- a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/e998d52b-dd94-4ef2-9cfc-5034ded0105a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_recoilme-gemma-2-9B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-9B-v0.3", - "id": "recoilme/recoilme-gemma-2-9B-v0.3", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5761 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4632 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.4/a3ac60bd-8fb3-47d9-b378-1f0c4d74fed2.json b/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.4/a3ac60bd-8fb3-47d9-b378-1f0c4d74fed2.json deleted file mode 100644 index 4927c66a1..000000000 --- a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.4/a3ac60bd-8fb3-47d9-b378-1f0c4d74fed2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_recoilme-gemma-2-9B-v0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-9B-v0.4", - "id": "recoilme/recoilme-gemma-2-9B-v0.4", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2562 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5967 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { 
- "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.5/0f69217c-74ed-4398-8d1b-53d1a43be890.json b/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.5/0f69217c-74ed-4398-8d1b-53d1a43be890.json deleted file mode 100644 index 31bf944b2..000000000 --- a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.5/0f69217c-74ed-4398-8d1b-53d1a43be890.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_recoilme-gemma-2-9B-v0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-9B-v0.5", - "id": "recoilme/recoilme-gemma-2-9B-v0.5", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7664 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5981 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/redrix/AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS/b973adcc-769c-4009-87c5-5f5af02a5d3a.json b/data/hfopenllm_v2/redrix/AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS/b973adcc-769c-4009-87c5-5f5af02a5d3a.json deleted file mode 100644 index f90aef9ac..000000000 --- a/data/hfopenllm_v2/redrix/AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS/b973adcc-769c-4009-87c5-5f5af02a5d3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/redrix_AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS", - "id": "redrix/AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS", - "developer": "redrix", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5129 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/redrix/patricide-12B-Unslop-Mell/4b30f11e-a2b9-40e9-b080-9d7484a5d048.json b/data/hfopenllm_v2/redrix/patricide-12B-Unslop-Mell/4b30f11e-a2b9-40e9-b080-9d7484a5d048.json deleted file mode 100644 index 98a7c58db..000000000 --- a/data/hfopenllm_v2/redrix/patricide-12B-Unslop-Mell/4b30f11e-a2b9-40e9-b080-9d7484a5d048.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/redrix_patricide-12B-Unslop-Mell/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "patricide-12B-Unslop-Mell", - "id": "redrix/patricide-12B-Unslop-Mell", - "developer": "redrix", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4074 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/refuelai/Llama-3-Refueled/befdae09-4caa-4996-a3ac-fe36310aaf01.json b/data/hfopenllm_v2/refuelai/Llama-3-Refueled/befdae09-4caa-4996-a3ac-fe36310aaf01.json deleted file mode 100644 index d68f26a1b..000000000 --- a/data/hfopenllm_v2/refuelai/Llama-3-Refueled/befdae09-4caa-4996-a3ac-fe36310aaf01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/refuelai_Llama-3-Refueled/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Refueled", - "id": "refuelai/Llama-3-Refueled", - "developer": "refuelai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5871 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4454 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3095 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/rhplus0831/maid-yuzu-v7/8cd7fc1b-2873-4154-9de7-c0b8e5f4f5e9.json b/data/hfopenllm_v2/rhplus0831/maid-yuzu-v7/8cd7fc1b-2873-4154-9de7-c0b8e5f4f5e9.json deleted file mode 100644 index 92de2faa1..000000000 --- a/data/hfopenllm_v2/rhplus0831/maid-yuzu-v7/8cd7fc1b-2873-4154-9de7-c0b8e5f4f5e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rhplus0831_maid-yuzu-v7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "maid-yuzu-v7", - "id": "rhplus0831/maid-yuzu-v7", - "developer": "rhplus0831", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4805 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rhymes-ai/Aria/7f6e5858-f5d4-41cf-9bb7-c3c82a55c392.json b/data/hfopenllm_v2/rhymes-ai/Aria/7f6e5858-f5d4-41cf-9bb7-c3c82a55c392.json deleted file mode 100644 index f9084404d..000000000 --- 
a/data/hfopenllm_v2/rhymes-ai/Aria/7f6e5858-f5d4-41cf-9bb7-c3c82a55c392.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rhymes-ai_Aria/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aria", - "id": "rhymes-ai/Aria", - "developer": "rhymes-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "AriaForConditionalGeneration", - "params_billions": 25.307 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4773 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5695 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4405 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rhysjones/phi-2-orange-v2/7b8bf84f-4101-41a1-b6ff-9cadbb5f84a3.json b/data/hfopenllm_v2/rhysjones/phi-2-orange-v2/7b8bf84f-4101-41a1-b6ff-9cadbb5f84a3.json deleted file mode 100644 index c708415c7..000000000 --- a/data/hfopenllm_v2/rhysjones/phi-2-orange-v2/7b8bf84f-4101-41a1-b6ff-9cadbb5f84a3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rhysjones_phi-2-orange-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-2-orange-v2", - "id": "rhysjones/phi-2-orange-v2", - "developer": "rhysjones", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.477 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2532 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/riaz/FineLlama-3.1-8B/1f3a733d-a6d3-453b-9763-61992cd514b0.json b/data/hfopenllm_v2/riaz/FineLlama-3.1-8B/1f3a733d-a6d3-453b-9763-61992cd514b0.json deleted file mode 100644 index 8ee7040c9..000000000 --- a/data/hfopenllm_v2/riaz/FineLlama-3.1-8B/1f3a733d-a6d3-453b-9763-61992cd514b0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/riaz_FineLlama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineLlama-3.1-8B", - "id": "riaz/FineLlama-3.1-8B", - "developer": "riaz", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4586 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3763 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2964 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/riaz/FineLlama-3.1-8B/d0eed3c1-2226-48c5-a314-e429f66c5053.json b/data/hfopenllm_v2/riaz/FineLlama-3.1-8B/d0eed3c1-2226-48c5-a314-e429f66c5053.json deleted file mode 100644 index d57a902b6..000000000 --- a/data/hfopenllm_v2/riaz/FineLlama-3.1-8B/d0eed3c1-2226-48c5-a314-e429f66c5053.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/riaz_FineLlama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineLlama-3.1-8B", - "id": "riaz/FineLlama-3.1-8B", - "developer": "riaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4137 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4565 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rmdhirr/Gluon-8B/957f02f1-45c7-4cce-b5aa-86bb5e485ad3.json b/data/hfopenllm_v2/rmdhirr/Gluon-8B/957f02f1-45c7-4cce-b5aa-86bb5e485ad3.json deleted file mode 100644 index 37d3c1fbe..000000000 --- a/data/hfopenllm_v2/rmdhirr/Gluon-8B/957f02f1-45c7-4cce-b5aa-86bb5e485ad3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rmdhirr_Gluon-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gluon-8B", - "id": "rmdhirr/Gluon-8B", - "developer": "rmdhirr", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5053 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5153 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1443 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3808 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-14b/55a01e8e-318a-4609-a862-bab4d62b3e7a.json b/data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-14b/55a01e8e-318a-4609-a862-bab4d62b3e7a.json deleted file mode 100644 index c74361cf2..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-14b/55a01e8e-318a-4609-a862-bab4d62b3e7a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-Coder-V2.5-Qwen-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-Coder-V2.5-Qwen-14b", - "id": "rombodawg/Rombos-Coder-V2.5-Qwen-14b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7047 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6165 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3915 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-7b/cbdcd76f-be8f-42fe-89ed-d1d09d9d785f.json b/data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-7b/cbdcd76f-be8f-42fe-89ed-d1d09d9d785f.json deleted file mode 100644 index e153878a8..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-7b/cbdcd76f-be8f-42fe-89ed-d1d09d9d785f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-Coder-V2.5-Qwen-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-Coder-V2.5-Qwen-7b", - "id": "rombodawg/Rombos-Coder-V2.5-Qwen-7b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5077 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3338 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-0.5b/c7b6515e-6f96-468b-8bc0-15212c31e790.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-0.5b/c7b6515e-6f96-468b-8bc0-15212c31e790.json deleted file mode 100644 index 65683c672..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-0.5b/c7b6515e-6f96-468b-8bc0-15212c31e790.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5-Qwen-0.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-0.5b", - "id": "rombodawg/Rombos-LLM-V2.5-Qwen-0.5b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2847 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3294 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1866 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-1.5b/f27f3a1d-c19a-42b2-8b49-64ecfe5d3405.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-1.5b/f27f3a1d-c19a-42b2-8b49-64ecfe5d3405.json deleted file mode 100644 index c6d4a9338..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-1.5b/f27f3a1d-c19a-42b2-8b49-64ecfe5d3405.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5-Qwen-1.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-1.5b", - "id": "rombodawg/Rombos-LLM-V2.5-Qwen-1.5b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3402 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4257 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2922 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-14b/994aa481-627a-4bed-8719-9e874373cbc6.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-14b/994aa481-627a-4bed-8719-9e874373cbc6.json deleted file mode 100644 index 85be3f982..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-14b/994aa481-627a-4bed-8719-9e874373cbc6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5-Qwen-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-14b", - "id": "rombodawg/Rombos-LLM-V2.5-Qwen-14b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - 
}, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4717 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-32b/9f5cd849-20b1-4e8d-9deb-f286dcfd9d6e.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-32b/9f5cd849-20b1-4e8d-9deb-f286dcfd9d6e.json deleted file mode 100644 index 85c6ec04a..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-32b/9f5cd849-20b1-4e8d-9deb-f286dcfd9d6e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5-Qwen-32b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-32b", - "id": "rombodawg/Rombos-LLM-V2.5-Qwen-32b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6827 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7046 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5916 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-3b/c4dd34f2-7acc-4a94-a9aa-3c6aeeae8a8c.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-3b/c4dd34f2-7acc-4a94-a9aa-3c6aeeae8a8c.json deleted file mode 100644 index 272c24433..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-3b/c4dd34f2-7acc-4a94-a9aa-3c6aeeae8a8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5-Qwen-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-3b", - "id": "rombodawg/Rombos-LLM-V2.5-Qwen-3b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5342 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4809 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2795 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4042 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-72b/e908b473-a015-4156-8e88-d67153479cb9.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-72b/e908b473-a015-4156-8e88-d67153479cb9.json deleted file mode 100644 index 04db7b85b..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-72b/e908b473-a015-4156-8e88-d67153479cb9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5-Qwen-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-72b", - "id": "rombodawg/Rombos-LLM-V2.5-Qwen-72b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7155 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3985 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5935 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-7b/173af77d-7a51-4d5a-8fd3-366aaa5d78a0.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-7b/173af77d-7a51-4d5a-8fd3-366aaa5d78a0.json deleted file mode 100644 index 5fba641ae..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-7b/173af77d-7a51-4d5a-8fd3-366aaa5d78a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5-Qwen-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-7b", - "id": "rombodawg/Rombos-LLM-V2.5-Qwen-7b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6237 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4291 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4469 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/0bb65f09-323d-485f-886e-5a35c8bcd342.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/0bb65f09-323d-485f-886e-5a35c8bcd342.json deleted file mode 100644 index 18b32860b..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/0bb65f09-323d-485f-886e-5a35c8bcd342.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5.1-Qwen-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5.1-Qwen-3b", - "id": "rombodawg/Rombos-LLM-V2.5.1-Qwen-3b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2566 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2741 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/86b4c877-ef2d-4563-93a2-92d7e77eab5c.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/86b4c877-ef2d-4563-93a2-92d7e77eab5c.json deleted file mode 
100644 index df1c02b1d..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/86b4c877-ef2d-4563-93a2-92d7e77eab5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5.1-Qwen-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5.1-Qwen-3b", - "id": "rombodawg/Rombos-LLM-V2.5.1-Qwen-3b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2595 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Nemotron-70b/be2ee3f6-37ee-4895-821a-3d3c7eb04eac.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Nemotron-70b/be2ee3f6-37ee-4895-821a-3d3c7eb04eac.json deleted file mode 100644 index 945721456..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Nemotron-70b/be2ee3f6-37ee-4895-821a-3d3c7eb04eac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.6-Nemotron-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.6-Nemotron-70b", - "id": "rombodawg/Rombos-LLM-V2.6-Nemotron-70b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6938 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4669 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Qwen-14b/e574af17-dd3b-4c09-8689-ea598d44e562.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Qwen-14b/e574af17-dd3b-4c09-8689-ea598d44e562.json deleted file mode 100644 index a382429d5..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Qwen-14b/e574af17-dd3b-4c09-8689-ea598d44e562.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.6-Qwen-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.6-Qwen-14b", - "id": "rombodawg/Rombos-LLM-V2.6-Qwen-14b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8432 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6442 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4221 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4961 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Instruct-8b-Merged/83958185-047a-4356-918d-2f45f273c08a.json b/data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Instruct-8b-Merged/83958185-047a-4356-918d-2f45f273c08a.json deleted file mode 100644 index 3c606dff2..000000000 --- a/data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Instruct-8b-Merged/83958185-047a-4356-918d-2f45f273c08a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_rombos_Replete-Coder-Instruct-8b-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"rombos_Replete-Coder-Instruct-8b-Merged", - "id": "rombodawg/rombos_Replete-Coder-Instruct-8b-Merged", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Llama3-8B/d04c6e84-0b63-4de1-9278-aa37c9d2c8e3.json b/data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Llama3-8B/d04c6e84-0b63-4de1-9278-aa37c9d2c8e3.json deleted file mode 100644 index c4f4fe486..000000000 --- a/data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Llama3-8B/d04c6e84-0b63-4de1-9278-aa37c9d2c8e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_rombos_Replete-Coder-Llama3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "rombos_Replete-Coder-Llama3-8B", - "id": "rombodawg/rombos_Replete-Coder-Llama3-8B", - "developer": "rombodawg", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1335 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rootxhacker/Apollo-70B/a218e260-7f56-4676-af58-254bd84d0327.json b/data/hfopenllm_v2/rootxhacker/Apollo-70B/a218e260-7f56-4676-af58-254bd84d0327.json deleted file mode 100644 index c04a8ad40..000000000 --- a/data/hfopenllm_v2/rootxhacker/Apollo-70B/a218e260-7f56-4676-af58-254bd84d0327.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rootxhacker_Apollo-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Apollo-70B", - "id": "rootxhacker/Apollo-70B", - "developer": "rootxhacker", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5099 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6804 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4948 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5279 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rootxhacker/Apollo_v2-32B/f21fb2c8-4abe-40de-ab2c-9d23e95ee281.json b/data/hfopenllm_v2/rootxhacker/Apollo_v2-32B/f21fb2c8-4abe-40de-ab2c-9d23e95ee281.json deleted file mode 100644 index c4804ceaf..000000000 --- a/data/hfopenllm_v2/rootxhacker/Apollo_v2-32B/f21fb2c8-4abe-40de-ab2c-9d23e95ee281.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rootxhacker_Apollo_v2-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Apollo_v2-32B", - "id": "rootxhacker/Apollo_v2-32B", - "developer": "rootxhacker", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7072 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5869 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rootxhacker/apollo-7B/da5774b2-8a6f-4f2d-8267-beb25490b06a.json b/data/hfopenllm_v2/rootxhacker/apollo-7B/da5774b2-8a6f-4f2d-8267-beb25490b06a.json deleted file mode 100644 index eac1bfc10..000000000 --- a/data/hfopenllm_v2/rootxhacker/apollo-7B/da5774b2-8a6f-4f2d-8267-beb25490b06a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rootxhacker_apollo-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "apollo-7B", - "id": "rootxhacker/apollo-7B", - "developer": "rootxhacker", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3636 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1748 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rsh345/mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B/274705bd-8eb6-4863-8998-f5d67c4ac827.json b/data/hfopenllm_v2/rsh345/mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B/274705bd-8eb6-4863-8998-f5d67c4ac827.json deleted file mode 100644 index cf9929e17..000000000 --- a/data/hfopenllm_v2/rsh345/mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B/274705bd-8eb6-4863-8998-f5d67c4ac827.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rsh345_mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B", - "id": "rsh345/mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B", - "developer": "rsh345", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3892 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5188 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rubenroy/Geneva-12B-GCv2-5m/5b95cc2f-3378-45e7-9f56-6bb7e1ce4826.json b/data/hfopenllm_v2/rubenroy/Geneva-12B-GCv2-5m/5b95cc2f-3378-45e7-9f56-6bb7e1ce4826.json deleted file mode 100644 index ca4143a6b..000000000 --- a/data/hfopenllm_v2/rubenroy/Geneva-12B-GCv2-5m/5b95cc2f-3378-45e7-9f56-6bb7e1ce4826.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rubenroy_Geneva-12B-GCv2-5m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Geneva-12B-GCv2-5m", - "id": "rubenroy/Geneva-12B-GCv2-5m", - "developer": "rubenroy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2586 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3525 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rubenroy/Gilgamesh-72B/6918d1a3-e547-46b7-9062-274057c1f513.json b/data/hfopenllm_v2/rubenroy/Gilgamesh-72B/6918d1a3-e547-46b7-9062-274057c1f513.json deleted file mode 100644 index 5a7ac28c6..000000000 --- a/data/hfopenllm_v2/rubenroy/Gilgamesh-72B/6918d1a3-e547-46b7-9062-274057c1f513.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rubenroy_Gilgamesh-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gilgamesh-72B", - "id": "rubenroy/Gilgamesh-72B", - "developer": "rubenroy", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8486 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7253 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4626 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5802 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rubenroy/Zurich-14B-GCv2-5m/599deb3c-49f9-4c0b-af8d-78f9e166820b.json b/data/hfopenllm_v2/rubenroy/Zurich-14B-GCv2-5m/599deb3c-49f9-4c0b-af8d-78f9e166820b.json deleted file mode 100644 index fc4afde1a..000000000 --- a/data/hfopenllm_v2/rubenroy/Zurich-14B-GCv2-5m/599deb3c-49f9-4c0b-af8d-78f9e166820b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rubenroy_Zurich-14B-GCv2-5m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zurich-14B-GCv2-5m", - "id": "rubenroy/Zurich-14B-GCv2-5m", - "developer": "rubenroy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6164 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5233 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ruizhe1217/sft-s1-qwen-0.5b/b4ea3f14-3787-434b-8f26-20ff640c0146.json b/data/hfopenllm_v2/ruizhe1217/sft-s1-qwen-0.5b/b4ea3f14-3787-434b-8f26-20ff640c0146.json deleted file mode 100644 index 739248056..000000000 --- a/data/hfopenllm_v2/ruizhe1217/sft-s1-qwen-0.5b/b4ea3f14-3787-434b-8f26-20ff640c0146.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ruizhe1217_sft-s1-qwen-0.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sft-s1-qwen-0.5b", - "id": "ruizhe1217/sft-s1-qwen-0.5b", - "developer": "ruizhe1217", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1892 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rwitz/go-bruins-v2/6952c527-ca23-494a-910c-1c027e4a5a29.json b/data/hfopenllm_v2/rwitz/go-bruins-v2/6952c527-ca23-494a-910c-1c027e4a5a29.json deleted file mode 100644 index 9ab1d084b..000000000 --- a/data/hfopenllm_v2/rwitz/go-bruins-v2/6952c527-ca23-494a-910c-1c027e4a5a29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rwitz_go-bruins-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "go-bruins-v2", - "id": "rwitz/go-bruins-v2", - "developer": "rwitz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2761 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersaleh/Llama2-7B-CPO/3f12e79c-dd1b-428d-9094-10a047205e3e.json b/data/hfopenllm_v2/sabersaleh/Llama2-7B-CPO/3f12e79c-dd1b-428d-9094-10a047205e3e.json deleted file mode 100644 index 63c7d223e..000000000 --- a/data/hfopenllm_v2/sabersaleh/Llama2-7B-CPO/3f12e79c-dd1b-428d-9094-10a047205e3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersaleh_Llama2-7B-CPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama2-7B-CPO", - "id": "sabersaleh/Llama2-7B-CPO", - "developer": "sabersaleh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1545 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3458 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1606 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersaleh/Llama2-7B-DPO/d508da29-0288-4a0a-b727-fc5355515c5e.json 
b/data/hfopenllm_v2/sabersaleh/Llama2-7B-DPO/d508da29-0288-4a0a-b727-fc5355515c5e.json deleted file mode 100644 index 86e3966a1..000000000 --- a/data/hfopenllm_v2/sabersaleh/Llama2-7B-DPO/d508da29-0288-4a0a-b727-fc5355515c5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersaleh_Llama2-7B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama2-7B-DPO", - "id": "sabersaleh/Llama2-7B-DPO", - "developer": "sabersaleh", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1453 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1626 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersaleh/Llama2-7B-IPO/48cf5a8a-70c6-4c55-8959-32d773d6dbcf.json b/data/hfopenllm_v2/sabersaleh/Llama2-7B-IPO/48cf5a8a-70c6-4c55-8959-32d773d6dbcf.json deleted file mode 100644 index 2dd58f3e1..000000000 --- a/data/hfopenllm_v2/sabersaleh/Llama2-7B-IPO/48cf5a8a-70c6-4c55-8959-32d773d6dbcf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersaleh_Llama2-7B-IPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama2-7B-IPO", - "id": "sabersaleh/Llama2-7B-IPO", - "developer": "sabersaleh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1769 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1617 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersaleh/Llama2-7B-KTO/4bb7d331-f305-4c08-a073-87ba7b2cbde2.json b/data/hfopenllm_v2/sabersaleh/Llama2-7B-KTO/4bb7d331-f305-4c08-a073-87ba7b2cbde2.json deleted file mode 100644 index 1848858c0..000000000 --- a/data/hfopenllm_v2/sabersaleh/Llama2-7B-KTO/4bb7d331-f305-4c08-a073-87ba7b2cbde2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersaleh_Llama2-7B-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama2-7B-KTO", - "id": "sabersaleh/Llama2-7B-KTO", - "developer": "sabersaleh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4167 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1636 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersaleh/Llama2-7B-SPO/94639454-c525-4e6f-af27-d92d45a9ac40.json b/data/hfopenllm_v2/sabersaleh/Llama2-7B-SPO/94639454-c525-4e6f-af27-d92d45a9ac40.json deleted file mode 100644 index 108215f39..000000000 --- a/data/hfopenllm_v2/sabersaleh/Llama2-7B-SPO/94639454-c525-4e6f-af27-d92d45a9ac40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersaleh_Llama2-7B-SPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama2-7B-SPO", - "id": "sabersaleh/Llama2-7B-SPO", - "developer": "sabersaleh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1567 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1757 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersaleh/Llama2-7B-SimPO/9fa81bb7-7abc-4764-9465-d61217590da5.json b/data/hfopenllm_v2/sabersaleh/Llama2-7B-SimPO/9fa81bb7-7abc-4764-9465-d61217590da5.json deleted file mode 100644 index 7ee98a8ff..000000000 --- a/data/hfopenllm_v2/sabersaleh/Llama2-7B-SimPO/9fa81bb7-7abc-4764-9465-d61217590da5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersaleh_Llama2-7B-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama2-7B-SimPO", - "id": "sabersaleh/Llama2-7B-SimPO", - "developer": "sabersaleh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1659 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1641 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersaleh/Llama3/9a683492-4057-4de4-a30a-aa66becffb13.json b/data/hfopenllm_v2/sabersaleh/Llama3/9a683492-4057-4de4-a30a-aa66becffb13.json deleted file mode 100644 index 8c46cb2b7..000000000 --- a/data/hfopenllm_v2/sabersaleh/Llama3/9a683492-4057-4de4-a30a-aa66becffb13.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersaleh_Llama3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3", - "id": "sabersaleh/Llama3", - "developer": "sabersaleh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4782 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3933 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3162 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersalehk/Llama3-001-300/b917df45-62f2-4c3b-943a-ad6c98ef8bc1.json b/data/hfopenllm_v2/sabersalehk/Llama3-001-300/b917df45-62f2-4c3b-943a-ad6c98ef8bc1.json deleted file mode 100644 index aa57d205c..000000000 --- a/data/hfopenllm_v2/sabersalehk/Llama3-001-300/b917df45-62f2-4c3b-943a-ad6c98ef8bc1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersalehk_Llama3-001-300/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-001-300", - "id": "sabersalehk/Llama3-001-300", - "developer": "sabersalehk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3179 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4745 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4064 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersalehk/Llama3-SimPO/ba658bc7-b89d-4fb7-a794-f48bd3715a49.json b/data/hfopenllm_v2/sabersalehk/Llama3-SimPO/ba658bc7-b89d-4fb7-a794-f48bd3715a49.json deleted file mode 100644 index ec3a89b1c..000000000 --- a/data/hfopenllm_v2/sabersalehk/Llama3-SimPO/ba658bc7-b89d-4fb7-a794-f48bd3715a49.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersalehk_Llama3-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-SimPO", - "id": "sabersalehk/Llama3-SimPO", - "developer": "sabersalehk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3642 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4874 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4046 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersalehk/Llama3_001_200/93f79cdc-ffd7-4299-9876-c0c7bed55ae5.json b/data/hfopenllm_v2/sabersalehk/Llama3_001_200/93f79cdc-ffd7-4299-9876-c0c7bed55ae5.json deleted file mode 100644 index 71b6274f4..000000000 --- a/data/hfopenllm_v2/sabersalehk/Llama3_001_200/93f79cdc-ffd7-4299-9876-c0c7bed55ae5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersalehk_Llama3_001_200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3_001_200", - "id": "sabersalehk/Llama3_001_200", - "developer": "sabersalehk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4037 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3183 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersalehk/Llama3_01_300/5a91b0bf-b043-41d2-960d-5f0e78abc400.json b/data/hfopenllm_v2/sabersalehk/Llama3_01_300/5a91b0bf-b043-41d2-960d-5f0e78abc400.json deleted file mode 100644 index 5ecbad865..000000000 --- a/data/hfopenllm_v2/sabersalehk/Llama3_01_300/5a91b0bf-b043-41d2-960d-5f0e78abc400.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersalehk_Llama3_01_300/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3_01_300", - "id": "sabersalehk/Llama3_01_300", - "developer": "sabersalehk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2959 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4691 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4065 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/saishf/Fimbulvetr-Kuro-Lotus-10.7B/263f56e5-b578-475a-9bc4-b5ffc142f9e2.json b/data/hfopenllm_v2/saishf/Fimbulvetr-Kuro-Lotus-10.7B/263f56e5-b578-475a-9bc4-b5ffc142f9e2.json deleted file mode 100644 index 603e3b142..000000000 --- a/data/hfopenllm_v2/saishf/Fimbulvetr-Kuro-Lotus-10.7B/263f56e5-b578-475a-9bc4-b5ffc142f9e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/saishf_Fimbulvetr-Kuro-Lotus-10.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fimbulvetr-Kuro-Lotus-10.7B", - "id": "saishf/Fimbulvetr-Kuro-Lotus-10.7B", - "developer": "saishf", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4939 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4342 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/saishf/Neural-SOVLish-Devil-8B-L3/9219ff66-73ba-45d8-99a0-23d23b3555ba.json b/data/hfopenllm_v2/saishf/Neural-SOVLish-Devil-8B-L3/9219ff66-73ba-45d8-99a0-23d23b3555ba.json deleted file mode 100644 index 11c0ba6c8..000000000 --- a/data/hfopenllm_v2/saishf/Neural-SOVLish-Devil-8B-L3/9219ff66-73ba-45d8-99a0-23d23b3555ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/saishf_Neural-SOVLish-Devil-8B-L3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neural-SOVLish-Devil-8B-L3", - "id": "saishf/Neural-SOVLish-Devil-8B-L3", - "developer": "saishf", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/saishshinde15/TethysAI_Base_Reasoning/b2328396-e9b2-464d-94e4-f03db19144ea.json b/data/hfopenllm_v2/saishshinde15/TethysAI_Base_Reasoning/b2328396-e9b2-464d-94e4-f03db19144ea.json deleted file mode 100644 index 75ae575a6..000000000 --- a/data/hfopenllm_v2/saishshinde15/TethysAI_Base_Reasoning/b2328396-e9b2-464d-94e4-f03db19144ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/saishshinde15_TethysAI_Base_Reasoning/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TethysAI_Base_Reasoning", - "id": "saishshinde15/TethysAI_Base_Reasoning", - "developer": "saishshinde15", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6369 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4075 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.3236 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/saishshinde15/TethysAI_Vortex/3f895edf-8f54-48ff-a731-666144af0fda.json b/data/hfopenllm_v2/saishshinde15/TethysAI_Vortex/3f895edf-8f54-48ff-a731-666144af0fda.json deleted file mode 100644 index ce5a5eb6e..000000000 --- a/data/hfopenllm_v2/saishshinde15/TethysAI_Vortex/3f895edf-8f54-48ff-a731-666144af0fda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/saishshinde15_TethysAI_Vortex/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TethysAI_Vortex", - "id": "saishshinde15/TethysAI_Vortex", - "developer": "saishshinde15", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4749 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4458 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3241 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/saishshinde15/TethysAI_Vortex_Reasoning/b48b8e16-a555-466b-8b1c-246137223311.json 
b/data/hfopenllm_v2/saishshinde15/TethysAI_Vortex_Reasoning/b48b8e16-a555-466b-8b1c-246137223311.json deleted file mode 100644 index 5425463aa..000000000 --- a/data/hfopenllm_v2/saishshinde15/TethysAI_Vortex_Reasoning/b48b8e16-a555-466b-8b1c-246137223311.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/saishshinde15_TethysAI_Vortex_Reasoning/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TethysAI_Vortex_Reasoning", - "id": "saishshinde15/TethysAI_Vortex_Reasoning", - "developer": "saishshinde15", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4694 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sakaltcommunity/novablast-preview/5fdcb98f-4c50-4cdb-bd99-dd32efc6d6f3.json b/data/hfopenllm_v2/sakaltcommunity/novablast-preview/5fdcb98f-4c50-4cdb-bd99-dd32efc6d6f3.json deleted file mode 100644 index 899a6a5a1..000000000 --- 
a/data/hfopenllm_v2/sakaltcommunity/novablast-preview/5fdcb98f-4c50-4cdb-bd99-dd32efc6d6f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sakaltcommunity_novablast-preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "novablast-preview", - "id": "sakaltcommunity/novablast-preview", - "developer": "sakaltcommunity", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.453 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7043 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4894 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5915 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sakaltcommunity/sakaltum-7b/d49c5e72-0dd0-4663-a310-9cd9bf1f5150.json b/data/hfopenllm_v2/sakaltcommunity/sakaltum-7b/d49c5e72-0dd0-4663-a310-9cd9bf1f5150.json deleted file mode 100644 index 3cafb3d05..000000000 --- a/data/hfopenllm_v2/sakaltcommunity/sakaltum-7b/d49c5e72-0dd0-4663-a310-9cd9bf1f5150.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sakaltcommunity_sakaltum-7b/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sakaltum-7b", - "id": "sakaltcommunity/sakaltum-7b", - "developer": "sakaltcommunity", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2769 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sakhan10/quantized_open_llama_3b_v2/0176903f-e6ca-4f21-b98a-00bc443bf244.json b/data/hfopenllm_v2/sakhan10/quantized_open_llama_3b_v2/0176903f-e6ca-4f21-b98a-00bc443bf244.json deleted file mode 100644 index 19cf5433b..000000000 --- a/data/hfopenllm_v2/sakhan10/quantized_open_llama_3b_v2/0176903f-e6ca-4f21-b98a-00bc443bf244.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sakhan10_quantized_open_llama_3b_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - 
}, - "model_info": { - "name": "quantized_open_llama_3b_v2", - "id": "sakhan10/quantized_open_llama_3b_v2", - "developer": "sakhan10", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1872 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.0/11f32afc-95c1-4531-ae45-5a0974d36b3a.json b/data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.0/11f32afc-95c1-4531-ae45-5a0974d36b3a.json deleted file mode 100644 index 30efb3c9e..000000000 --- a/data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.0/11f32afc-95c1-4531-ae45-5a0974d36b3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/saltlux_luxia-21.4b-alignment-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "luxia-21.4b-alignment-v1.0", - "id": "saltlux/luxia-21.4b-alignment-v1.0", - "developer": "saltlux", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - 
"architecture": "LlamaForCausalLM", - "params_billions": 21.421 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6373 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3403 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.2/70657dd7-63cf-40f4-92a0-1097fc1ce9ae.json b/data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.2/70657dd7-63cf-40f4-92a0-1097fc1ce9ae.json deleted file mode 100644 index 1f660f4e7..000000000 --- a/data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.2/70657dd7-63cf-40f4-92a0-1097fc1ce9ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/saltlux_luxia-21.4b-alignment-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "luxia-21.4b-alignment-v1.2", - "id": "saltlux/luxia-21.4b-alignment-v1.2", - "developer": "saltlux", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.421 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4115 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6371 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sam-paech/Darkest-muse-v1/53cf325b-6f32-4791-8f95-8b982ea03b23.json b/data/hfopenllm_v2/sam-paech/Darkest-muse-v1/53cf325b-6f32-4791-8f95-8b982ea03b23.json deleted file mode 100644 index 3a8e6c998..000000000 --- a/data/hfopenllm_v2/sam-paech/Darkest-muse-v1/53cf325b-6f32-4791-8f95-8b982ea03b23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sam-paech_Darkest-muse-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Darkest-muse-v1", - "id": "sam-paech/Darkest-muse-v1", - "developer": "sam-paech", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7344 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sam-paech/Delirium-v1/8c50491b-6ed4-4f38-9d3f-d5168600cf4f.json b/data/hfopenllm_v2/sam-paech/Delirium-v1/8c50491b-6ed4-4f38-9d3f-d5168600cf4f.json deleted file mode 100644 index 2fdabfe78..000000000 --- a/data/hfopenllm_v2/sam-paech/Delirium-v1/8c50491b-6ed4-4f38-9d3f-d5168600cf4f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sam-paech_Delirium-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Delirium-v1", - "id": "sam-paech/Delirium-v1", - "developer": "sam-paech", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.419 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sam-paech/Quill-v1/7adf79de-a51d-4b87-989a-c218ec6d99e3.json b/data/hfopenllm_v2/sam-paech/Quill-v1/7adf79de-a51d-4b87-989a-c218ec6d99e3.json deleted file mode 100644 index cd6582f9b..000000000 --- a/data/hfopenllm_v2/sam-paech/Quill-v1/7adf79de-a51d-4b87-989a-c218ec6d99e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sam-paech_Quill-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Quill-v1", - "id": "sam-paech/Quill-v1", - "developer": "sam-paech", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7122 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match 
on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2122 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sarvamai/OpenHathi-7B-Hi-v0.1-Base/92358e5a-5e73-4747-9e92-e5ac003b97f7.json b/data/hfopenllm_v2/sarvamai/OpenHathi-7B-Hi-v0.1-Base/92358e5a-5e73-4747-9e92-e5ac003b97f7.json deleted file mode 100644 index 10e232e86..000000000 --- a/data/hfopenllm_v2/sarvamai/OpenHathi-7B-Hi-v0.1-Base/92358e5a-5e73-4747-9e92-e5ac003b97f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sarvamai_OpenHathi-7B-Hi-v0.1-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenHathi-7B-Hi-v0.1-Base", - "id": "sarvamai/OpenHathi-7B-Hi-v0.1-Base", - "developer": "sarvamai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.87 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1804 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": 
{ - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/schnapss/testmerge-7b/f1636512-b98f-4fe4-adf3-abd556dd0ab9.json b/data/hfopenllm_v2/schnapss/testmerge-7b/f1636512-b98f-4fe4-adf3-abd556dd0ab9.json deleted file mode 100644 index 5594f41d4..000000000 --- a/data/hfopenllm_v2/schnapss/testmerge-7b/f1636512-b98f-4fe4-adf3-abd556dd0ab9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/schnapss_testmerge-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "testmerge-7b", - "id": "schnapss/testmerge-7b", - "developer": "schnapss", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5187 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sci-m-wang/Mistral-7B-Instruct-sa-v0.1/9333afdd-4866-412b-b11b-dfb118a06db9.json b/data/hfopenllm_v2/sci-m-wang/Mistral-7B-Instruct-sa-v0.1/9333afdd-4866-412b-b11b-dfb118a06db9.json deleted file mode 100644 index 792d4db80..000000000 --- a/data/hfopenllm_v2/sci-m-wang/Mistral-7B-Instruct-sa-v0.1/9333afdd-4866-412b-b11b-dfb118a06db9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sci-m-wang_Mistral-7B-Instruct-sa-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-sa-v0.1", - "id": "sci-m-wang/Mistral-7B-Instruct-sa-v0.1", - "developer": "sci-m-wang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 14.483 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4335 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3273 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2362 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sci-m-wang/Phi-3-mini-4k-instruct-sa-v0.1/840c0e19-6d75-47a2-b64b-f9c51cb1dcff.json b/data/hfopenllm_v2/sci-m-wang/Phi-3-mini-4k-instruct-sa-v0.1/840c0e19-6d75-47a2-b64b-f9c51cb1dcff.json deleted file mode 100644 index 4019e2d95..000000000 --- a/data/hfopenllm_v2/sci-m-wang/Phi-3-mini-4k-instruct-sa-v0.1/840c0e19-6d75-47a2-b64b-f9c51cb1dcff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sci-m-wang_Phi-3-mini-4k-instruct-sa-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-mini-4k-instruct-sa-v0.1", - "id": "sci-m-wang/Phi-3-mini-4k-instruct-sa-v0.1", - "developer": "sci-m-wang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 7.642 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5502 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - 
}, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3985 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sci-m-wang/deepseek-llm-7b-chat-sa-v0.1/071b49f2-8e23-47b1-9858-78d676d9905e.json b/data/hfopenllm_v2/sci-m-wang/deepseek-llm-7b-chat-sa-v0.1/071b49f2-8e23-47b1-9858-78d676d9905e.json deleted file mode 100644 index ea4d21a1e..000000000 --- a/data/hfopenllm_v2/sci-m-wang/deepseek-llm-7b-chat-sa-v0.1/071b49f2-8e23-47b1-9858-78d676d9905e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sci-m-wang_deepseek-llm-7b-chat-sa-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-llm-7b-chat-sa-v0.1", - "id": "sci-m-wang/deepseek-llm-7b-chat-sa-v0.1", - "developer": "sci-m-wang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4036 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3718 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2209 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/securin/Securin-LLM-V2.5-Qwen-1.5B/d3821f53-87aa-470a-a403-c8e3cd100ae1.json b/data/hfopenllm_v2/securin/Securin-LLM-V2.5-Qwen-1.5B/d3821f53-87aa-470a-a403-c8e3cd100ae1.json deleted file mode 100644 index 4a45da063..000000000 --- a/data/hfopenllm_v2/securin/Securin-LLM-V2.5-Qwen-1.5B/d3821f53-87aa-470a-a403-c8e3cd100ae1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/securin_Securin-LLM-V2.5-Qwen-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Securin-LLM-V2.5-Qwen-1.5B", - "id": "securin/Securin-LLM-V2.5-Qwen-1.5B", - "developer": "securin", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1492 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3158 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3606 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1615 - } - } - ] -} \ No newline 
at end of file diff --git a/data/hfopenllm_v2/senseable/WestLake-7B-v2/389dbaba-c9cd-4e6b-afb3-f2ee3951faa0.json b/data/hfopenllm_v2/senseable/WestLake-7B-v2/389dbaba-c9cd-4e6b-afb3-f2ee3951faa0.json deleted file mode 100644 index 60607b3dd..000000000 --- a/data/hfopenllm_v2/senseable/WestLake-7B-v2/389dbaba-c9cd-4e6b-afb3-f2ee3951faa0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/senseable_WestLake-7B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WestLake-7B-v2", - "id": "senseable/WestLake-7B-v2", - "developer": "senseable", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4419 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sequelbox/Llama3.1-70B-PlumChat/5f78f39a-42cc-4cf6-bb27-e2160765bf24.json b/data/hfopenllm_v2/sequelbox/Llama3.1-70B-PlumChat/5f78f39a-42cc-4cf6-bb27-e2160765bf24.json deleted file mode 100644 index e8f20a830..000000000 --- 
a/data/hfopenllm_v2/sequelbox/Llama3.1-70B-PlumChat/5f78f39a-42cc-4cf6-bb27-e2160765bf24.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sequelbox_Llama3.1-70B-PlumChat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-70B-PlumChat", - "id": "sequelbox/Llama3.1-70B-PlumChat", - "developer": "sequelbox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5616 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6753 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4774 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-MOTH/b6e3d811-bf9d-474e-b82d-358a44e0dfc9.json b/data/hfopenllm_v2/sequelbox/Llama3.1-8B-MOTH/b6e3d811-bf9d-474e-b82d-358a44e0dfc9.json deleted file mode 100644 index fa67f2014..000000000 --- a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-MOTH/b6e3d811-bf9d-474e-b82d-358a44e0dfc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sequelbox_Llama3.1-8B-MOTH/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-MOTH", - "id": "sequelbox/Llama3.1-8B-MOTH", - "developer": "sequelbox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3689 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumChat/bef1cbad-4f75-4dde-b467-6145f72a87f4.json b/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumChat/bef1cbad-4f75-4dde-b467-6145f72a87f4.json deleted file mode 100644 index 7e037a4de..000000000 --- a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumChat/bef1cbad-4f75-4dde-b467-6145f72a87f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sequelbox_Llama3.1-8B-PlumChat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "Llama3.1-8B-PlumChat", - "id": "sequelbox/Llama3.1-8B-PlumChat", - "developer": "sequelbox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3755 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumCode/654bebe0-b461-427e-a4cf-06386e9272d8.json b/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumCode/654bebe0-b461-427e-a4cf-06386e9272d8.json deleted file mode 100644 index cef3dea6d..000000000 --- a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumCode/654bebe0-b461-427e-a4cf-06386e9272d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sequelbox_Llama3.1-8B-PlumCode/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-PlumCode", - "id": "sequelbox/Llama3.1-8B-PlumCode", - "developer": "sequelbox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3773 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2335 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumMath/37ef4e34-58f8-463a-950f-48b3a6833d54.json b/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumMath/37ef4e34-58f8-463a-950f-48b3a6833d54.json deleted file mode 100644 index 553cddc4a..000000000 --- a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumMath/37ef4e34-58f8-463a-950f-48b3a6833d54.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sequelbox_Llama3.1-8B-PlumMath/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-PlumMath", - "id": "sequelbox/Llama3.1-8B-PlumMath", - "developer": "sequelbox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2242 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3919 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2975 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sequelbox/gemma-2-9B-MOTH/20687086-8aab-40f1-aec6-03917f4f9bf5.json b/data/hfopenllm_v2/sequelbox/gemma-2-9B-MOTH/20687086-8aab-40f1-aec6-03917f4f9bf5.json deleted file mode 100644 index 1c49b7861..000000000 --- a/data/hfopenllm_v2/sequelbox/gemma-2-9B-MOTH/20687086-8aab-40f1-aec6-03917f4f9bf5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sequelbox_gemma-2-9B-MOTH/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9B-MOTH", - "id": "sequelbox/gemma-2-9B-MOTH", - "developer": "sequelbox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2059 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": 
"BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1206-Instruct/53a0a998-a0a6-4800-80bf-bfd83123f2f6.json b/data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1206-Instruct/53a0a998-a0a6-4800-80bf-bfd83123f2f6.json deleted file mode 100644 index 4beac0c7e..000000000 --- a/data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1206-Instruct/53a0a998-a0a6-4800-80bf-bfd83123f2f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sethuiyer_Llama-3.1-8B-Experimental-1206-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Experimental-1206-Instruct", - "id": "sethuiyer/Llama-3.1-8B-Experimental-1206-Instruct", - "developer": "sethuiyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6967 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5104 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1208-Instruct/4ee8df1c-e8ff-4a56-816c-0c2258a226e7.json b/data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1208-Instruct/4ee8df1c-e8ff-4a56-816c-0c2258a226e7.json deleted file mode 100644 index a0e1dafd6..000000000 --- a/data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1208-Instruct/4ee8df1c-e8ff-4a56-816c-0c2258a226e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sethuiyer_Llama-3.1-8B-Experimental-1208-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Experimental-1208-Instruct", - "id": "sethuiyer/Llama-3.1-8B-Experimental-1208-Instruct", - "developer": "sethuiyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4964 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3511 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sethuiyer/LlamaZero-3.1-8B-Experimental-1208/42c8d84d-c8b8-42c6-8f49-4e971df173d7.json b/data/hfopenllm_v2/sethuiyer/LlamaZero-3.1-8B-Experimental-1208/42c8d84d-c8b8-42c6-8f49-4e971df173d7.json deleted file mode 100644 index 27df7dbc2..000000000 --- a/data/hfopenllm_v2/sethuiyer/LlamaZero-3.1-8B-Experimental-1208/42c8d84d-c8b8-42c6-8f49-4e971df173d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sethuiyer_LlamaZero-3.1-8B-Experimental-1208/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LlamaZero-3.1-8B-Experimental-1208", - "id": "sethuiyer/LlamaZero-3.1-8B-Experimental-1208", - "developer": "sethuiyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6051 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4981 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sethuiyer/Llamaverse-3.1-8B-Instruct/77b57dea-22e1-48a6-b8ae-9e474f08ad5f.json b/data/hfopenllm_v2/sethuiyer/Llamaverse-3.1-8B-Instruct/77b57dea-22e1-48a6-b8ae-9e474f08ad5f.json deleted file mode 100644 index 38d270dc1..000000000 --- a/data/hfopenllm_v2/sethuiyer/Llamaverse-3.1-8B-Instruct/77b57dea-22e1-48a6-b8ae-9e474f08ad5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sethuiyer_Llamaverse-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llamaverse-3.1-8B-Instruct", - "id": "sethuiyer/Llamaverse-3.1-8B-Instruct", - "developer": "sethuiyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6185 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5414 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sethuiyer/Llamazing-3.1-8B-Instruct/a9ed5d04-57d2-4566-91df-b798be939fdb.json b/data/hfopenllm_v2/sethuiyer/Llamazing-3.1-8B-Instruct/a9ed5d04-57d2-4566-91df-b798be939fdb.json deleted file mode 100644 index f6ecfe05e..000000000 --- a/data/hfopenllm_v2/sethuiyer/Llamazing-3.1-8B-Instruct/a9ed5d04-57d2-4566-91df-b798be939fdb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sethuiyer_Llamazing-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llamazing-3.1-8B-Instruct", - "id": "sethuiyer/Llamazing-3.1-8B-Instruct", - "developer": "sethuiyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5711 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5291 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3976 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3606 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sethuiyer/Qwen2.5-7B-Anvita/bad4ec47-fe84-4518-b072-6955938f0c86.json b/data/hfopenllm_v2/sethuiyer/Qwen2.5-7B-Anvita/bad4ec47-fe84-4518-b072-6955938f0c86.json deleted file mode 100644 index de9d6f292..000000000 --- a/data/hfopenllm_v2/sethuiyer/Qwen2.5-7B-Anvita/bad4ec47-fe84-4518-b072-6955938f0c86.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sethuiyer_Qwen2.5-7B-Anvita/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Anvita", - "id": "sethuiyer/Qwen2.5-7B-Anvita", - "developer": "sethuiyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.648 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4337 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/shadowml/BeagSake-7B/497e585c-059a-4e18-9a8f-bdaa066f59ea.json b/data/hfopenllm_v2/shadowml/BeagSake-7B/497e585c-059a-4e18-9a8f-bdaa066f59ea.json deleted file mode 100644 index 3c7a54074..000000000 --- a/data/hfopenllm_v2/shadowml/BeagSake-7B/497e585c-059a-4e18-9a8f-bdaa066f59ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shadowml_BeagSake-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BeagSake-7B", - "id": "shadowml/BeagSake-7B", - "developer": "shadowml", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5216 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4711 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2585 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/shadowml/Mixolar-4x7b/e24b2a4e-83e4-4a79-bc41-03a54af00595.json b/data/hfopenllm_v2/shadowml/Mixolar-4x7b/e24b2a4e-83e4-4a79-bc41-03a54af00595.json deleted file mode 100644 index 193aa7115..000000000 --- a/data/hfopenllm_v2/shadowml/Mixolar-4x7b/e24b2a4e-83e4-4a79-bc41-03a54af00595.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shadowml_Mixolar-4x7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixolar-4x7b", - "id": "shadowml/Mixolar-4x7b", - "developer": "shadowml", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 36.099 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5216 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4258 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/shastraai/Shastra-LLAMA2-Math-Commonsense-SFT/15e39361-585b-4870-b91a-64dce4fb37ec.json b/data/hfopenllm_v2/shastraai/Shastra-LLAMA2-Math-Commonsense-SFT/15e39361-585b-4870-b91a-64dce4fb37ec.json deleted file mode 100644 index 015b23d62..000000000 --- a/data/hfopenllm_v2/shastraai/Shastra-LLAMA2-Math-Commonsense-SFT/15e39361-585b-4870-b91a-64dce4fb37ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shastraai_Shastra-LLAMA2-Math-Commonsense-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Shastra-LLAMA2-Math-Commonsense-SFT", - "id": "shastraai/Shastra-LLAMA2-Math-Commonsense-SFT", - "developer": "shastraai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3042 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1997 - } - } - ] 
-} \ No newline at end of file diff --git a/data/hfopenllm_v2/shivam9980/NEPALI-LLM/96efd11b-e9f2-4bf1-90f9-561714137edf.json b/data/hfopenllm_v2/shivam9980/NEPALI-LLM/96efd11b-e9f2-4bf1-90f9-561714137edf.json deleted file mode 100644 index 60b5094a5..000000000 --- a/data/hfopenllm_v2/shivam9980/NEPALI-LLM/96efd11b-e9f2-4bf1-90f9-561714137edf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shivam9980_NEPALI-LLM/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NEPALI-LLM", - "id": "shivam9980/NEPALI-LLM", - "developer": "shivam9980", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.273 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0417 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3828 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4122 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2064 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/shivam9980/mistral-7b-news-cnn-merged/98e9936d-d376-4c72-80a6-0a28cf722ac4.json b/data/hfopenllm_v2/shivam9980/mistral-7b-news-cnn-merged/98e9936d-d376-4c72-80a6-0a28cf722ac4.json deleted file mode 100644 index aa2c40613..000000000 
--- a/data/hfopenllm_v2/shivam9980/mistral-7b-news-cnn-merged/98e9936d-d376-4c72-80a6-0a28cf722ac4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shivam9980_mistral-7b-news-cnn-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-7b-news-cnn-merged", - "id": "shivam9980/mistral-7b-news-cnn-merged", - "developer": "shivam9980", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 7.723 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4634 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4523 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/shivank21/mistral_dpo_self/7ada9c83-7851-4da2-b9d1-d744b174b777.json b/data/hfopenllm_v2/shivank21/mistral_dpo_self/7ada9c83-7851-4da2-b9d1-d744b174b777.json deleted file mode 100644 index ab32e42b6..000000000 --- a/data/hfopenllm_v2/shivank21/mistral_dpo_self/7ada9c83-7851-4da2-b9d1-d744b174b777.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shivank21_mistral_dpo_self/1770682486.623709", 
- "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral_dpo_self", - "id": "shivank21/mistral_dpo_self", - "developer": "shivank21", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "", - "params_billions": 7.913 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3216 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2214 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/shuttleai/shuttle-3/a6ed72b7-14f1-464c-a7f5-590791982696.json b/data/hfopenllm_v2/shuttleai/shuttle-3/a6ed72b7-14f1-464c-a7f5-590791982696.json deleted file mode 100644 index b5153a72b..000000000 --- a/data/hfopenllm_v2/shuttleai/shuttle-3/a6ed72b7-14f1-464c-a7f5-590791982696.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shuttleai_shuttle-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "shuttle-3", - "id": "shuttleai/shuttle-3", - 
"developer": "shuttleai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8154 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4377 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5716 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/shyamieee/Padma-v7.0/79e3f38d-ae2b-44a7-be0d-024adad6bcd6.json b/data/hfopenllm_v2/shyamieee/Padma-v7.0/79e3f38d-ae2b-44a7-be0d-024adad6bcd6.json deleted file mode 100644 index 17b7fc494..000000000 --- a/data/hfopenllm_v2/shyamieee/Padma-v7.0/79e3f38d-ae2b-44a7-be0d-024adad6bcd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shyamieee_Padma-v7.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Padma-v7.0", - "id": "shyamieee/Padma-v7.0", - "developer": "shyamieee", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3841 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5119 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/silma-ai/SILMA-9B-Instruct-v1.0/ef13bdea-cf73-4ead-b6d7-73a155fa9a79.json b/data/hfopenllm_v2/silma-ai/SILMA-9B-Instruct-v1.0/ef13bdea-cf73-4ead-b6d7-73a155fa9a79.json deleted file mode 100644 index d7f107192..000000000 --- a/data/hfopenllm_v2/silma-ai/SILMA-9B-Instruct-v1.0/ef13bdea-cf73-4ead-b6d7-73a155fa9a79.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/silma-ai_SILMA-9B-Instruct-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SILMA-9B-Instruct-v1.0", - "id": "silma-ai/SILMA-9B-Instruct-v1.0", - "developer": "silma-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.5842 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/silma-ai/SILMA-Kashif-2B-Instruct-v1.0/2663884f-941c-4e16-8029-b38e3a543733.json b/data/hfopenllm_v2/silma-ai/SILMA-Kashif-2B-Instruct-v1.0/2663884f-941c-4e16-8029-b38e3a543733.json deleted file mode 100644 index 31c0700a3..000000000 --- a/data/hfopenllm_v2/silma-ai/SILMA-Kashif-2B-Instruct-v1.0/2663884f-941c-4e16-8029-b38e3a543733.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/silma-ai_SILMA-Kashif-2B-Instruct-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SILMA-Kashif-2B-Instruct-v1.0", - "id": "silma-ai/SILMA-Kashif-2B-Instruct-v1.0", - "developer": "silma-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1181 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/siqi00/Mistral-7B-DFT/ca7af645-4796-4b31-ae7d-2cbebe5a369b.json b/data/hfopenllm_v2/siqi00/Mistral-7B-DFT/ca7af645-4796-4b31-ae7d-2cbebe5a369b.json deleted file mode 100644 index 6c0b09631..000000000 --- a/data/hfopenllm_v2/siqi00/Mistral-7B-DFT/ca7af645-4796-4b31-ae7d-2cbebe5a369b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/siqi00_Mistral-7B-DFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-DFT", - "id": "siqi00/Mistral-7B-DFT", - "developer": "siqi00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5569 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4665 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2963 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/siqi00/Mistral-7B-DFT2/f95e098c-d320-4db1-887d-8c3252bbaf77.json b/data/hfopenllm_v2/siqi00/Mistral-7B-DFT2/f95e098c-d320-4db1-887d-8c3252bbaf77.json deleted file mode 100644 index e31ef75fe..000000000 --- a/data/hfopenllm_v2/siqi00/Mistral-7B-DFT2/f95e098c-d320-4db1-887d-8c3252bbaf77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/siqi00_Mistral-7B-DFT2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-DFT2", - "id": "siqi00/Mistral-7B-DFT2", - "developer": "siqi00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5804 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/skumar9/Llama-medx_v2/2bbf6dc9-8dd5-4dee-908e-d4a8fc03bc84.json b/data/hfopenllm_v2/skumar9/Llama-medx_v2/2bbf6dc9-8dd5-4dee-908e-d4a8fc03bc84.json deleted file mode 100644 index 675a3eeee..000000000 --- a/data/hfopenllm_v2/skumar9/Llama-medx_v2/2bbf6dc9-8dd5-4dee-908e-d4a8fc03bc84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/skumar9_Llama-medx_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-medx_v2", - "id": "skumar9/Llama-medx_v2", - "developer": "skumar9", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4909 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3463 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/skymizer/Llama2-7b-sft-chat-custom-template-dpo/5f4edfdb-a62c-4410-83a3-1ceb15d2e7b0.json b/data/hfopenllm_v2/skymizer/Llama2-7b-sft-chat-custom-template-dpo/5f4edfdb-a62c-4410-83a3-1ceb15d2e7b0.json deleted file mode 100644 index 69c7e0621..000000000 --- a/data/hfopenllm_v2/skymizer/Llama2-7b-sft-chat-custom-template-dpo/5f4edfdb-a62c-4410-83a3-1ceb15d2e7b0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/skymizer_Llama2-7b-sft-chat-custom-template-dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama2-7b-sft-chat-custom-template-dpo", - "id": "skymizer/Llama2-7b-sft-chat-custom-template-dpo", - "developer": "skymizer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2353 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 
- } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/someon98/qwen-CoMa-0.5b/aadfae06-73b6-4306-b056-0a733b9bd8f4.json b/data/hfopenllm_v2/someon98/qwen-CoMa-0.5b/aadfae06-73b6-4306-b056-0a733b9bd8f4.json deleted file mode 100644 index 5b36f35c5..000000000 --- a/data/hfopenllm_v2/someon98/qwen-CoMa-0.5b/aadfae06-73b6-4306-b056-0a733b9bd8f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/someon98_qwen-CoMa-0.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-CoMa-0.5b", - "id": "someon98/qwen-CoMa-0.5b", - "developer": "someon98", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4046 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1099 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/ChocoTrio-14B-v1/cfecbfbc-46c3-4dd3-8bd9-afe4cd386973.json b/data/hfopenllm_v2/sometimesanotion/ChocoTrio-14B-v1/cfecbfbc-46c3-4dd3-8bd9-afe4cd386973.json deleted file mode 100644 index 9b58091a6..000000000 --- a/data/hfopenllm_v2/sometimesanotion/ChocoTrio-14B-v1/cfecbfbc-46c3-4dd3-8bd9-afe4cd386973.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_ChocoTrio-14B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChocoTrio-14B-v1", - "id": "sometimesanotion/ChocoTrio-14B-v1", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7089 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3973 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4821 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-40/97640dd1-d415-4b56-818c-cdcede3c52fd.json b/data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-40/97640dd1-d415-4b56-818c-cdcede3c52fd.json deleted file mode 100644 index 3e9ffdabd..000000000 --- a/data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-40/97640dd1-d415-4b56-818c-cdcede3c52fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_IF-reasoning-experiment-40/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IF-reasoning-experiment-40", - "id": "sometimesanotion/IF-reasoning-experiment-40", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.633 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5025 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-80/b750c460-ef70-4abf-b77d-118a82039598.json b/data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-80/b750c460-ef70-4abf-b77d-118a82039598.json deleted file mode 100644 index f511af28e..000000000 --- a/data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-80/b750c460-ef70-4abf-b77d-118a82039598.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_IF-reasoning-experiment-80/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IF-reasoning-experiment-80", - "id": "sometimesanotion/IF-reasoning-experiment-80", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.383 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5463 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5025 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/sometimesanotion/KytheraMix-7B-v0.2/f4c20519-9e33-4698-a17a-07e5fe7d2707.json b/data/hfopenllm_v2/sometimesanotion/KytheraMix-7B-v0.2/f4c20519-9e33-4698-a17a-07e5fe7d2707.json deleted file mode 100644 index 99248458d..000000000 --- a/data/hfopenllm_v2/sometimesanotion/KytheraMix-7B-v0.2/f4c20519-9e33-4698-a17a-07e5fe7d2707.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_KytheraMix-7B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KytheraMix-7B-v0.2", - "id": "sometimesanotion/KytheraMix-7B-v0.2", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6129 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.1-experimental/0f204733-55b4-4c06-bd12-dbc2e2593abd.json 
b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.1-experimental/0f204733-55b4-4c06-bd12-dbc2e2593abd.json deleted file mode 100644 index f40410446..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.1-experimental/0f204733-55b4-4c06-bd12-dbc2e2593abd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.1-experimental/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.1-experimental", - "id": "sometimesanotion/Lamarck-14B-v0.1-experimental", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6583 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.3/0bb226ed-fe88-4678-9b50-f77883ceb708.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.3/0bb226ed-fe88-4678-9b50-f77883ceb708.json deleted file mode 100644 index 27f7815dc..000000000 --- 
a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.3/0bb226ed-fe88-4678-9b50-f77883ceb708.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.3", - "id": "sometimesanotion/Lamarck-14B-v0.3", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5032 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6611 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.4-Qwenvergence/fb297e45-9e14-4853-8384-75c187b28a9b.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.4-Qwenvergence/fb297e45-9e14-4853-8384-75c187b28a9b.json deleted file mode 100644 index 96418c26f..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.4-Qwenvergence/fb297e45-9e14-4853-8384-75c187b28a9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.4-Qwenvergence/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.4-Qwenvergence", - "id": "sometimesanotion/Lamarck-14B-v0.4-Qwenvergence", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4906 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6535 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3399 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-002-model_stock/4f6eba27-2ab4-4b33-9568-814d15fbd6b9.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-002-model_stock/4f6eba27-2ab4-4b33-9568-814d15fbd6b9.json deleted file mode 100644 index d6ddf3a3c..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-002-model_stock/4f6eba27-2ab4-4b33-9568-814d15fbd6b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.6-002-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.6-002-model_stock", - "id": "sometimesanotion/Lamarck-14B-v0.6-002-model_stock", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6692 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5054 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-model_stock/c3bc3d69-a987-4dd0-b6a5-e0ecc50034fb.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-model_stock/c3bc3d69-a987-4dd0-b6a5-e0ecc50034fb.json deleted file mode 100644 index 509a05a43..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-model_stock/c3bc3d69-a987-4dd0-b6a5-e0ecc50034fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.6-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.6-model_stock", - "id": "sometimesanotion/Lamarck-14B-v0.6-model_stock", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6269 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5007 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5198 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6/5d02ba78-cf8b-44ee-a1b3-e51ecf437d89.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6/5d02ba78-cf8b-44ee-a1b3-e51ecf437d89.json deleted file mode 100644 index 0f6ebd239..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6/5d02ba78-cf8b-44ee-a1b3-e51ecf437d89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.6", - "id": "sometimesanotion/Lamarck-14B-v0.6", - "developer": "sometimesanotion", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6973 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.646 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4041 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-Fusion/4a43fa67-2438-4c2a-b17b-9d2f221e5a86.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-Fusion/4a43fa67-2438-4c2a-b17b-9d2f221e5a86.json deleted file mode 100644 index 1e00d79b8..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-Fusion/4a43fa67-2438-4c2a-b17b-9d2f221e5a86.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.7-Fusion/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.7-Fusion", - "id": "sometimesanotion/Lamarck-14B-v0.7-Fusion", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4041 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.401 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc1/2c044767-1169-48c6-9e37-e9d1e35f4cfe.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc1/2c044767-1169-48c6-9e37-e9d1e35f4cfe.json deleted file mode 100644 index 0e64b71d7..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc1/2c044767-1169-48c6-9e37-e9d1e35f4cfe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.7-rc1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.7-rc1", - "id": "sometimesanotion/Lamarck-14B-v0.7-rc1", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6486 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4715 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc4/bad67b35-d9ef-417a-955b-9c33e87cb927.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc4/bad67b35-d9ef-417a-955b-9c33e87cb927.json deleted file mode 100644 index ebc21eef2..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc4/bad67b35-d9ef-417a-955b-9c33e87cb927.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.7-rc4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.7-rc4", - "id": "sometimesanotion/Lamarck-14B-v0.7-rc4", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7211 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.651 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4912 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v1/60eaa315-f489-405d-a67d-7f1312e90cab.json b/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v1/60eaa315-f489-405d-a67d-7f1312e90cab.json deleted file mode 100644 index 1c1c6efe8..000000000 --- a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v1/60eaa315-f489-405d-a67d-7f1312e90cab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_LamarckInfusion-14B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LamarckInfusion-14B-v1", - "id": "sometimesanotion/LamarckInfusion-14B-v1", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7198 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4899 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-hi/50de312a-293d-41a4-8bee-4feb0c148b90.json b/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-hi/50de312a-293d-41a4-8bee-4feb0c148b90.json deleted file mode 100644 index a6c8039e6..000000000 --- a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-hi/50de312a-293d-41a4-8bee-4feb0c148b90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_LamarckInfusion-14B-v2-hi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LamarckInfusion-14B-v2-hi", - "id": "sometimesanotion/LamarckInfusion-14B-v2-hi", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6855 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6555 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5405 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-lo/56f24cac-394c-4439-8f2e-8270e7519bda.json b/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-lo/56f24cac-394c-4439-8f2e-8270e7519bda.json deleted file mode 100644 index b52d3f3f1..000000000 --- a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-lo/56f24cac-394c-4439-8f2e-8270e7519bda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_LamarckInfusion-14B-v2-lo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LamarckInfusion-14B-v2-lo", - "id": "sometimesanotion/LamarckInfusion-14B-v2-lo", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6788 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6528 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" 
- }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5397 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2/8efa1423-0a39-4674-a94d-3d92448010d6.json b/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2/8efa1423-0a39-4674-a94d-3d92448010d6.json deleted file mode 100644 index c108f183f..000000000 --- a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2/8efa1423-0a39-4674-a94d-3d92448010d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_LamarckInfusion-14B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LamarckInfusion-14B-v2", - "id": "sometimesanotion/LamarckInfusion-14B-v2", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6812 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6564 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.4388 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4993 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v3/350b3491-cba8-46b4-a07f-3d1277270530.json b/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v3/350b3491-cba8-46b4-a07f-3d1277270530.json deleted file mode 100644 index 54ff84883..000000000 --- a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v3/350b3491-cba8-46b4-a07f-3d1277270530.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_LamarckInfusion-14B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LamarckInfusion-14B-v3", - "id": "sometimesanotion/LamarckInfusion-14B-v3", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7131 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6518 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5407 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen-14B-ProseStock-v4/0741ead7-24f3-49b0-9967-f726df84f78a.json b/data/hfopenllm_v2/sometimesanotion/Qwen-14B-ProseStock-v4/0741ead7-24f3-49b0-9967-f726df84f78a.json deleted file mode 100644 index 37d385cfe..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen-14B-ProseStock-v4/0741ead7-24f3-49b0-9967-f726df84f78a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen-14B-ProseStock-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-14B-ProseStock-v4", - "id": "sometimesanotion/Qwen-14B-ProseStock-v4", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4942 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4938 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen-2.5-14B-Virmarckeoso/1ea4d10e-e099-4967-8c43-e84acaeb40be.json b/data/hfopenllm_v2/sometimesanotion/Qwen-2.5-14B-Virmarckeoso/1ea4d10e-e099-4967-8c43-e84acaeb40be.json deleted file mode 100644 index fb27ac194..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen-2.5-14B-Virmarckeoso/1ea4d10e-e099-4967-8c43-e84acaeb40be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen-2.5-14B-Virmarckeoso/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-14B-Virmarckeoso", - "id": "sometimesanotion/Qwen-2.5-14B-Virmarckeoso", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.657 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5377 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v2/6c78d9f7-a61e-4f65-ac57-61597f735541.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v2/6c78d9f7-a61e-4f65-ac57-61597f735541.json deleted file mode 100644 index 916ea88f9..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v2/6c78d9f7-a61e-4f65-ac57-61597f735541.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-14B-Vimarckoso-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Vimarckoso-v2", - "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v2", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4819 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-IF-Variant/e9bcfb1f-c688-4e7a-918a-e697adaf7aa5.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-IF-Variant/e9bcfb1f-c688-4e7a-918a-e697adaf7aa5.json deleted file mode 100644 index 5757a1b7a..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-IF-Variant/e9bcfb1f-c688-4e7a-918a-e697adaf7aa5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-IF-Variant/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Vimarckoso-v3-IF-Variant", - "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-IF-Variant", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6413 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5521 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2545 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5319 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-Prose01/153cfe7f-c27a-40b8-b8d2-54351f26f583.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-Prose01/153cfe7f-c27a-40b8-b8d2-54351f26f583.json deleted file mode 100644 index 874b6da06..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-Prose01/153cfe7f-c27a-40b8-b8d2-54351f26f583.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-Prose01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Vimarckoso-v3-Prose01", - "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-Prose01", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6872 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6359 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-model_stock/b58372cd-5d55-4f42-a5da-2970e55b44b0.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-model_stock/b58372cd-5d55-4f42-a5da-2970e55b44b0.json deleted file mode 100644 index b0e9a12fb..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-model_stock/b58372cd-5d55-4f42-a5da-2970e55b44b0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Vimarckoso-v3-model_stock", - "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-model_stock", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3/34a028ac-2002-480c-a1af-5b945ffe872e.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3/34a028ac-2002-480c-a1af-5b945ffe872e.json deleted file mode 100644 index f888babcd..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3/34a028ac-2002-480c-a1af-5b945ffe872e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Vimarckoso-v3", - "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7257 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso/065ffc51-154c-4a93-a342-0dd476fda473.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso/065ffc51-154c-4a93-a342-0dd476fda473.json deleted file mode 100644 index c071dc2d8..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso/065ffc51-154c-4a93-a342-0dd476fda473.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-14B-Vimarckoso/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Vimarckoso", - "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4574 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6446 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3384 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4859 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Prose/ebc74f4f-157d-4ee4-8b99-9fb5b685afd5.json 
b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Prose/ebc74f4f-157d-4ee4-8b99-9fb5b685afd5.json deleted file mode 100644 index 580c363ac..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Prose/ebc74f4f-157d-4ee4-8b99-9fb5b685afd5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-7B-Gordion-v0.1-Prose/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Gordion-v0.1-Prose", - "id": "sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Prose", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5347 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5599 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4525 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Reason/91004d26-7b8b-4c0a-bd8c-8880654dc93a.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Reason/91004d26-7b8b-4c0a-bd8c-8880654dc93a.json deleted file mode 100644 index 
8e9f53078..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Reason/91004d26-7b8b-4c0a-bd8c-8880654dc93a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-7B-Gordion-v0.1-Reason/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Gordion-v0.1-Reason", - "id": "sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Reason", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4917 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2621 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4307 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1/5eb1aa92-a031-40d4-ad64-552075dae68a.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1/5eb1aa92-a031-40d4-ad64-552075dae68a.json deleted file mode 100644 index 806d212aa..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1/5eb1aa92-a031-40d4-ad64-552075dae68a.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-7B-Gordion-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Gordion-v0.1", - "id": "sometimesanotion/Qwen2.5-7B-Gordion-v0.1", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2915 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentessential-14B-v1/3ebc147d-58f2-4605-a011-a71c591fac0e.json b/data/hfopenllm_v2/sometimesanotion/Qwentessential-14B-v1/3ebc147d-58f2-4605-a011-a71c591fac0e.json deleted file mode 100644 index 22b1d0f94..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentessential-14B-v1/3ebc147d-58f2-4605-a011-a71c591fac0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentessential-14B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentessential-14B-v1", - "id": "sometimesanotion/Qwentessential-14B-v1", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6279 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6545 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v013/01795776-e909-46d3-8b6c-0989334e3d0e.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v013/01795776-e909-46d3-8b6c-0989334e3d0e.json deleted file mode 100644 index 29ab8f101..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v013/01795776-e909-46d3-8b6c-0989334e3d0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v013/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { 
- "name": "Qwentinuum-14B-v013", - "id": "sometimesanotion/Qwentinuum-14B-v013", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6711 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6087 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v1/00dffa94-31f9-4b5c-b032-03dd20fc2e8d.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v1/00dffa94-31f9-4b5c-b032-03dd20fc2e8d.json deleted file mode 100644 index a544e7948..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v1/00dffa94-31f9-4b5c-b032-03dd20fc2e8d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v1", - "id": "sometimesanotion/Qwentinuum-14B-v1", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5032 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6573 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.541 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v2/736249d0-cea9-46c6-9677-ecae4b410af4.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v2/736249d0-cea9-46c6-9677-ecae4b410af4.json deleted file mode 100644 index f73c3f45f..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v2/736249d0-cea9-46c6-9677-ecae4b410af4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v2", - "id": "sometimesanotion/Qwentinuum-14B-v2", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6555 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5409 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v3/ef602cfe-3453-4189-b583-292cf05421d1.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v3/ef602cfe-3453-4189-b583-292cf05421d1.json deleted file mode 100644 index fadcebc67..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v3/ef602cfe-3453-4189-b583-292cf05421d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v3", - "id": "sometimesanotion/Qwentinuum-14B-v3", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.6158 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v5/559af2c1-deca-4c35-b83a-004c22ac958a.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v5/559af2c1-deca-4c35-b83a-004c22ac958a.json deleted file mode 100644 index a63f22447..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v5/559af2c1-deca-4c35-b83a-004c22ac958a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v5", - "id": "sometimesanotion/Qwentinuum-14B-v5", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6286 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5418 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6-Prose/8d66d895-626a-477f-91b6-2195f35aacb3.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6-Prose/8d66d895-626a-477f-91b6-2195f35aacb3.json deleted file mode 100644 index 6b40b100b..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6-Prose/8d66d895-626a-477f-91b6-2195f35aacb3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v6-Prose/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v6-Prose", - "id": "sometimesanotion/Qwentinuum-14B-v6-Prose", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5643 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.6545 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4913 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6/004df803-70da-4e59-b3ad-f210c790f29e.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6/004df803-70da-4e59-b3ad-f210c790f29e.json deleted file mode 100644 index ff2133b12..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6/004df803-70da-4e59-b3ad-f210c790f29e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v6", - "id": "sometimesanotion/Qwentinuum-14B-v6", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6304 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6545 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v7/bb2972ca-e673-4be5-bc7e-2689adeac3a9.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v7/bb2972ca-e673-4be5-bc7e-2689adeac3a9.json deleted file mode 100644 index 86db2ec7b..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v7/bb2972ca-e673-4be5-bc7e-2689adeac3a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v7", - "id": "sometimesanotion/Qwentinuum-14B-v7", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6109 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6551 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3573 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.541 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v8/eacf2411-a0ea-41fd-8363-e565fce0f26f.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v8/eacf2411-a0ea-41fd-8363-e565fce0f26f.json deleted file mode 100644 index 14b1b0dfd..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v8/eacf2411-a0ea-41fd-8363-e565fce0f26f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v8", - "id": "sometimesanotion/Qwentinuum-14B-v8", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6534 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3912 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v9/4eefe3cd-ff42-4d4c-89c6-c3e48d8c85e9.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v9/4eefe3cd-ff42-4d4c-89c6-c3e48d8c85e9.json deleted file mode 100644 index 470b2bb89..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v9/4eefe3cd-ff42-4d4c-89c6-c3e48d8c85e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v9", - "id": "sometimesanotion/Qwentinuum-14B-v9", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.658 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-qv256/f19dab38-48ed-438e-8a62-86e4d111f6c8.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-qv256/f19dab38-48ed-438e-8a62-86e4d111f6c8.json deleted file mode 100644 index 17591ed25..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-qv256/f19dab38-48ed-438e-8a62-86e4d111f6c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-qv256/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-qv256", - "id": "sometimesanotion/Qwenvergence-14B-qv256", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7006 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6312 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3897 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5178 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v0.6-004-model_stock/ff4b6d28-62e2-4671-8df9-690ce7f13f0b.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v0.6-004-model_stock/ff4b6d28-62e2-4671-8df9-690ce7f13f0b.json deleted file mode 100644 index 10044a8d2..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v0.6-004-model_stock/ff4b6d28-62e2-4671-8df9-690ce7f13f0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v0.6-004-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v0.6-004-model_stock", - "id": "sometimesanotion/Qwenvergence-14B-v0.6-004-model_stock", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6249 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4094 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5033 - 
} - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5193 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v10/9c05a7e4-f495-41d0-a7f0-1959e7434ba2.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v10/9c05a7e4-f495-41d0-a7f0-1959e7434ba2.json deleted file mode 100644 index 1827b725f..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v10/9c05a7e4-f495-41d0-a7f0-1959e7434ba2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v10", - "id": "sometimesanotion/Qwenvergence-14B-v10", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4789 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5239 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v11/404e3d61-26d3-4f95-9847-064f0c7c6970.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v11/404e3d61-26d3-4f95-9847-064f0c7c6970.json deleted file mode 100644 index df64804c6..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v11/404e3d61-26d3-4f95-9847-064f0c7c6970.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v11", - "id": "sometimesanotion/Qwenvergence-14B-v11", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7192 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6368 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5327 - } - } 
- ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose-DS/0b4574f2-1b71-427f-9923-17db449be191.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose-DS/0b4574f2-1b71-427f-9923-17db449be191.json deleted file mode 100644 index 12b28bc82..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose-DS/0b4574f2-1b71-427f-9923-17db449be191.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v12-Prose-DS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v12-Prose-DS", - "id": "sometimesanotion/Qwenvergence-14B-v12-Prose-DS", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6173 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4305 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5151 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5369 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose/775b88cd-98e8-4d93-acca-e294f68f2da2.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose/775b88cd-98e8-4d93-acca-e294f68f2da2.json deleted file mode 100644 index c5c86aec1..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose/775b88cd-98e8-4d93-acca-e294f68f2da2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v12-Prose/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v12-Prose", - "id": "sometimesanotion/Qwenvergence-14B-v12-Prose", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6504 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v13-Prose-DS/89464568-47cb-4659-af37-8b061d3f0c8c.json 
b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v13-Prose-DS/89464568-47cb-4659-af37-8b061d3f0c8c.json deleted file mode 100644 index 84c78b460..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v13-Prose-DS/89464568-47cb-4659-af37-8b061d3f0c8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v13-Prose-DS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v13-Prose-DS", - "id": "sometimesanotion/Qwenvergence-14B-v13-Prose-DS", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7178 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6405 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4927 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5349 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v15-Prose-MS/9fad9d73-acbf-4ffc-886c-551c1fe1ed45.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v15-Prose-MS/9fad9d73-acbf-4ffc-886c-551c1fe1ed45.json deleted file mode 100644 index 
5ec413f82..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v15-Prose-MS/9fad9d73-acbf-4ffc-886c-551c1fe1ed45.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v15-Prose-MS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v15-Prose-MS", - "id": "sometimesanotion/Qwenvergence-14B-v15-Prose-MS", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5032 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4913 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v2-Prose/c1882335-0df5-4df2-bfa1-c16126c328fb.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v2-Prose/c1882335-0df5-4df2-bfa1-c16126c328fb.json deleted file mode 100644 index a6c3cb27c..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v2-Prose/c1882335-0df5-4df2-bfa1-c16126c328fb.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v2-Prose/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v2-Prose", - "id": "sometimesanotion/Qwenvergence-14B-v2-Prose", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4705 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Prose/291471ed-3b7c-4bd4-91bb-c27cd74ec460.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Prose/291471ed-3b7c-4bd4-91bb-c27cd74ec460.json deleted file mode 100644 index 556f2a8cf..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Prose/291471ed-3b7c-4bd4-91bb-c27cd74ec460.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v3-Prose/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v3-Prose", - "id": "sometimesanotion/Qwenvergence-14B-v3-Prose", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4918 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6513 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4939 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/53565fe4-0368-477b-9916-ac9a4b8a9c7b.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/53565fe4-0368-477b-9916-ac9a4b8a9c7b.json deleted file mode 100644 index 0e8401765..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/53565fe4-0368-477b-9916-ac9a4b8a9c7b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v3-Reason/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v3-Reason", - "id": "sometimesanotion/Qwenvergence-14B-v3-Reason", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3119 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/f6cb5e9d-c4c9-44a2-9adf-7fa5639d84d9.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/f6cb5e9d-c4c9-44a2-9adf-7fa5639d84d9.json deleted file mode 100644 index 49b6488af..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/f6cb5e9d-c4c9-44a2-9adf-7fa5639d84d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v3-Reason/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v3-Reason", - "id": "sometimesanotion/Qwenvergence-14B-v3-Reason", - 
"developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6561 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3/e51fee25-7648-49d9-a8da-b8dbc68a722b.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3/e51fee25-7648-49d9-a8da-b8dbc68a722b.json deleted file mode 100644 index 6b7f693f2..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3/e51fee25-7648-49d9-a8da-b8dbc68a722b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v3", - "id": "sometimesanotion/Qwenvergence-14B-v3", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose-model_stock/6acdc96b-cfde-439f-b6b3-a66257b3fcde.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose-model_stock/6acdc96b-cfde-439f-b6b3-a66257b3fcde.json deleted file mode 100644 index 1fad180ca..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose-model_stock/6acdc96b-cfde-439f-b6b3-a66257b3fcde.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v6-Prose-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v6-Prose-model_stock", - "id": "sometimesanotion/Qwenvergence-14B-v6-Prose-model_stock", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4811 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4899 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose/850da8de-ca13-4f15-bb9f-68b910355cfd.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose/850da8de-ca13-4f15-bb9f-68b910355cfd.json deleted file mode 100644 index a338fb9a8..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose/850da8de-ca13-4f15-bb9f-68b910355cfd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v6-Prose/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v6-Prose", - "id": "sometimesanotion/Qwenvergence-14B-v6-Prose", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4887 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v8/542fbb7a-d4eb-4cbf-b63a-4305cb108361.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v8/542fbb7a-d4eb-4cbf-b63a-4305cb108361.json deleted file mode 100644 index fd2e91afc..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v8/542fbb7a-d4eb-4cbf-b63a-4305cb108361.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v8", - "id": "sometimesanotion/Qwenvergence-14B-v8", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5913 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", 
- "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6522 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v9/1dbb8206-6a86-4e2c-8ee0-d80fed014a69.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v9/1dbb8206-6a86-4e2c-8ee0-d80fed014a69.json deleted file mode 100644 index e4304c3e3..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v9/1dbb8206-6a86-4e2c-8ee0-d80fed014a69.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v9", - "id": "sometimesanotion/Qwenvergence-14B-v9", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6598 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6166 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5141 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/lamarck-14b-prose-model_stock/6341de3c-8d4c-4af8-8f0d-c81e948bacd6.json b/data/hfopenllm_v2/sometimesanotion/lamarck-14b-prose-model_stock/6341de3c-8d4c-4af8-8f0d-c81e948bacd6.json deleted file mode 100644 index 404e5cee8..000000000 --- a/data/hfopenllm_v2/sometimesanotion/lamarck-14b-prose-model_stock/6341de3c-8d4c-4af8-8f0d-c81e948bacd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_lamarck-14b-prose-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lamarck-14b-prose-model_stock", - "id": "sometimesanotion/lamarck-14b-prose-model_stock", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4276 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6488 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/lamarck-14b-reason-model_stock/e6cb6a87-6db8-4aee-bede-ce8a60dc8f4a.json b/data/hfopenllm_v2/sometimesanotion/lamarck-14b-reason-model_stock/e6cb6a87-6db8-4aee-bede-ce8a60dc8f4a.json deleted file mode 100644 index 92a2a8d29..000000000 --- a/data/hfopenllm_v2/sometimesanotion/lamarck-14b-reason-model_stock/e6cb6a87-6db8-4aee-bede-ce8a60dc8f4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_lamarck-14b-reason-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lamarck-14b-reason-model_stock", - "id": "sometimesanotion/lamarck-14b-reason-model_stock", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4965 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6569 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4741 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415/5113439d-1394-46f2-a38e-34b54e94a9e6.json b/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415/5113439d-1394-46f2-a38e-34b54e94a9e6.json deleted file mode 100644 index d02f279fc..000000000 --- a/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415/5113439d-1394-46f2-a38e-34b54e94a9e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415", - "id": "sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415", - "developer": "sonthenguyen", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 7.723 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3861 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205/a03d88aa-7ccd-4f8a-9a1e-c9469d3ae559.json b/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205/a03d88aa-7ccd-4f8a-9a1e-c9469d3ae559.json deleted file mode 100644 index 374cc5708..000000000 --- a/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205/a03d88aa-7ccd-4f8a-9a1e-c9469d3ae559.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205", - "id": "sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205", - "developer": "sonthenguyen", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 7.723 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4272 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522/1cfb40a7-7373-417c-aa1c-f6ab63ecb3b8.json b/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522/1cfb40a7-7373-417c-aa1c-f6ab63ecb3b8.json deleted file mode 100644 index 37b361fb2..000000000 --- a/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522/1cfb40a7-7373-417c-aa1c-f6ab63ecb3b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522", - "id": "sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522", - "developer": "sonthenguyen", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 7.723 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3764 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3828 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4404 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2055 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbc-213steps/446ac93f-d47c-4207-bf32-0cd94e88a931.json b/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbc-213steps/446ac93f-d47c-4207-bf32-0cd94e88a931.json deleted file mode 100644 index a216eeafc..000000000 --- a/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbc-213steps/446ac93f-d47c-4207-bf32-0cd94e88a931.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbc-213steps/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-sft-bnb-4bit-DPO-mtbc-213steps", - "id": "sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbc-213steps", - "developer": "sonthenguyen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2709 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbo-180steps/7e4ba4f8-2768-4e7b-a11d-75ad22a47c45.json b/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbo-180steps/7e4ba4f8-2768-4e7b-a11d-75ad22a47c45.json deleted file mode 100644 index 51e507e7a..000000000 --- a/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbo-180steps/7e4ba4f8-2768-4e7b-a11d-75ad22a47c45.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbo-180steps/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-sft-bnb-4bit-DPO-mtbo-180steps", - "id": "sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbo-180steps", - "developer": "sonthenguyen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2748 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbr-180steps/ca77f821-4722-45b1-b731-7d774232acb4.json b/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbr-180steps/ca77f821-4722-45b1-b731-7d774232acb4.json deleted file mode 100644 index c5738810a..000000000 --- a/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbr-180steps/ca77f821-4722-45b1-b731-7d774232acb4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbr-180steps/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-sft-bnb-4bit-DPO-mtbr-180steps", - "id": "sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbr-180steps", - "developer": "sonthenguyen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - 
}, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4258 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2711 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sophosympatheia/Midnight-Miqu-70B-v1.5/f32d2a11-edd3-4662-aed7-88c6820b2c2e.json b/data/hfopenllm_v2/sophosympatheia/Midnight-Miqu-70B-v1.5/f32d2a11-edd3-4662-aed7-88c6820b2c2e.json deleted file mode 100644 index fddda10e0..000000000 --- a/data/hfopenllm_v2/sophosympatheia/Midnight-Miqu-70B-v1.5/f32d2a11-edd3-4662-aed7-88c6820b2c2e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sophosympatheia_Midnight-Miqu-70B-v1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Midnight-Miqu-70B-v1.5", - "id": "sophosympatheia/Midnight-Miqu-70B-v1.5", - "developer": "sophosympatheia", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 68.977 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6118 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5606 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.0-Instruct/71c56883-dd14-4f16-b839-5ce607a4aadb.json b/data/hfopenllm_v2/speakleash/Bielik-11B-v2.0-Instruct/71c56883-dd14-4f16-b839-5ce607a4aadb.json deleted file mode 100644 index 12049af61..000000000 --- a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.0-Instruct/71c56883-dd14-4f16-b839-5ce607a4aadb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/speakleash_Bielik-11B-v2.0-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bielik-11B-v2.0-Instruct", - "id": "speakleash/Bielik-11B-v2.0-Instruct", - "developer": "speakleash", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.169 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5252 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5362 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4467 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3351 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.1-Instruct/639004c2-81a5-410d-bd61-e3e263f55335.json b/data/hfopenllm_v2/speakleash/Bielik-11B-v2.1-Instruct/639004c2-81a5-410d-bd61-e3e263f55335.json deleted file mode 100644 index e5d5e6b4f..000000000 --- a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.1-Instruct/639004c2-81a5-410d-bd61-e3e263f55335.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/speakleash_Bielik-11B-v2.1-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bielik-11B-v2.1-Instruct", - "id": "speakleash/Bielik-11B-v2.1-Instruct", - "developer": "speakleash", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.169 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2666 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.2-Instruct/5f232a99-07c9-4df7-9d3b-837966ea6de5.json b/data/hfopenllm_v2/speakleash/Bielik-11B-v2.2-Instruct/5f232a99-07c9-4df7-9d3b-837966ea6de5.json deleted file mode 100644 index 29b63348b..000000000 --- a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.2-Instruct/5f232a99-07c9-4df7-9d3b-837966ea6de5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/speakleash_Bielik-11B-v2.2-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bielik-11B-v2.2-Instruct", - "id": "speakleash/Bielik-11B-v2.2-Instruct", - "developer": "speakleash", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.169 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5552 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5597 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2681 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.3-Instruct/482e34ee-8974-46c6-b3f4-4cc9872ef562.json b/data/hfopenllm_v2/speakleash/Bielik-11B-v2.3-Instruct/482e34ee-8974-46c6-b3f4-4cc9872ef562.json deleted file mode 100644 index d3532936e..000000000 --- a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.3-Instruct/482e34ee-8974-46c6-b3f4-4cc9872ef562.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/speakleash_Bielik-11B-v2.3-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bielik-11B-v2.3-Instruct", - "id": "speakleash/Bielik-11B-v2.3-Instruct", - "developer": "speakleash", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 11.169 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5583 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5663 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2085 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4518 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/speakleash/Bielik-11B-v2/13743252-3ba3-406d-8e95-5a4cd3ac3772.json b/data/hfopenllm_v2/speakleash/Bielik-11B-v2/13743252-3ba3-406d-8e95-5a4cd3ac3772.json deleted file mode 100644 index 
0c4687f31..000000000 --- a/data/hfopenllm_v2/speakleash/Bielik-11B-v2/13743252-3ba3-406d-8e95-5a4cd3ac3772.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/speakleash_Bielik-11B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bielik-11B-v2", - "id": "speakleash/Bielik-11B-v2", - "developer": "speakleash", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.169 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4931 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/spmurrayzzz/Mistral-Syndicate-7B/ff25cb66-ed6f-421a-a038-1feb24666645.json b/data/hfopenllm_v2/spmurrayzzz/Mistral-Syndicate-7B/ff25cb66-ed6f-421a-a038-1feb24666645.json deleted file mode 100644 index 7a760f4b4..000000000 --- a/data/hfopenllm_v2/spmurrayzzz/Mistral-Syndicate-7B/ff25cb66-ed6f-421a-a038-1feb24666645.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/spmurrayzzz_Mistral-Syndicate-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Syndicate-7B", - "id": "spmurrayzzz/Mistral-Syndicate-7B", - "developer": "spmurrayzzz", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2631 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/spow12/ChatWaifu_12B_v2.0/843f0d9a-04e8-4cea-bb18-94651a814d1f.json b/data/hfopenllm_v2/spow12/ChatWaifu_12B_v2.0/843f0d9a-04e8-4cea-bb18-94651a814d1f.json deleted file mode 100644 index 4c523e210..000000000 --- a/data/hfopenllm_v2/spow12/ChatWaifu_12B_v2.0/843f0d9a-04e8-4cea-bb18-94651a814d1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/spow12_ChatWaifu_12B_v2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChatWaifu_12B_v2.0", - "id": "spow12/ChatWaifu_12B_v2.0", - "developer": "spow12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5208 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/spow12/ChatWaifu_22B_v2.0_preview/fa3ccf4a-9b26-4a76-a974-3a776adec7c2.json b/data/hfopenllm_v2/spow12/ChatWaifu_22B_v2.0_preview/fa3ccf4a-9b26-4a76-a974-3a776adec7c2.json deleted file mode 100644 index f32e11888..000000000 --- a/data/hfopenllm_v2/spow12/ChatWaifu_22B_v2.0_preview/fa3ccf4a-9b26-4a76-a974-3a776adec7c2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/spow12_ChatWaifu_22B_v2.0_preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChatWaifu_22B_v2.0_preview", - "id": "spow12/ChatWaifu_22B_v2.0_preview", - "developer": "spow12", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6745 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.617 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3685 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/spow12/ChatWaifu_v1.4/ef4ac8ab-4ff5-4fce-94b6-443b1ef7964f.json b/data/hfopenllm_v2/spow12/ChatWaifu_v1.4/ef4ac8ab-4ff5-4fce-94b6-443b1ef7964f.json deleted file mode 100644 index 4403618f1..000000000 --- a/data/hfopenllm_v2/spow12/ChatWaifu_v1.4/ef4ac8ab-4ff5-4fce-94b6-443b1ef7964f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/spow12_ChatWaifu_v1.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChatWaifu_v1.4", - "id": "spow12/ChatWaifu_v1.4", - "developer": "spow12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5691 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5176 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1057 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4743 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/468bbea7-6dee-4a1a-84b3-e44b0f3ab95a.json b/data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/468bbea7-6dee-4a1a-84b3-e44b0f3ab95a.json deleted file mode 100644 index 83ea96c1c..000000000 --- a/data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/468bbea7-6dee-4a1a-84b3-e44b0f3ab95a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/spow12_ChatWaifu_v2.0_22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChatWaifu_v2.0_22B", - "id": "spow12/ChatWaifu_v2.0_22B", - "developer": "spow12", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6511 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": 
"BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5926 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3836 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/bd8fdfa5-bda1-402b-9010-94bf78b0127b.json b/data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/bd8fdfa5-bda1-402b-9010-94bf78b0127b.json deleted file mode 100644 index 699016bfe..000000000 --- a/data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/bd8fdfa5-bda1-402b-9010-94bf78b0127b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/spow12_ChatWaifu_v2.0_22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChatWaifu_v2.0_22B", - "id": "spow12/ChatWaifu_v2.0_22B", - "developer": "spow12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6517 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.5908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ssmits/Qwen2.5-95B-Instruct/a0b34b40-3e68-463f-a7fa-3c58c15aa16d.json b/data/hfopenllm_v2/ssmits/Qwen2.5-95B-Instruct/a0b34b40-3e68-463f-a7fa-3c58c15aa16d.json deleted file mode 100644 index 21deb3a14..000000000 --- a/data/hfopenllm_v2/ssmits/Qwen2.5-95B-Instruct/a0b34b40-3e68-463f-a7fa-3c58c15aa16d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ssmits_Qwen2.5-95B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-95B-Instruct", - "id": "ssmits/Qwen2.5-95B-Instruct", - "developer": "ssmits", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 94.648 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8431 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7038 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5217 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/StableBeluga2/dbf4fbac-cd99-426d-b725-600e60af00d2.json b/data/hfopenllm_v2/stabilityai/StableBeluga2/dbf4fbac-cd99-426d-b725-600e60af00d2.json deleted file mode 100644 index e8054b7c1..000000000 --- a/data/hfopenllm_v2/stabilityai/StableBeluga2/dbf4fbac-cd99-426d-b725-600e60af00d2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_StableBeluga2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StableBeluga2", - "id": "stabilityai/StableBeluga2", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 68.977 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5824 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3326 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/stablelm-2-12b-chat/f793c471-1638-476a-a050-455a32368e29.json b/data/hfopenllm_v2/stabilityai/stablelm-2-12b-chat/f793c471-1638-476a-a050-455a32368e29.json deleted file mode 100644 index 53354e46f..000000000 --- a/data/hfopenllm_v2/stabilityai/stablelm-2-12b-chat/f793c471-1638-476a-a050-455a32368e29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_stablelm-2-12b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stablelm-2-12b-chat", - "id": "stabilityai/stablelm-2-12b-chat", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "StableLmForCausalLM", - "params_billions": 12.143 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/stablelm-2-12b/1d9c1beb-f84b-4eb7-9c1e-ce5a70afabfb.json b/data/hfopenllm_v2/stabilityai/stablelm-2-12b/1d9c1beb-f84b-4eb7-9c1e-ce5a70afabfb.json deleted file mode 100644 index 864befec9..000000000 --- a/data/hfopenllm_v2/stabilityai/stablelm-2-12b/1d9c1beb-f84b-4eb7-9c1e-ce5a70afabfb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_stablelm-2-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stablelm-2-12b", - "id": "stabilityai/stablelm-2-12b", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "StableLmForCausalLM", - "params_billions": 12.143 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1569 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3072 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/stablelm-2-1_6b-chat/99396d97-d875-4cd9-a8a1-a9aec5c43bfc.json b/data/hfopenllm_v2/stabilityai/stablelm-2-1_6b-chat/99396d97-d875-4cd9-a8a1-a9aec5c43bfc.json deleted file mode 100644 index 28f4bd994..000000000 --- a/data/hfopenllm_v2/stabilityai/stablelm-2-1_6b-chat/99396d97-d875-4cd9-a8a1-a9aec5c43bfc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_stablelm-2-1_6b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stablelm-2-1_6b-chat", - "id": "stabilityai/stablelm-2-1_6b-chat", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "StableLmForCausalLM", - "params_billions": 1.645 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/stablelm-2-1_6b/82a44b46-156f-4232-92e4-6a08d7a4f197.json b/data/hfopenllm_v2/stabilityai/stablelm-2-1_6b/82a44b46-156f-4232-92e4-6a08d7a4f197.json deleted file mode 100644 index c8c4b25df..000000000 --- a/data/hfopenllm_v2/stabilityai/stablelm-2-1_6b/82a44b46-156f-4232-92e4-6a08d7a4f197.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_stablelm-2-1_6b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stablelm-2-1_6b", - "id": "stabilityai/stablelm-2-1_6b", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "StableLmForCausalLM", - "params_billions": 1.645 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1157 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3385 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1464 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/stablelm-2-zephyr-1_6b/3b40defd-5a2e-4d6e-838f-dbbbf12236fb.json b/data/hfopenllm_v2/stabilityai/stablelm-2-zephyr-1_6b/3b40defd-5a2e-4d6e-838f-dbbbf12236fb.json deleted file mode 100644 index c7a4b2631..000000000 --- a/data/hfopenllm_v2/stabilityai/stablelm-2-zephyr-1_6b/3b40defd-5a2e-4d6e-838f-dbbbf12236fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_stablelm-2-zephyr-1_6b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stablelm-2-zephyr-1_6b", - "id": "stabilityai/stablelm-2-zephyr-1_6b", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "StableLmForCausalLM", - "params_billions": 1.645 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3279 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3511 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1714 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/stablelm-3b-4e1t/dde41cd5-e6d1-43a9-9593-1a5751bc5f44.json 
b/data/hfopenllm_v2/stabilityai/stablelm-3b-4e1t/dde41cd5-e6d1-43a9-9593-1a5751bc5f44.json deleted file mode 100644 index 5c466e1f8..000000000 --- a/data/hfopenllm_v2/stabilityai/stablelm-3b-4e1t/dde41cd5-e6d1-43a9-9593-1a5751bc5f44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_stablelm-3b-4e1t/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stablelm-3b-4e1t", - "id": "stabilityai/stablelm-3b-4e1t", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "StableLmForCausalLM", - "params_billions": 2.795 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3504 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2374 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/stablelm-zephyr-3b/1cffcbeb-ef81-4efe-b883-0a8540a799e7.json b/data/hfopenllm_v2/stabilityai/stablelm-zephyr-3b/1cffcbeb-ef81-4efe-b883-0a8540a799e7.json deleted file mode 100644 index 736b9c1ac..000000000 --- a/data/hfopenllm_v2/stabilityai/stablelm-zephyr-3b/1cffcbeb-ef81-4efe-b883-0a8540a799e7.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_stablelm-zephyr-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stablelm-zephyr-3b", - "id": "stabilityai/stablelm-zephyr-3b", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "StableLmForCausalLM", - "params_billions": 2.795 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4183 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1768 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno-com/miscii-14b-0130/033ef96e-3d2d-49a4-bbff-8bc815a1b40e.json b/data/hfopenllm_v2/sthenno-com/miscii-14b-0130/033ef96e-3d2d-49a4-bbff-8bc815a1b40e.json deleted file mode 100644 index 436ce33be..000000000 --- a/data/hfopenllm_v2/sthenno-com/miscii-14b-0130/033ef96e-3d2d-49a4-bbff-8bc815a1b40e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno-com_miscii-14b-0130/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miscii-14b-0130", - "id": "sthenno-com/miscii-14b-0130", - "developer": "sthenno-com", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6647 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6505 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4912 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno-com/miscii-14b-0218/bfe654b8-cb79-4845-bf14-85012207ce90.json b/data/hfopenllm_v2/sthenno-com/miscii-14b-0218/bfe654b8-cb79-4845-bf14-85012207ce90.json deleted file mode 100644 index 19981a9c3..000000000 --- a/data/hfopenllm_v2/sthenno-com/miscii-14b-0218/bfe654b8-cb79-4845-bf14-85012207ce90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno-com_miscii-14b-0218/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miscii-14b-0218", - "id": "sthenno-com/miscii-14b-0218", - "developer": "sthenno-com", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7656 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5298 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno-com/miscii-14b-1028/5c4efc23-9591-447b-aecc-4c82797d7d01.json b/data/hfopenllm_v2/sthenno-com/miscii-14b-1028/5c4efc23-9591-447b-aecc-4c82797d7d01.json deleted file mode 100644 index 49b055e2c..000000000 --- a/data/hfopenllm_v2/sthenno-com/miscii-14b-1028/5c4efc23-9591-447b-aecc-4c82797d7d01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno-com_miscii-14b-1028/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miscii-14b-1028", - "id": "sthenno-com/miscii-14b-1028", - "developer": "sthenno-com", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8237 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4182 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5153 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno-com/miscii-14b-1225/a5fe3fab-95d9-41ac-a95f-66205e489dae.json b/data/hfopenllm_v2/sthenno-com/miscii-14b-1225/a5fe3fab-95d9-41ac-a95f-66205e489dae.json deleted file mode 100644 index 4635e6e86..000000000 --- a/data/hfopenllm_v2/sthenno-com/miscii-14b-1225/a5fe3fab-95d9-41ac-a95f-66205e489dae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno-com_miscii-14b-1225/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miscii-14b-1225", - "id": "sthenno-com/miscii-14b-1225", - "developer": "sthenno-com", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.7878 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6572 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5272 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-0120/c0bf8ffb-444a-43a3-9514-76aa92c5f5b7.json b/data/hfopenllm_v2/sthenno/tempesthenno-0120/c0bf8ffb-444a-43a3-9514-76aa92c5f5b7.json deleted file mode 100644 index c30b2e881..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-0120/c0bf8ffb-444a-43a3-9514-76aa92c5f5b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-0120/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-0120", - "id": "sthenno/tempesthenno-0120", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6373 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4633 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-fusion-0309/3d556d9f-036b-4368-bb4a-18ad6b444bdf.json b/data/hfopenllm_v2/sthenno/tempesthenno-fusion-0309/3d556d9f-036b-4368-bb4a-18ad6b444bdf.json deleted file mode 100644 index 78afda82b..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-fusion-0309/3d556d9f-036b-4368-bb4a-18ad6b444bdf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-fusion-0309/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-fusion-0309", - "id": "sthenno/tempesthenno-fusion-0309", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7692 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6581 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": 
"MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4325 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5258 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-kto-0205-ckpt80/92905e27-1033-4423-b87d-23236f9be964.json b/data/hfopenllm_v2/sthenno/tempesthenno-kto-0205-ckpt80/92905e27-1033-4423-b87d-23236f9be964.json deleted file mode 100644 index 588277559..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-kto-0205-ckpt80/92905e27-1033-4423-b87d-23236f9be964.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-kto-0205-ckpt80/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-kto-0205-ckpt80", - "id": "sthenno/tempesthenno-kto-0205-ckpt80", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8054 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6543 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4592 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4248 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5286 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-001/17326bb0-42c2-469a-ac19-6a4b75d9e6e2.json b/data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-001/17326bb0-42c2-469a-ac19-6a4b75d9e6e2.json deleted file mode 100644 index 752a4f6d2..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-001/17326bb0-42c2-469a-ac19-6a4b75d9e6e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-nuslerp-001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-nuslerp-001", - "id": "sthenno/tempesthenno-nuslerp-001", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7926 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6578 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4758 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5257 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-0124/11574f56-6c34-48e4-8fb5-c58d42f07330.json b/data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-0124/11574f56-6c34-48e4-8fb5-c58d42f07330.json deleted file mode 100644 index 26b47b3b3..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-0124/11574f56-6c34-48e4-8fb5-c58d42f07330.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-nuslerp-0124/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-nuslerp-0124", - "id": "sthenno/tempesthenno-nuslerp-0124", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7004 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6469 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4116 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3901 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4859 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-ppo-ckpt40/8f728c51-15f9-422d-bbdb-4d976961ab9d.json b/data/hfopenllm_v2/sthenno/tempesthenno-ppo-ckpt40/8f728c51-15f9-422d-bbdb-4d976961ab9d.json deleted file mode 100644 index e24916e51..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-ppo-ckpt40/8f728c51-15f9-422d-bbdb-4d976961ab9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-ppo-ckpt40/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-ppo-ckpt40", - "id": "sthenno/tempesthenno-ppo-ckpt40", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7923 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4352 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5292 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-sft-0309-ckpt10/8d6e4b5e-ad17-4390-bc6b-ab6581a62442.json b/data/hfopenllm_v2/sthenno/tempesthenno-sft-0309-ckpt10/8d6e4b5e-ad17-4390-bc6b-ab6581a62442.json deleted file mode 100644 index 884df38c1..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-sft-0309-ckpt10/8d6e4b5e-ad17-4390-bc6b-ab6581a62442.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-sft-0309-ckpt10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-sft-0309-ckpt10", - "id": "sthenno/tempesthenno-sft-0309-ckpt10", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7744 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6552 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4364 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5258 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-sft-0314-stage1-ckpt50/5e33bf05-6c67-4ecc-982d-7590e9953145.json b/data/hfopenllm_v2/sthenno/tempesthenno-sft-0314-stage1-ckpt50/5e33bf05-6c67-4ecc-982d-7590e9953145.json deleted file mode 100644 index bdf205c44..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-sft-0314-stage1-ckpt50/5e33bf05-6c67-4ecc-982d-7590e9953145.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-sft-0314-stage1-ckpt50/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-sft-0314-stage1-ckpt50", - "id": "sthenno/tempesthenno-sft-0314-stage1-ckpt50", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7394 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6601 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4683 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempestissimo-14b-0309/f55ae879-bd95-409c-a8a3-9a57cd615a31.json b/data/hfopenllm_v2/sthenno/tempestissimo-14b-0309/f55ae879-bd95-409c-a8a3-9a57cd615a31.json deleted file mode 100644 index 63af9669e..000000000 --- a/data/hfopenllm_v2/sthenno/tempestissimo-14b-0309/f55ae879-bd95-409c-a8a3-9a57cd615a31.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempestissimo-14b-0309/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempestissimo-14b-0309", - "id": "sthenno/tempestissimo-14b-0309", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6587 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4796 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4312 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5281 - } - } - ] -} \ No newline 
at end of file diff --git a/data/hfopenllm_v2/streamerbtw1002/Nexuim-R1-7B-Instruct/b8426ac9-14f1-4e07-9c7e-b50cb2c7a1e3.json b/data/hfopenllm_v2/streamerbtw1002/Nexuim-R1-7B-Instruct/b8426ac9-14f1-4e07-9c7e-b50cb2c7a1e3.json deleted file mode 100644 index c89456ac8..000000000 --- a/data/hfopenllm_v2/streamerbtw1002/Nexuim-R1-7B-Instruct/b8426ac9-14f1-4e07-9c7e-b50cb2c7a1e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/streamerbtw1002_Nexuim-R1-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nexuim-R1-7B-Instruct", - "id": "streamerbtw1002/Nexuim-R1-7B-Instruct", - "developer": "streamerbtw1002", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6934 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5175 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stupidity-ai/Llama-3-8B-Instruct-MultiMoose/51fd90b0-0d5a-4199-ba5b-ff29eeeab06b.json 
b/data/hfopenllm_v2/stupidity-ai/Llama-3-8B-Instruct-MultiMoose/51fd90b0-0d5a-4199-ba5b-ff29eeeab06b.json deleted file mode 100644 index 09f45ca03..000000000 --- a/data/hfopenllm_v2/stupidity-ai/Llama-3-8B-Instruct-MultiMoose/51fd90b0-0d5a-4199-ba5b-ff29eeeab06b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stupidity-ai_Llama-3-8B-Instruct-MultiMoose/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-MultiMoose", - "id": "stupidity-ai/Llama-3-8B-Instruct-MultiMoose", - "developer": "stupidity-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2318 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2823 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3485 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.1/c46e4fa1-afae-4b68-a13e-034b5cd2b779.json b/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.1/c46e4fa1-afae-4b68-a13e-034b5cd2b779.json deleted file mode 100644 index 516714298..000000000 --- 
a/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.1/c46e4fa1-afae-4b68-a13e-034b5cd2b779.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Clarus-7B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Clarus-7B-v0.1", - "id": "suayptalha/Clarus-7B-v0.1", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5497 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.2/42cc06ed-20fc-4e84-836f-3d7243ec336d.json b/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.2/42cc06ed-20fc-4e84-836f-3d7243ec336d.json deleted file mode 100644 index c5b09072a..000000000 --- a/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.2/42cc06ed-20fc-4e84-836f-3d7243ec336d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Clarus-7B-v0.2/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Clarus-7B-v0.2", - "id": "suayptalha/Clarus-7B-v0.2", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4856 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.3/aaa53387-af33-4454-95f0-3af85f4778c0.json b/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.3/aaa53387-af33-4454-95f0-3af85f4778c0.json deleted file mode 100644 index d58a5c307..000000000 --- a/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.3/aaa53387-af33-4454-95f0-3af85f4778c0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Clarus-7B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Clarus-7B-v0.3", - "id": 
"suayptalha/Clarus-7B-v0.3", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4879 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4385 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/DeepSeek-R1-Distill-Llama-3B/465bca6d-b32a-4d34-9916-fc8b3166faa0.json b/data/hfopenllm_v2/suayptalha/DeepSeek-R1-Distill-Llama-3B/465bca6d-b32a-4d34-9916-fc8b3166faa0.json deleted file mode 100644 index c2621e7be..000000000 --- a/data/hfopenllm_v2/suayptalha/DeepSeek-R1-Distill-Llama-3B/465bca6d-b32a-4d34-9916-fc8b3166faa0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_DeepSeek-R1-Distill-Llama-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Llama-3B", - "id": "suayptalha/DeepSeek-R1-Distill-Llama-3B", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7093 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2092 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Falcon3-Jessi-v0.4-7B-Slerp/bf138f3d-09d9-4dea-aa43-5efc804bc775.json b/data/hfopenllm_v2/suayptalha/Falcon3-Jessi-v0.4-7B-Slerp/bf138f3d-09d9-4dea-aa43-5efc804bc775.json deleted file mode 100644 index 21408cebb..000000000 --- a/data/hfopenllm_v2/suayptalha/Falcon3-Jessi-v0.4-7B-Slerp/bf138f3d-09d9-4dea-aa43-5efc804bc775.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Falcon3-Jessi-v0.4-7B-Slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-Jessi-v0.4-7B-Slerp", - "id": "suayptalha/Falcon3-Jessi-v0.4-7B-Slerp", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5591 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/HomerCreativeAnvita-Mix-Qw7B/cb4e944c-66f6-49f2-b1e0-d90454e34315.json b/data/hfopenllm_v2/suayptalha/HomerCreativeAnvita-Mix-Qw7B/cb4e944c-66f6-49f2-b1e0-d90454e34315.json deleted file mode 100644 index 52dfc2c70..000000000 --- a/data/hfopenllm_v2/suayptalha/HomerCreativeAnvita-Mix-Qw7B/cb4e944c-66f6-49f2-b1e0-d90454e34315.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_HomerCreativeAnvita-Mix-Qw7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HomerCreativeAnvita-Mix-Qw7B", - "id": "suayptalha/HomerCreativeAnvita-Mix-Qw7B", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.7808 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5565 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4445 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Komodo-Llama-3.2-3B-v2-fp16/b2b6bc49-bda1-4a3e-a071-ec0a0bdc1313.json b/data/hfopenllm_v2/suayptalha/Komodo-Llama-3.2-3B-v2-fp16/b2b6bc49-bda1-4a3e-a071-ec0a0bdc1313.json deleted file mode 100644 index 6760ae662..000000000 --- a/data/hfopenllm_v2/suayptalha/Komodo-Llama-3.2-3B-v2-fp16/b2b6bc49-bda1-4a3e-a071-ec0a0bdc1313.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Komodo-Llama-3.2-3B-v2-fp16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Komodo-Llama-3.2-3B-v2-fp16", - "id": "suayptalha/Komodo-Llama-3.2-3B-v2-fp16", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6341 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4355 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Lamarckvergence-14B/933f3d40-8726-418f-be2f-1f9686e9ab02.json b/data/hfopenllm_v2/suayptalha/Lamarckvergence-14B/933f3d40-8726-418f-be2f-1f9686e9ab02.json deleted file mode 100644 index e8329fe1f..000000000 --- a/data/hfopenllm_v2/suayptalha/Lamarckvergence-14B/933f3d40-8726-418f-be2f-1f9686e9ab02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Lamarckvergence-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarckvergence-14B", - "id": "suayptalha/Lamarckvergence-14B", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7656 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6517 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5283 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Lix-14B-v0.1/af1bf15c-7c5f-46fa-ba3a-821b521e86f4.json b/data/hfopenllm_v2/suayptalha/Lix-14B-v0.1/af1bf15c-7c5f-46fa-ba3a-821b521e86f4.json deleted file mode 100644 index 14484ddbd..000000000 --- a/data/hfopenllm_v2/suayptalha/Lix-14B-v0.1/af1bf15c-7c5f-46fa-ba3a-821b521e86f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Lix-14B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lix-14B-v0.1", - "id": "suayptalha/Lix-14B-v0.1", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6608 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5314 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Luminis-phi-4/43df4336-1eb8-4df7-8309-1199aafc07b1.json b/data/hfopenllm_v2/suayptalha/Luminis-phi-4/43df4336-1eb8-4df7-8309-1199aafc07b1.json deleted file mode 100644 index 1b20a9640..000000000 --- a/data/hfopenllm_v2/suayptalha/Luminis-phi-4/43df4336-1eb8-4df7-8309-1199aafc07b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Luminis-phi-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminis-phi-4", - "id": "suayptalha/Luminis-phi-4", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.692 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Maestro-10B/44ae222d-407c-4c8b-9b67-75440631f848.json b/data/hfopenllm_v2/suayptalha/Maestro-10B/44ae222d-407c-4c8b-9b67-75440631f848.json deleted file mode 100644 index 352d09f29..000000000 --- a/data/hfopenllm_v2/suayptalha/Maestro-10B/44ae222d-407c-4c8b-9b67-75440631f848.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Maestro-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Maestro-10B", - "id": "suayptalha/Maestro-10B", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5746 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1911 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4397 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4218 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Rombos-2.5-T.E-8.1/a87db0fe-3727-4ff1-875f-9edd3109f3a2.json b/data/hfopenllm_v2/suayptalha/Rombos-2.5-T.E-8.1/a87db0fe-3727-4ff1-875f-9edd3109f3a2.json deleted file mode 100644 index e6ce27e84..000000000 --- a/data/hfopenllm_v2/suayptalha/Rombos-2.5-T.E-8.1/a87db0fe-3727-4ff1-875f-9edd3109f3a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Rombos-2.5-T.E-8.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-2.5-T.E-8.1", - "id": "suayptalha/Rombos-2.5-T.E-8.1", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4166 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/Qmerft/0c73e33a-7f6f-4925-970b-db289069d5ca.json b/data/hfopenllm_v2/sumink/Qmerft/0c73e33a-7f6f-4925-970b-db289069d5ca.json deleted file mode 100644 index bcee9059c..000000000 --- a/data/hfopenllm_v2/sumink/Qmerft/0c73e33a-7f6f-4925-970b-db289069d5ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_Qmerft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qmerft", - "id": "sumink/Qmerft", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2939 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/Qwenftmodel/02bc7f5c-dc2f-4d8c-adcb-a89a34ff5549.json b/data/hfopenllm_v2/sumink/Qwenftmodel/02bc7f5c-dc2f-4d8c-adcb-a89a34ff5549.json deleted file mode 100644 index 4e22ca5b7..000000000 --- a/data/hfopenllm_v2/sumink/Qwenftmodel/02bc7f5c-dc2f-4d8c-adcb-a89a34ff5549.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_Qwenftmodel/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenftmodel", - "id": "sumink/Qwenftmodel", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3823 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3617 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/Qwenmplus/590c031c-2aa6-48e6-9b3f-68b1a585dd39.json b/data/hfopenllm_v2/sumink/Qwenmplus/590c031c-2aa6-48e6-9b3f-68b1a585dd39.json deleted file mode 
100644 index 9502b1af4..000000000 --- a/data/hfopenllm_v2/sumink/Qwenmplus/590c031c-2aa6-48e6-9b3f-68b1a585dd39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_Qwenmplus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenmplus", - "id": "sumink/Qwenmplus", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.204 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3828 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1992 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/Qwensci/970c9fb8-c217-444b-a025-f4d9acdd679d.json b/data/hfopenllm_v2/sumink/Qwensci/970c9fb8-c217-444b-a025-f4d9acdd679d.json deleted file mode 100644 index 0a21c2f79..000000000 --- a/data/hfopenllm_v2/sumink/Qwensci/970c9fb8-c217-444b-a025-f4d9acdd679d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_Qwensci/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwensci", - "id": "sumink/Qwensci", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3609 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/bbhqwen/07a08dd7-822b-49ac-859b-d2fc75b9c88d.json b/data/hfopenllm_v2/sumink/bbhqwen/07a08dd7-822b-49ac-859b-d2fc75b9c88d.json deleted file mode 100644 index c216d7b5c..000000000 --- a/data/hfopenllm_v2/sumink/bbhqwen/07a08dd7-822b-49ac-859b-d2fc75b9c88d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_bbhqwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbhqwen", - "id": "sumink/bbhqwen", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - 
"architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1809 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4352 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1617 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/bbhqwen2/0c0e9250-b75a-4549-9fb2-2b5c9ac2ef49.json b/data/hfopenllm_v2/sumink/bbhqwen2/0c0e9250-b75a-4549-9fb2-2b5c9ac2ef49.json deleted file mode 100644 index 1dc18132d..000000000 --- a/data/hfopenllm_v2/sumink/bbhqwen2/0c0e9250-b75a-4549-9fb2-2b5c9ac2ef49.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_bbhqwen2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbhqwen2", - "id": "sumink/bbhqwen2", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1149 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/bbhqwen3/2ae306b1-5409-4418-b5e4-50feff9dafe7.json b/data/hfopenllm_v2/sumink/bbhqwen3/2ae306b1-5409-4418-b5e4-50feff9dafe7.json deleted file mode 100644 index d43a52330..000000000 --- a/data/hfopenllm_v2/sumink/bbhqwen3/2ae306b1-5409-4418-b5e4-50feff9dafe7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_bbhqwen3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbhqwen3", - "id": "sumink/bbhqwen3", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1943 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2951 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3796 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/bbhqwen4/44bf5d75-afb2-48fa-a0fa-96d283b0ae94.json b/data/hfopenllm_v2/sumink/bbhqwen4/44bf5d75-afb2-48fa-a0fa-96d283b0ae94.json deleted file mode 100644 index 7b63e0384..000000000 --- a/data/hfopenllm_v2/sumink/bbhqwen4/44bf5d75-afb2-48fa-a0fa-96d283b0ae94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_bbhqwen4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbhqwen4", - "id": "sumink/bbhqwen4", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1449 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1509 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/bbhqwen5/e3860bb2-b2e4-4fdf-91cb-3343ad6440d7.json b/data/hfopenllm_v2/sumink/bbhqwen5/e3860bb2-b2e4-4fdf-91cb-3343ad6440d7.json deleted file mode 100644 index d06e6e52f..000000000 --- a/data/hfopenllm_v2/sumink/bbhqwen5/e3860bb2-b2e4-4fdf-91cb-3343ad6440d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_bbhqwen5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbhqwen5", - "id": "sumink/bbhqwen5", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1522 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2913 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/bbhqwen6/6369fceb-148f-4491-9488-420182a9838f.json b/data/hfopenllm_v2/sumink/bbhqwen6/6369fceb-148f-4491-9488-420182a9838f.json deleted file mode 100644 index 80f188511..000000000 --- a/data/hfopenllm_v2/sumink/bbhqwen6/6369fceb-148f-4491-9488-420182a9838f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_bbhqwen6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbhqwen6", - "id": "sumink/bbhqwen6", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2782 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1153 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/flflmillama/045c814e-a30f-4b6b-b4f4-382dee4063b7.json b/data/hfopenllm_v2/sumink/flflmillama/045c814e-a30f-4b6b-b4f4-382dee4063b7.json deleted file mode 100644 index 05e8e4568..000000000 --- a/data/hfopenllm_v2/sumink/flflmillama/045c814e-a30f-4b6b-b4f4-382dee4063b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_flflmillama/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flflmillama", - "id": "sumink/flflmillama", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2096 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/ftgpt/59d2b375-5696-47d0-9c96-1a826c08bea0.json b/data/hfopenllm_v2/sumink/ftgpt/59d2b375-5696-47d0-9c96-1a826c08bea0.json deleted file mode 100644 index 697ef4b76..000000000 --- a/data/hfopenllm_v2/sumink/ftgpt/59d2b375-5696-47d0-9c96-1a826c08bea0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_ftgpt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ftgpt", - "id": "sumink/ftgpt", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.1172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/llamaft/ff601b4f-24a1-4376-8c5e-5bda2ea88f65.json b/data/hfopenllm_v2/sumink/llamaft/ff601b4f-24a1-4376-8c5e-5bda2ea88f65.json deleted file mode 100644 index 69e1ab691..000000000 --- a/data/hfopenllm_v2/sumink/llamaft/ff601b4f-24a1-4376-8c5e-5bda2ea88f65.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_llamaft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llamaft", - "id": "sumink/llamaft", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3763 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/llamamerge/8c043ba8-f7dd-4cc8-a3b1-7201042b8dc8.json b/data/hfopenllm_v2/sumink/llamamerge/8c043ba8-f7dd-4cc8-a3b1-7201042b8dc8.json deleted file mode 100644 index f90932f9e..000000000 --- 
a/data/hfopenllm_v2/sumink/llamamerge/8c043ba8-f7dd-4cc8-a3b1-7201042b8dc8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_llamamerge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llamamerge", - "id": "sumink/llamamerge", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2672 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4632 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.259 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/llftfl7/ce27dff4-9ca7-47cb-bc18-b5dd167c72a2.json b/data/hfopenllm_v2/sumink/llftfl7/ce27dff4-9ca7-47cb-bc18-b5dd167c72a2.json deleted file mode 100644 index 454849f8b..000000000 --- a/data/hfopenllm_v2/sumink/llftfl7/ce27dff4-9ca7-47cb-bc18-b5dd167c72a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_llftfl7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llftfl7", - "id": "sumink/llftfl7", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1714 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3786 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3632 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1743 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/llmer/d69ecbfa-5036-48b8-8fed-f9162e2857f5.json b/data/hfopenllm_v2/sumink/llmer/d69ecbfa-5036-48b8-8fed-f9162e2857f5.json deleted file mode 100644 index 2b5e1209f..000000000 --- a/data/hfopenllm_v2/sumink/llmer/d69ecbfa-5036-48b8-8fed-f9162e2857f5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_llmer/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llmer", - "id": "sumink/llmer", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3191 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/qwft/b5924329-c182-482a-bee8-22fcb348281d.json b/data/hfopenllm_v2/sumink/qwft/b5924329-c182-482a-bee8-22fcb348281d.json deleted file mode 100644 index 734732a61..000000000 --- a/data/hfopenllm_v2/sumink/qwft/b5924329-c182-482a-bee8-22fcb348281d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_qwft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwft", - "id": "sumink/qwft", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1197 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3002 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3581 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/qwmer/a6a6b6f2-ac28-4c4a-806e-8abe8c7f9190.json b/data/hfopenllm_v2/sumink/qwmer/a6a6b6f2-ac28-4c4a-806e-8abe8c7f9190.json deleted file mode 100644 index 88f04b8bd..000000000 --- a/data/hfopenllm_v2/sumink/qwmer/a6a6b6f2-ac28-4c4a-806e-8abe8c7f9190.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_qwmer/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwmer", - "id": "sumink/qwmer", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4299 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2215 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/solarmer3/b904301c-d0c0-41a4-b92e-92b2d7c9c13a.json b/data/hfopenllm_v2/sumink/solarmer3/b904301c-d0c0-41a4-b92e-92b2d7c9c13a.json deleted file mode 100644 index ac7e81ef1..000000000 --- a/data/hfopenllm_v2/sumink/solarmer3/b904301c-d0c0-41a4-b92e-92b2d7c9c13a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_solarmer3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "solarmer3", - "id": "sumink/solarmer3", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3741 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on 
MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3323 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/somer/b5de0218-91dc-487a-be90-70f8bcb64803.json b/data/hfopenllm_v2/sumink/somer/b5de0218-91dc-487a-be90-70f8bcb64803.json deleted file mode 100644 index 3dadaebd8..000000000 --- a/data/hfopenllm_v2/sumink/somer/b5de0218-91dc-487a-be90-70f8bcb64803.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_somer/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "somer", - "id": "sumink/somer", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.465 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/somer2/3870f65b-3429-45c2-846f-6af30155a78b.json b/data/hfopenllm_v2/sumink/somer2/3870f65b-3429-45c2-846f-6af30155a78b.json deleted file mode 100644 index f13d9325b..000000000 --- a/data/hfopenllm_v2/sumink/somer2/3870f65b-3429-45c2-846f-6af30155a78b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_somer2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "somer2", - "id": "sumink/somer2", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3132 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4663 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3433 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/somerft/d6c33a51-be09-4cb5-9942-4348668d3e5e.json b/data/hfopenllm_v2/sumink/somerft/d6c33a51-be09-4cb5-9942-4348668d3e5e.json deleted file mode 100644 index fd584a2ba..000000000 --- a/data/hfopenllm_v2/sumink/somerft/d6c33a51-be09-4cb5-9942-4348668d3e5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_somerft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "somerft", - "id": "sumink/somerft", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1431 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sunbaby/BrainCog-8B-0.1-Instruct/1ccd36ee-445a-4861-8835-d602973148fc.json b/data/hfopenllm_v2/sunbaby/BrainCog-8B-0.1-Instruct/1ccd36ee-445a-4861-8835-d602973148fc.json deleted file mode 100644 index af3751a29..000000000 --- a/data/hfopenllm_v2/sunbaby/BrainCog-8B-0.1-Instruct/1ccd36ee-445a-4861-8835-d602973148fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sunbaby_BrainCog-8B-0.1-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BrainCog-8B-0.1-Instruct", - "id": "sunbaby/BrainCog-8B-0.1-Instruct", - "developer": "sunbaby", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4618 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.2858 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA/4c7ef4ee-3a7e-4f15-8a4a-c5853b1c6a47.json b/data/hfopenllm_v2/swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA/4c7ef4ee-3a7e-4f15-8a4a-c5853b1c6a47.json deleted file mode 100644 index 8d0457f9c..000000000 --- a/data/hfopenllm_v2/swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA/4c7ef4ee-3a7e-4f15-8a4a-c5853b1c6a47.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/swap-uniba_LLaMAntino-3-ANITA-8B-Inst-DPO-ITA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMAntino-3-ANITA-8B-Inst-DPO-ITA", - "id": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA", - "developer": "swap-uniba", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4815 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/synergetic/FrankenQwen2.5-14B/6a69202c-1c68-43e4-bd45-bbc2ff2db743.json b/data/hfopenllm_v2/synergetic/FrankenQwen2.5-14B/6a69202c-1c68-43e4-bd45-bbc2ff2db743.json deleted file mode 100644 index 5de286878..000000000 --- a/data/hfopenllm_v2/synergetic/FrankenQwen2.5-14B/6a69202c-1c68-43e4-bd45-bbc2ff2db743.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/synergetic_FrankenQwen2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FrankenQwen2.5-14B", - "id": "synergetic/FrankenQwen2.5-14B", - "developer": "synergetic", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 16.972 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1869 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6048 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/talha2001/Beast-Soul-new/a053d6a3-05d4-4d0b-a9b8-7865cf7ac612.json b/data/hfopenllm_v2/talha2001/Beast-Soul-new/a053d6a3-05d4-4d0b-a9b8-7865cf7ac612.json deleted file mode 100644 index 3ca4e52f3..000000000 --- 
a/data/hfopenllm_v2/talha2001/Beast-Soul-new/a053d6a3-05d4-4d0b-a9b8-7865cf7ac612.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/talha2001_Beast-Soul-new/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Beast-Soul-new", - "id": "talha2001/Beast-Soul-new", - "developer": "talha2001", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4854 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5227 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3102 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.1-instruct/f76d3d30-4fce-48a9-a26b-7d714fff1d29.json b/data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.1-instruct/f76d3d30-4fce-48a9-a26b-7d714fff1d29.json deleted file mode 100644 index 2cab635d2..000000000 --- a/data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.1-instruct/f76d3d30-4fce-48a9-a26b-7d714fff1d29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/tangledgroup_tangled-llama-pints-1.5b-v0.1-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tangled-llama-pints-1.5b-v0.1-instruct", - "id": "tangledgroup/tangled-llama-pints-1.5b-v0.1-instruct", - "developer": "tangledgroup", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.2-instruct/eb38a092-1b56-4348-8188-baa2243f7046.json b/data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.2-instruct/eb38a092-1b56-4348-8188-baa2243f7046.json deleted file mode 100644 index 306584ab5..000000000 --- a/data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.2-instruct/eb38a092-1b56-4348-8188-baa2243f7046.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tangledgroup_tangled-llama-pints-1.5b-v0.2-instruct/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tangled-llama-pints-1.5b-v0.2-instruct", - "id": "tangledgroup/tangled-llama-pints-1.5b-v0.2-instruct", - "developer": "tangledgroup", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1724 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3158 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3643 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/1c4cfb94-fc66-4fe2-9879-78683abe654f.json b/data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/1c4cfb94-fc66-4fe2-9879-78683abe654f.json deleted file mode 100644 index c4123e916..000000000 --- a/data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/1c4cfb94-fc66-4fe2-9879-78683abe654f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tanliboy_lambda-gemma-2-9b-dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" 
- }, - "model_info": { - "name": "lambda-gemma-2-9b-dpo", - "id": "tanliboy/lambda-gemma-2-9b-dpo", - "developer": "tanliboy", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4501 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5472 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/2deef730-c37b-46ca-82b7-de38ae724fd4.json b/data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/2deef730-c37b-46ca-82b7-de38ae724fd4.json deleted file mode 100644 index a0e743c57..000000000 --- a/data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/2deef730-c37b-46ca-82b7-de38ae724fd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tanliboy_lambda-gemma-2-9b-dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lambda-gemma-2-9b-dpo", - "id": "tanliboy/lambda-gemma-2-9b-dpo", - "developer": "tanliboy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1829 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4056 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3805 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tanliboy/lambda-qwen2.5-14b-dpo-test/13a92beb-a8a4-4853-b2f5-1b09d3e2a64a.json b/data/hfopenllm_v2/tanliboy/lambda-qwen2.5-14b-dpo-test/13a92beb-a8a4-4853-b2f5-1b09d3e2a64a.json deleted file mode 100644 index 0ef9f7333..000000000 --- a/data/hfopenllm_v2/tanliboy/lambda-qwen2.5-14b-dpo-test/13a92beb-a8a4-4853-b2f5-1b09d3e2a64a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tanliboy_lambda-qwen2.5-14b-dpo-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lambda-qwen2.5-14b-dpo-test", - "id": "tanliboy/lambda-qwen2.5-14b-dpo-test", - "developer": "tanliboy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8231 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6394 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4848 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tanliboy/lambda-qwen2.5-32b-dpo-test/36cf5b59-5369-4baf-80c1-3a47678eb5cb.json b/data/hfopenllm_v2/tanliboy/lambda-qwen2.5-32b-dpo-test/36cf5b59-5369-4baf-80c1-3a47678eb5cb.json deleted file mode 100644 index ca6c94a3a..000000000 --- a/data/hfopenllm_v2/tanliboy/lambda-qwen2.5-32b-dpo-test/36cf5b59-5369-4baf-80c1-3a47678eb5cb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tanliboy_lambda-qwen2.5-32b-dpo-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lambda-qwen2.5-32b-dpo-test", - "id": "tanliboy/lambda-qwen2.5-32b-dpo-test", - "developer": "tanliboy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.8084 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6764 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4274 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5657 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tannedbum/Ellaria-9B/fced3ef1-fb69-47fe-bf68-3efe72db3142.json b/data/hfopenllm_v2/tannedbum/Ellaria-9B/fced3ef1-fb69-47fe-bf68-3efe72db3142.json deleted file mode 100644 index 769518cca..000000000 --- a/data/hfopenllm_v2/tannedbum/Ellaria-9B/fced3ef1-fb69-47fe-bf68-3efe72db3142.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tannedbum_Ellaria-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ellaria-9B", - "id": "tannedbum/Ellaria-9B", - "developer": "tannedbum", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7826 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5942 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4151 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tannedbum/L3-Nymeria-Maid-8B/7a83d75a-332e-476a-b0f7-986b2ec9cc5d.json b/data/hfopenllm_v2/tannedbum/L3-Nymeria-Maid-8B/7a83d75a-332e-476a-b0f7-986b2ec9cc5d.json deleted file mode 100644 index f95541ce4..000000000 --- a/data/hfopenllm_v2/tannedbum/L3-Nymeria-Maid-8B/7a83d75a-332e-476a-b0f7-986b2ec9cc5d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tannedbum_L3-Nymeria-Maid-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Nymeria-Maid-8B", - "id": "tannedbum/L3-Nymeria-Maid-8B", - "developer": "tannedbum", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5146 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3751 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3747 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tannedbum/L3-Nymeria-v2-8B/6f413d72-cd9f-435c-b13e-9cec14edeb5c.json b/data/hfopenllm_v2/tannedbum/L3-Nymeria-v2-8B/6f413d72-cd9f-435c-b13e-9cec14edeb5c.json deleted file mode 100644 index c7b09040b..000000000 --- a/data/hfopenllm_v2/tannedbum/L3-Nymeria-v2-8B/6f413d72-cd9f-435c-b13e-9cec14edeb5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tannedbum_L3-Nymeria-v2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Nymeria-v2-8B", - "id": "tannedbum/L3-Nymeria-v2-8B", - "developer": "tannedbum", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7168 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } 
- }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tannedbum/L3-Rhaenys-8B/a7822bbf-bc23-437d-8e5b-32fb06d3a9ec.json b/data/hfopenllm_v2/tannedbum/L3-Rhaenys-8B/a7822bbf-bc23-437d-8e5b-32fb06d3a9ec.json deleted file mode 100644 index c71fb151f..000000000 --- a/data/hfopenllm_v2/tannedbum/L3-Rhaenys-8B/a7822bbf-bc23-437d-8e5b-32fb06d3a9ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tannedbum_L3-Rhaenys-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Rhaenys-8B", - "id": "tannedbum/L3-Rhaenys-8B", - "developer": "tannedbum", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7363 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5299 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/teknium/CollectiveCognition-v1.1-Mistral-7B/0b19508c-4996-4fb7-b0e0-9fa952854fa3.json b/data/hfopenllm_v2/teknium/CollectiveCognition-v1.1-Mistral-7B/0b19508c-4996-4fb7-b0e0-9fa952854fa3.json deleted file mode 100644 index df3839754..000000000 --- a/data/hfopenllm_v2/teknium/CollectiveCognition-v1.1-Mistral-7B/0b19508c-4996-4fb7-b0e0-9fa952854fa3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/teknium_CollectiveCognition-v1.1-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CollectiveCognition-v1.1-Mistral-7B", - "id": "teknium/CollectiveCognition-v1.1-Mistral-7B", - "developer": "teknium", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.279 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2837 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/teknium/OpenHermes-13B/447c22c1-8929-420f-b59b-01ab32a22281.json b/data/hfopenllm_v2/teknium/OpenHermes-13B/447c22c1-8929-420f-b59b-01ab32a22281.json deleted file mode 100644 index 217a4b6f0..000000000 --- a/data/hfopenllm_v2/teknium/OpenHermes-13B/447c22c1-8929-420f-b59b-01ab32a22281.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/teknium_OpenHermes-13B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenHermes-13B", - "id": "teknium/OpenHermes-13B", - "developer": "teknium", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4206 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - 
}, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/teknium/OpenHermes-2-Mistral-7B/ab3dbe43-658e-4c8a-a399-b3d070d467ba.json b/data/hfopenllm_v2/teknium/OpenHermes-2-Mistral-7B/ab3dbe43-658e-4c8a-a399-b3d070d467ba.json deleted file mode 100644 index 75e18465b..000000000 --- a/data/hfopenllm_v2/teknium/OpenHermes-2-Mistral-7B/ab3dbe43-658e-4c8a-a399-b3d070d467ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/teknium_OpenHermes-2-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenHermes-2-Mistral-7B", - "id": "teknium/OpenHermes-2-Mistral-7B", - "developer": "teknium", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5286 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4948 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2931 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/teknium/OpenHermes-2.5-Mistral-7B/ee5c87a4-aa06-4728-a9bf-2fc35284b987.json b/data/hfopenllm_v2/teknium/OpenHermes-2.5-Mistral-7B/ee5c87a4-aa06-4728-a9bf-2fc35284b987.json deleted file mode 100644 index 1bb9bc4d1..000000000 --- a/data/hfopenllm_v2/teknium/OpenHermes-2.5-Mistral-7B/ee5c87a4-aa06-4728-a9bf-2fc35284b987.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/teknium_OpenHermes-2.5-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenHermes-2.5-Mistral-7B", - "id": "teknium/OpenHermes-2.5-Mistral-7B", - "developer": "teknium", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5571 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.487 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/teknium/OpenHermes-7B/6a1a58f6-e399-4ac3-a516-f02a37b6ff68.json b/data/hfopenllm_v2/teknium/OpenHermes-7B/6a1a58f6-e399-4ac3-a516-f02a37b6ff68.json deleted file mode 100644 index 58a599dbe..000000000 --- a/data/hfopenllm_v2/teknium/OpenHermes-7B/6a1a58f6-e399-4ac3-a516-f02a37b6ff68.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/teknium_OpenHermes-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenHermes-7B", - "id": "teknium/OpenHermes-7B", - "developer": "teknium", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.362 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4324 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1933 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v1/9e2bfd77-b73e-436f-ad50-ccfd379cd3f2.json b/data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v1/9e2bfd77-b73e-436f-ad50-ccfd379cd3f2.json deleted file mode 100644 index 052c9b9d5..000000000 --- 
a/data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v1/9e2bfd77-b73e-436f-ad50-ccfd379cd3f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_falcon3-10b-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon3-10b-tensopolis-v1", - "id": "tensopolis/falcon3-10b-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7817 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.442 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v2/100cf60a-c43c-4b3a-a667-a45cffdd562a.json b/data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v2/100cf60a-c43c-4b3a-a667-a45cffdd562a.json deleted file mode 100644 index 67f93fc67..000000000 --- a/data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v2/100cf60a-c43c-4b3a-a667-a45cffdd562a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/tensopolis_falcon3-10b-tensopolis-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon3-10b-tensopolis-v2", - "id": "tensopolis/falcon3-10b-tensopolis-v2", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7792 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2666 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4297 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/lamarckvergence-14b-tensopolis-v1/2088fca7-11d7-47de-808d-d47da0caad0f.json b/data/hfopenllm_v2/tensopolis/lamarckvergence-14b-tensopolis-v1/2088fca7-11d7-47de-808d-d47da0caad0f.json deleted file mode 100644 index 4284ac283..000000000 --- a/data/hfopenllm_v2/tensopolis/lamarckvergence-14b-tensopolis-v1/2088fca7-11d7-47de-808d-d47da0caad0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_lamarckvergence-14b-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM 
v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lamarckvergence-14b-tensopolis-v1", - "id": "tensopolis/lamarckvergence-14b-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6561 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/mistral-small-2501-tensopolis-v1/bf0b3560-9d38-406a-ad30-5fd157f0fe43.json b/data/hfopenllm_v2/tensopolis/mistral-small-2501-tensopolis-v1/bf0b3560-9d38-406a-ad30-5fd157f0fe43.json deleted file mode 100644 index 26e8accb5..000000000 --- a/data/hfopenllm_v2/tensopolis/mistral-small-2501-tensopolis-v1/bf0b3560-9d38-406a-ad30-5fd157f0fe43.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_mistral-small-2501-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"mistral-small-2501-tensopolis-v1", - "id": "tensopolis/mistral-small-2501-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6475 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4465 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/mistral-small-r1-tensopolis/9ce12fbc-00f7-4cc8-bd9d-67ead83a0801.json b/data/hfopenllm_v2/tensopolis/mistral-small-r1-tensopolis/9ce12fbc-00f7-4cc8-bd9d-67ead83a0801.json deleted file mode 100644 index 1b98fd82a..000000000 --- a/data/hfopenllm_v2/tensopolis/mistral-small-r1-tensopolis/9ce12fbc-00f7-4cc8-bd9d-67ead83a0801.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_mistral-small-r1-tensopolis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-small-r1-tensopolis", - "id": "tensopolis/mistral-small-r1-tensopolis", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5436 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2908 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4035 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/phi-4-tensopolis-v1/14501de3-dac0-44af-8c17-7abcd9bbba8b.json b/data/hfopenllm_v2/tensopolis/phi-4-tensopolis-v1/14501de3-dac0-44af-8c17-7abcd9bbba8b.json deleted file mode 100644 index 498b1170f..000000000 --- a/data/hfopenllm_v2/tensopolis/phi-4-tensopolis-v1/14501de3-dac0-44af-8c17-7abcd9bbba8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_phi-4-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-tensopolis-v1", - "id": "tensopolis/phi-4-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", 
- "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6767 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6872 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/qwen2.5-14b-tensopolis-v1/c9db8ce4-6f0d-4c13-8484-6fca9e9c3798.json b/data/hfopenllm_v2/tensopolis/qwen2.5-14b-tensopolis-v1/c9db8ce4-6f0d-4c13-8484-6fca9e9c3798.json deleted file mode 100644 index 32b5e1f0e..000000000 --- a/data/hfopenllm_v2/tensopolis/qwen2.5-14b-tensopolis-v1/c9db8ce4-6f0d-4c13-8484-6fca9e9c3798.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_qwen2.5-14b-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-14b-tensopolis-v1", - "id": "tensopolis/qwen2.5-14b-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.799 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6364 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4193 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4911 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/qwen2.5-3b-or1-tensopolis/8c6c06be-bbc6-4307-ba5b-336dc2bb466f.json b/data/hfopenllm_v2/tensopolis/qwen2.5-3b-or1-tensopolis/8c6c06be-bbc6-4307-ba5b-336dc2bb466f.json deleted file mode 100644 index a7e6952b8..000000000 --- a/data/hfopenllm_v2/tensopolis/qwen2.5-3b-or1-tensopolis/8c6c06be-bbc6-4307-ba5b-336dc2bb466f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_qwen2.5-3b-or1-tensopolis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-3b-or1-tensopolis", - "id": "tensopolis/qwen2.5-3b-or1-tensopolis", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3749 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v1/1326ff61-d0b4-46eb-9bcf-f978166e622b.json b/data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v1/1326ff61-d0b4-46eb-9bcf-f978166e622b.json deleted file mode 100644 index 2517471e4..000000000 --- a/data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v1/1326ff61-d0b4-46eb-9bcf-f978166e622b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_qwen2.5-7b-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-7b-tensopolis-v1", - "id": "tensopolis/qwen2.5-7b-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7661 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5379 - } - 
}, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4269 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v2/4c9e829f-7a99-4d61-8730-7457215a4fd6.json b/data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v2/4c9e829f-7a99-4d61-8730-7457215a4fd6.json deleted file mode 100644 index eb25ba8f2..000000000 --- a/data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v2/4c9e829f-7a99-4d61-8730-7457215a4fd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_qwen2.5-7b-tensopolis-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-7b-tensopolis-v2", - "id": "tensopolis/qwen2.5-7b-tensopolis-v2", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7521 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4819 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4246 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v1/afc24d42-6d25-4036-8f22-fcf944b481b7.json b/data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v1/afc24d42-6d25-4036-8f22-fcf944b481b7.json deleted file mode 100644 index 03ab187f1..000000000 --- a/data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v1/afc24d42-6d25-4036-8f22-fcf944b481b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_virtuoso-lite-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "virtuoso-lite-tensopolis-v1", - "id": "tensopolis/virtuoso-lite-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8069 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6102 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2545 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4582 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4435 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v2/6f6db681-991e-408b-8d4e-71fff9e1c974.json b/data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v2/6f6db681-991e-408b-8d4e-71fff9e1c974.json deleted file mode 100644 index 21561cd85..000000000 --- a/data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v2/6f6db681-991e-408b-8d4e-71fff9e1c974.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_virtuoso-lite-tensopolis-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "virtuoso-lite-tensopolis-v2", - "id": "tensopolis/virtuoso-lite-tensopolis-v2", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8029 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v1/f3fa76bf-f11c-4dee-9b9f-00f1ec793dac.json b/data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v1/f3fa76bf-f11c-4dee-9b9f-00f1ec793dac.json deleted file mode 100644 index 636b8d846..000000000 --- a/data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v1/f3fa76bf-f11c-4dee-9b9f-00f1ec793dac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_virtuoso-small-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "virtuoso-small-tensopolis-v1", - "id": "tensopolis/virtuoso-small-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - 
} - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v2/77b457d9-4957-4f0d-a8d3-e005ae382239.json b/data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v2/77b457d9-4957-4f0d-a8d3-e005ae382239.json deleted file mode 100644 index 243e5f94d..000000000 --- a/data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v2/77b457d9-4957-4f0d-a8d3-e005ae382239.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_virtuoso-small-tensopolis-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "virtuoso-small-tensopolis-v2", - "id": "tensopolis/virtuoso-small-tensopolis-v2", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4352 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/virtuoso-small-v2-tensopolis-v1/11474a7a-73a6-4a3f-8bcb-bef783e12a2b.json b/data/hfopenllm_v2/tensopolis/virtuoso-small-v2-tensopolis-v1/11474a7a-73a6-4a3f-8bcb-bef783e12a2b.json deleted file mode 100644 index 24a089280..000000000 --- a/data/hfopenllm_v2/tensopolis/virtuoso-small-v2-tensopolis-v1/11474a7a-73a6-4a3f-8bcb-bef783e12a2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_virtuoso-small-v2-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "virtuoso-small-v2-tensopolis-v1", - "id": "tensopolis/virtuoso-small-v2-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8419 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6545 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.4509 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5175 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensoropera/Fox-1-1.6B/23cc1e7f-0994-43a5-8403-5361a2976285.json b/data/hfopenllm_v2/tensoropera/Fox-1-1.6B/23cc1e7f-0994-43a5-8403-5361a2976285.json deleted file mode 100644 index fa1deaf6f..000000000 --- a/data/hfopenllm_v2/tensoropera/Fox-1-1.6B/23cc1e7f-0994-43a5-8403-5361a2976285.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensoropera_Fox-1-1.6B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fox-1-1.6B", - "id": "tensoropera/Fox-1-1.6B", - "developer": "tensoropera", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.665 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2766 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3307 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tenyx/Llama3-TenyxChat-70B/88c257d3-d5c1-4e1f-bbc8-9fc6bd65e15e.json b/data/hfopenllm_v2/tenyx/Llama3-TenyxChat-70B/88c257d3-d5c1-4e1f-bbc8-9fc6bd65e15e.json deleted file mode 100644 index f42d9a03a..000000000 --- a/data/hfopenllm_v2/tenyx/Llama3-TenyxChat-70B/88c257d3-d5c1-4e1f-bbc8-9fc6bd65e15e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tenyx_Llama3-TenyxChat-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-TenyxChat-70B", - "id": "tenyx/Llama3-TenyxChat-70B", - "developer": "tenyx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8087 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6511 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2356 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.521 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/theo77186/Qwen2.5-Coder-7B-Instruct-20241106/ec4c2032-8fc0-448a-a7c4-ee9b35b642db.json b/data/hfopenllm_v2/theo77186/Qwen2.5-Coder-7B-Instruct-20241106/ec4c2032-8fc0-448a-a7c4-ee9b35b642db.json deleted file mode 100644 index 21cc408b5..000000000 --- a/data/hfopenllm_v2/theo77186/Qwen2.5-Coder-7B-Instruct-20241106/ec4c2032-8fc0-448a-a7c4-ee9b35b642db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theo77186_Qwen2.5-Coder-7B-Instruct-20241106/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-7B-Instruct-20241106", - "id": "theo77186/Qwen2.5-Coder-7B-Instruct-20241106", - "developer": "theo77186", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/Boptruth-Agatha-7B/3c7ac4de-1456-4afb-b7ac-07beb6cb4d39.json 
b/data/hfopenllm_v2/theprint/Boptruth-Agatha-7B/3c7ac4de-1456-4afb-b7ac-07beb6cb4d39.json deleted file mode 100644 index 096a756a8..000000000 --- a/data/hfopenllm_v2/theprint/Boptruth-Agatha-7B/3c7ac4de-1456-4afb-b7ac-07beb6cb4d39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_Boptruth-Agatha-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Boptruth-Agatha-7B", - "id": "theprint/Boptruth-Agatha-7B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/CleverBoi-7B-v2/a06ad94f-13ee-466c-b25f-87cd87012678.json b/data/hfopenllm_v2/theprint/CleverBoi-7B-v2/a06ad94f-13ee-466c-b25f-87cd87012678.json deleted file mode 100644 index 1d1af6ab3..000000000 --- a/data/hfopenllm_v2/theprint/CleverBoi-7B-v2/a06ad94f-13ee-466c-b25f-87cd87012678.json +++ /dev/null @@ -1,132 +0,0 @@ 
-{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_CleverBoi-7B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CleverBoi-7B-v2", - "id": "theprint/CleverBoi-7B-v2", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 7.736 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.217 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4695 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2709 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/CleverBoi-7B-v3/9e1ca6d0-d2b2-48c5-acc2-ad299ce02e1f.json b/data/hfopenllm_v2/theprint/CleverBoi-7B-v3/9e1ca6d0-d2b2-48c5-acc2-ad299ce02e1f.json deleted file mode 100644 index 15f7a9b45..000000000 --- a/data/hfopenllm_v2/theprint/CleverBoi-7B-v3/9e1ca6d0-d2b2-48c5-acc2-ad299ce02e1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_CleverBoi-7B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", 
- "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CleverBoi-7B-v3", - "id": "theprint/CleverBoi-7B-v3", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 7.736 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4414 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4072 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2868 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-Instruct/7dcd6e37-3685-4b08-b983-b2a711aeaf73.json b/data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-Instruct/7dcd6e37-3685-4b08-b983-b2a711aeaf73.json deleted file mode 100644 index a63c67af4..000000000 --- a/data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-Instruct/7dcd6e37-3685-4b08-b983-b2a711aeaf73.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_CleverBoi-Llama-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CleverBoi-Llama-3.1-8B-Instruct", - "id": "theprint/CleverBoi-Llama-3.1-8B-Instruct", - "developer": "theprint", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 16.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1682 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4014 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3075 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-v2/b1ae6801-0139-41d3-85dc-102ad5cc4c6a.json b/data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-v2/b1ae6801-0139-41d3-85dc-102ad5cc4c6a.json deleted file mode 100644 index f1cf9de8b..000000000 --- a/data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-v2/b1ae6801-0139-41d3-85dc-102ad5cc4c6a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_CleverBoi-Llama-3.1-8B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CleverBoi-Llama-3.1-8B-v2", - "id": "theprint/CleverBoi-Llama-3.1-8B-v2", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 9.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4668 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3735 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/CleverBoi-Nemo-12B-v2/4cc037a2-d952-4566-a575-015f8e3a5925.json b/data/hfopenllm_v2/theprint/CleverBoi-Nemo-12B-v2/4cc037a2-d952-4566-a575-015f8e3a5925.json deleted file mode 100644 index 76b2de2d9..000000000 --- a/data/hfopenllm_v2/theprint/CleverBoi-Nemo-12B-v2/4cc037a2-d952-4566-a575-015f8e3a5925.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_CleverBoi-Nemo-12B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CleverBoi-Nemo-12B-v2", - "id": "theprint/CleverBoi-Nemo-12B-v2", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 13.933 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2046 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3228 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/Code-Llama-Bagel-8B/a1eaadae-8601-4c18-ab0c-4f6d80d3307b.json b/data/hfopenllm_v2/theprint/Code-Llama-Bagel-8B/a1eaadae-8601-4c18-ab0c-4f6d80d3307b.json deleted file mode 100644 index b1815116f..000000000 --- a/data/hfopenllm_v2/theprint/Code-Llama-Bagel-8B/a1eaadae-8601-4c18-ab0c-4f6d80d3307b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_Code-Llama-Bagel-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Code-Llama-Bagel-8B", - "id": "theprint/Code-Llama-Bagel-8B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4697 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2822 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/Conversely-Mistral-7B/40e452df-8f0a-4473-a3d1-41f9c288c12f.json b/data/hfopenllm_v2/theprint/Conversely-Mistral-7B/40e452df-8f0a-4473-a3d1-41f9c288c12f.json deleted file mode 100644 index 71ebe1082..000000000 --- a/data/hfopenllm_v2/theprint/Conversely-Mistral-7B/40e452df-8f0a-4473-a3d1-41f9c288c12f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_Conversely-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Conversely-Mistral-7B", - "id": "theprint/Conversely-Mistral-7B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 14.496 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2608 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4189 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2826 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/Llama-3.2-3B-VanRossum/216020ac-276b-436e-815b-d6968eb83770.json b/data/hfopenllm_v2/theprint/Llama-3.2-3B-VanRossum/216020ac-276b-436e-815b-d6968eb83770.json deleted file mode 100644 index aa1889b1c..000000000 --- a/data/hfopenllm_v2/theprint/Llama-3.2-3B-VanRossum/216020ac-276b-436e-815b-d6968eb83770.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_Llama-3.2-3B-VanRossum/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-VanRossum", - "id": "theprint/Llama-3.2-3B-VanRossum", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 3.696 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4783 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3442 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.277 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/ReWiz-7B/1bb4aeac-a5e1-4fd7-9e70-64fdcfc600cd.json b/data/hfopenllm_v2/theprint/ReWiz-7B/1bb4aeac-a5e1-4fd7-9e70-64fdcfc600cd.json deleted file mode 100644 index 28eca97a3..000000000 --- a/data/hfopenllm_v2/theprint/ReWiz-7B/1bb4aeac-a5e1-4fd7-9e70-64fdcfc600cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_ReWiz-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReWiz-7B", - "id": "theprint/ReWiz-7B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 7.736 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4564 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4612 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.267 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/ReWiz-Llama-3.1-8B-v2/25739611-f690-41b4-87de-9f4ea8b3d815.json b/data/hfopenllm_v2/theprint/ReWiz-Llama-3.1-8B-v2/25739611-f690-41b4-87de-9f4ea8b3d815.json deleted file mode 100644 index f45735570..000000000 --- a/data/hfopenllm_v2/theprint/ReWiz-Llama-3.1-8B-v2/25739611-f690-41b4-87de-9f4ea8b3d815.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_ReWiz-Llama-3.1-8B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReWiz-Llama-3.1-8B-v2", - "id": "theprint/ReWiz-Llama-3.1-8B-v2", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 9.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4632 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/ReWiz-Llama-3.2-3B/b8c27fdd-5b35-41ab-8a35-b5a48f27cceb.json b/data/hfopenllm_v2/theprint/ReWiz-Llama-3.2-3B/b8c27fdd-5b35-41ab-8a35-b5a48f27cceb.json deleted file mode 100644 index e0c5c3d8e..000000000 --- a/data/hfopenllm_v2/theprint/ReWiz-Llama-3.2-3B/b8c27fdd-5b35-41ab-8a35-b5a48f27cceb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_ReWiz-Llama-3.2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReWiz-Llama-3.2-3B", - "id": "theprint/ReWiz-Llama-3.2-3B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3614 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2887 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/ReWiz-Nemo-12B-Instruct/fa237949-c3ac-482a-8a54-5a2019f24016.json b/data/hfopenllm_v2/theprint/ReWiz-Nemo-12B-Instruct/fa237949-c3ac-482a-8a54-5a2019f24016.json deleted file mode 100644 index 121036620..000000000 --- a/data/hfopenllm_v2/theprint/ReWiz-Nemo-12B-Instruct/fa237949-c3ac-482a-8a54-5a2019f24016.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_ReWiz-Nemo-12B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReWiz-Nemo-12B-Instruct", - "id": "theprint/ReWiz-Nemo-12B-Instruct", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/ReWiz-Qwen-2.5-14B/b60dd828-a3e7-46a8-b4c2-322aeca42faf.json b/data/hfopenllm_v2/theprint/ReWiz-Qwen-2.5-14B/b60dd828-a3e7-46a8-b4c2-322aeca42faf.json deleted file mode 100644 index c794f72d9..000000000 --- a/data/hfopenllm_v2/theprint/ReWiz-Qwen-2.5-14B/b60dd828-a3e7-46a8-b4c2-322aeca42faf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_ReWiz-Qwen-2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReWiz-Qwen-2.5-14B", - "id": "theprint/ReWiz-Qwen-2.5-14B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 16.743 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6179 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5092 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/ReWiz-Worldbuilder-7B/5de9f914-333f-4181-a93f-79257a3daf54.json 
b/data/hfopenllm_v2/theprint/ReWiz-Worldbuilder-7B/5de9f914-333f-4181-a93f-79257a3daf54.json deleted file mode 100644 index 499adb749..000000000 --- a/data/hfopenllm_v2/theprint/ReWiz-Worldbuilder-7B/5de9f914-333f-4181-a93f-79257a3daf54.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_ReWiz-Worldbuilder-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReWiz-Worldbuilder-7B", - "id": "theprint/ReWiz-Worldbuilder-7B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.251 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4636 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2971 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/RuDolph-Hermes-7B/e2d23da4-226a-4a02-8390-e8edaea4b65b.json b/data/hfopenllm_v2/theprint/RuDolph-Hermes-7B/e2d23da4-226a-4a02-8390-e8edaea4b65b.json deleted file mode 100644 index b8a758ee2..000000000 --- a/data/hfopenllm_v2/theprint/RuDolph-Hermes-7B/e2d23da4-226a-4a02-8390-e8edaea4b65b.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_RuDolph-Hermes-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RuDolph-Hermes-7B", - "id": "theprint/RuDolph-Hermes-7B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5053 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3073 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/WorldBuilder-12B/c64c7470-dcf9-46f8-b789-cab7e902739d.json b/data/hfopenllm_v2/theprint/WorldBuilder-12B/c64c7470-dcf9-46f8-b789-cab7e902739d.json deleted file mode 100644 index 70af1d5c7..000000000 --- a/data/hfopenllm_v2/theprint/WorldBuilder-12B/c64c7470-dcf9-46f8-b789-cab7e902739d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_WorldBuilder-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WorldBuilder-12B", - "id": "theprint/WorldBuilder-12B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 13.933 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1374 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.501 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4066 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3192 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/phi-3-mini-4k-python/f6d727a3-19dc-4173-a88f-2c47449896aa.json b/data/hfopenllm_v2/theprint/phi-3-mini-4k-python/f6d727a3-19dc-4173-a88f-2c47449896aa.json deleted file mode 100644 index e5805f869..000000000 --- a/data/hfopenllm_v2/theprint/phi-3-mini-4k-python/f6d727a3-19dc-4173-a88f-2c47449896aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_phi-3-mini-4k-python/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-3-mini-4k-python", - "id": "theprint/phi-3-mini-4k-python", - "developer": "theprint", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 4.132 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2409 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4938 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3577 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/thinkcoder/llama3-8b-instruct-lora-8-sft/490d14c8-2cb0-4328-9f41-6074b28d6fdc.json b/data/hfopenllm_v2/thinkcoder/llama3-8b-instruct-lora-8-sft/490d14c8-2cb0-4328-9f41-6074b28d6fdc.json deleted file mode 100644 index 440a5ef62..000000000 --- a/data/hfopenllm_v2/thinkcoder/llama3-8b-instruct-lora-8-sft/490d14c8-2cb0-4328-9f41-6074b28d6fdc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/thinkcoder_llama3-8b-instruct-lora-8-sft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3-8b-instruct-lora-8-sft", - "id": "thinkcoder/llama3-8b-instruct-lora-8-sft", - "developer": "thinkcoder", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.648 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3476 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/thirdeyeai/elevate360m/9351b079-7ef5-42ec-bb83-f0d8ec7de479.json b/data/hfopenllm_v2/thirdeyeai/elevate360m/9351b079-7ef5-42ec-bb83-f0d8ec7de479.json deleted file mode 100644 index 2f88fa8a2..000000000 --- a/data/hfopenllm_v2/thirdeyeai/elevate360m/9351b079-7ef5-42ec-bb83-f0d8ec7de479.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/thirdeyeai_elevate360m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "elevate360m", - "id": "thirdeyeai/elevate360m", - "developer": "thirdeyeai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2963 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3462 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1077 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-1_5B/852d5adb-f422-4102-8114-082ab0b3c07d.json b/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-1_5B/852d5adb-f422-4102-8114-082ab0b3c07d.json deleted file mode 100644 index d16c6d65e..000000000 --- a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-1_5B/852d5adb-f422-4102-8114-082ab0b3c07d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/thomas-yanxin_XinYuan-Qwen2-1_5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "XinYuan-Qwen2-1_5B", - "id": "thomas-yanxin/XinYuan-Qwen2-1_5B", - "developer": "thomas-yanxin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3634 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B-0917/c64e98cd-c022-4834-a3e0-3949416d1fb1.json b/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B-0917/c64e98cd-c022-4834-a3e0-3949416d1fb1.json deleted file mode 100644 index 075ddb870..000000000 --- a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B-0917/c64e98cd-c022-4834-a3e0-3949416d1fb1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/thomas-yanxin_XinYuan-Qwen2-7B-0917/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "XinYuan-Qwen2-7B-0917", - "id": "thomas-yanxin/XinYuan-Qwen2-7B-0917", - "developer": "thomas-yanxin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3719 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5169 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B/f101bd15-ac61-49d4-beac-c89bc889b34b.json b/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B/f101bd15-ac61-49d4-beac-c89bc889b34b.json deleted file mode 100644 index eee3e1e34..000000000 --- a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B/f101bd15-ac61-49d4-beac-c89bc889b34b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/thomas-yanxin_XinYuan-Qwen2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "XinYuan-Qwen2-7B", - "id": "thomas-yanxin/XinYuan-Qwen2-7B", - "developer": "thomas-yanxin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4438 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on 
MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4058 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3925 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2.5-7B-0917/11caf1c1-e2a0-4abb-bb0e-d06853a06e4d.json b/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2.5-7B-0917/11caf1c1-e2a0-4abb-bb0e-d06853a06e4d.json deleted file mode 100644 index ff36d37cb..000000000 --- a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2.5-7B-0917/11caf1c1-e2a0-4abb-bb0e-d06853a06e4d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/thomas-yanxin_XinYuan-Qwen2.5-7B-0917/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "XinYuan-Qwen2.5-7B-0917", - "id": "thomas-yanxin/XinYuan-Qwen2.5-7B-0917", - "developer": "thomas-yanxin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3577 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5184 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tianyil1/MistralForCausalLM_Cal_DPO/f0b57a60-8402-4430-93f3-b846a94113f2.json b/data/hfopenllm_v2/tianyil1/MistralForCausalLM_Cal_DPO/f0b57a60-8402-4430-93f3-b846a94113f2.json deleted file mode 100644 index 06e058942..000000000 --- a/data/hfopenllm_v2/tianyil1/MistralForCausalLM_Cal_DPO/f0b57a60-8402-4430-93f3-b846a94113f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tianyil1_MistralForCausalLM_Cal_DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MistralForCausalLM_Cal_DPO", - "id": "tianyil1/MistralForCausalLM_Cal_DPO", - "developer": "tianyil1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2763 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-10B-Base/50aa8077-4493-47a9-9cec-014c56343ecf.json b/data/hfopenllm_v2/tiiuae/Falcon3-10B-Base/50aa8077-4493-47a9-9cec-014c56343ecf.json deleted file mode 100644 index 35ab332bf..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-10B-Base/50aa8077-4493-47a9-9cec-014c56343ecf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-10B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-10B-Base", - "id": "tiiuae/Falcon3-10B-Base", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-10B-Instruct/5e70d00b-c822-4ad6-afe8-3756a7038c57.json b/data/hfopenllm_v2/tiiuae/Falcon3-10B-Instruct/5e70d00b-c822-4ad6-afe8-3756a7038c57.json deleted file mode 100644 index 968a3c904..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-10B-Instruct/5e70d00b-c822-4ad6-afe8-3756a7038c57.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-10B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-10B-Instruct", - "id": "tiiuae/Falcon3-10B-Instruct", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7817 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.617 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-1B-Base/8162ba41-e630-470f-a297-72fb9f2110fd.json b/data/hfopenllm_v2/tiiuae/Falcon3-1B-Base/8162ba41-e630-470f-a297-72fb9f2110fd.json deleted file mode 100644 index e1cc14037..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-1B-Base/8162ba41-e630-470f-a297-72fb9f2110fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-1B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-1B-Base", - "id": "tiiuae/Falcon3-1B-Base", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.669 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2428 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3571 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4147 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1608 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-1B-Instruct/60dd9d02-476f-459d-a41c-f89f82116dc3.json b/data/hfopenllm_v2/tiiuae/Falcon3-1B-Instruct/60dd9d02-476f-459d-a41c-f89f82116dc3.json deleted file mode 100644 index 3a40efcbb..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-1B-Instruct/60dd9d02-476f-459d-a41c-f89f82116dc3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-1B-Instruct", - "id": "tiiuae/Falcon3-1B-Instruct", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.669 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5557 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3745 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4189 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1838 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-3B-Base/73e89f21-5799-4835-a0e0-a6664c0483da.json b/data/hfopenllm_v2/tiiuae/Falcon3-3B-Base/73e89f21-5799-4835-a0e0-a6664c0483da.json 
deleted file mode 100644 index 81b06da6b..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-3B-Base/73e89f21-5799-4835-a0e0-a6664c0483da.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-3B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-3B-Base", - "id": "tiiuae/Falcon3-3B-Base", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.228 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2765 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2879 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-3B-Instruct/7f355ad4-9156-486d-8cf4-723117da3bb8.json b/data/hfopenllm_v2/tiiuae/Falcon3-3B-Instruct/7f355ad4-9156-486d-8cf4-723117da3bb8.json deleted file mode 100644 index 2d79d95ab..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-3B-Instruct/7f355ad4-9156-486d-8cf4-723117da3bb8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/tiiuae_Falcon3-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-3B-Instruct", - "id": "tiiuae/Falcon3-3B-Instruct", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.228 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6977 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4754 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3005 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-7B-Base/4ccc6026-b639-488d-867f-d98ea49cf1b6.json b/data/hfopenllm_v2/tiiuae/Falcon3-7B-Base/4ccc6026-b639-488d-867f-d98ea49cf1b6.json deleted file mode 100644 index e683aaba0..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-7B-Base/4ccc6026-b639-488d-867f-d98ea49cf1b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-7B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Falcon3-7B-Base", - "id": "tiiuae/Falcon3-7B-Base", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5099 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1941 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4702 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-7B-Instruct/3cf2e68e-4de0-436e-935e-86935e11f72f.json b/data/hfopenllm_v2/tiiuae/Falcon3-7B-Instruct/3cf2e68e-4de0-436e-935e-86935e11f72f.json deleted file mode 100644 index b188ad079..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-7B-Instruct/3cf2e68e-4de0-436e-935e-86935e11f72f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-7B-Instruct", - "id": "tiiuae/Falcon3-7B-Instruct", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7612 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5632 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4827 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Base/e9e4ae5d-0dd1-463c-9f15-47cb21efb409.json b/data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Base/e9e4ae5d-0dd1-463c-9f15-47cb21efb409.json deleted file mode 100644 index 98e4d109f..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Base/e9e4ae5d-0dd1-463c-9f15-47cb21efb409.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-Mamba-7B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-Mamba-7B-Base", - "id": "tiiuae/Falcon3-Mamba-7B-Base", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconMambaForCausalLM", - "params_billions": 7.273 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2891 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4699 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1941 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3038 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Instruct/c57eb23a-5998-4ab9-9a98-39b1338f5ba6.json b/data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Instruct/c57eb23a-5998-4ab9-9a98-39b1338f5ba6.json deleted file mode 100644 index cfbd00438..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Instruct/c57eb23a-5998-4ab9-9a98-39b1338f5ba6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-Mamba-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-Mamba-7B-Instruct", - "id": "tiiuae/Falcon3-Mamba-7B-Instruct", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconMambaForCausalLM", - "params_billions": 7.273 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7165 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4679 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/falcon-11B/94fb625d-f58c-4f2e-8268-1dc4472c1cce.json b/data/hfopenllm_v2/tiiuae/falcon-11B/94fb625d-f58c-4f2e-8268-1dc4472c1cce.json deleted file mode 100644 index 50405922d..000000000 --- a/data/hfopenllm_v2/tiiuae/falcon-11B/94fb625d-f58c-4f2e-8268-1dc4472c1cce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_falcon-11B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon-11B", - "id": "tiiuae/falcon-11B", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconForCausalLM", - "params_billions": 11.103 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3261 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3986 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/falcon-40b-instruct/4481ddef-2bef-4284-b56d-21054f5a9a97.json b/data/hfopenllm_v2/tiiuae/falcon-40b-instruct/4481ddef-2bef-4284-b56d-21054f5a9a97.json deleted file mode 100644 index fcd44b235..000000000 --- a/data/hfopenllm_v2/tiiuae/falcon-40b-instruct/4481ddef-2bef-4284-b56d-21054f5a9a97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_falcon-40b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon-40b-instruct", - "id": "tiiuae/falcon-40b-instruct", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconForCausalLM", - "params_billions": 40.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match 
on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2261 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/falcon-40b/80048c4b-e97b-45c7-aa04-70ce69481a97.json b/data/hfopenllm_v2/tiiuae/falcon-40b/80048c4b-e97b-45c7-aa04-70ce69481a97.json deleted file mode 100644 index a3ef17532..000000000 --- a/data/hfopenllm_v2/tiiuae/falcon-40b/80048c4b-e97b-45c7-aa04-70ce69481a97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_falcon-40b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon-40b", - "id": "tiiuae/falcon-40b", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconForCausalLM", - "params_billions": 40.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/falcon-7b-instruct/d21a2557-2348-4087-b2a6-6e1c0101bccc.json b/data/hfopenllm_v2/tiiuae/falcon-7b-instruct/d21a2557-2348-4087-b2a6-6e1c0101bccc.json deleted file mode 100644 index 556a2f3e8..000000000 --- a/data/hfopenllm_v2/tiiuae/falcon-7b-instruct/d21a2557-2348-4087-b2a6-6e1c0101bccc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_falcon-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon-7b-instruct", - "id": "tiiuae/falcon-7b-instruct", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1969 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3634 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1155 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/falcon-7b/76290d4b-5526-400b-8ca4-24d220f7c02d.json b/data/hfopenllm_v2/tiiuae/falcon-7b/76290d4b-5526-400b-8ca4-24d220f7c02d.json deleted file mode 100644 index 24fa62ffd..000000000 --- a/data/hfopenllm_v2/tiiuae/falcon-7b/76290d4b-5526-400b-8ca4-24d220f7c02d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_falcon-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon-7b", - "id": "tiiuae/falcon-7b", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1821 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/falcon-mamba-7b/3a146535-09b3-4246-8bd8-0e984e0905b1.json b/data/hfopenllm_v2/tiiuae/falcon-mamba-7b/3a146535-09b3-4246-8bd8-0e984e0905b1.json deleted file mode 100644 index 78cc692cd..000000000 --- a/data/hfopenllm_v2/tiiuae/falcon-mamba-7b/3a146535-09b3-4246-8bd8-0e984e0905b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_falcon-mamba-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon-mamba-7b", - "id": "tiiuae/falcon-mamba-7b", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconMambaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4285 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2302 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/BiBo-v0.3/6683f95c-f97f-4117-b3c5-c1ed9587289e.json b/data/hfopenllm_v2/tinycompany/BiBo-v0.3/6683f95c-f97f-4117-b3c5-c1ed9587289e.json deleted file mode 100644 index 731470475..000000000 --- a/data/hfopenllm_v2/tinycompany/BiBo-v0.3/6683f95c-f97f-4117-b3c5-c1ed9587289e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_BiBo-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BiBo-v0.3", - "id": "tinycompany/BiBo-v0.3", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5184 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4642 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/BiBo-v0.7/bbe74b2b-9e13-4c13-92c8-618078667248.json 
b/data/hfopenllm_v2/tinycompany/BiBo-v0.7/bbe74b2b-9e13-4c13-92c8-618078667248.json deleted file mode 100644 index 9034a4865..000000000 --- a/data/hfopenllm_v2/tinycompany/BiBo-v0.7/bbe74b2b-9e13-4c13-92c8-618078667248.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_BiBo-v0.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BiBo-v0.7", - "id": "tinycompany/BiBo-v0.7", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4311 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4044 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.265 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-bgem3/61876ce3-acc4-4619-b0c2-78ac4dff48ea.json b/data/hfopenllm_v2/tinycompany/ShawtyIsBad-bgem3/61876ce3-acc4-4619-b0c2-78ac4dff48ea.json deleted file mode 100644 index 1ef8a0185..000000000 --- a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-bgem3/61876ce3-acc4-4619-b0c2-78ac4dff48ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_ShawtyIsBad-bgem3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ShawtyIsBad-bgem3", - "id": "tinycompany/ShawtyIsBad-bgem3", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.436 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2608 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3695 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2583 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-e5-large/b304baee-c9de-4982-801d-2b9e7f1a7334.json b/data/hfopenllm_v2/tinycompany/ShawtyIsBad-e5-large/b304baee-c9de-4982-801d-2b9e7f1a7334.json deleted file mode 100644 index 6ba06459f..000000000 --- a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-e5-large/b304baee-c9de-4982-801d-2b9e7f1a7334.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_ShawtyIsBad-e5-large/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ShawtyIsBad-e5-large", - "id": "tinycompany/ShawtyIsBad-e5-large", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.436 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2468 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2569 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-ib/6f27e746-1bdd-4cec-a955-c27f2f9900ef.json b/data/hfopenllm_v2/tinycompany/ShawtyIsBad-ib/6f27e746-1bdd-4cec-a955-c27f2f9900ef.json deleted file mode 100644 index 44928ae48..000000000 --- a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-ib/6f27e746-1bdd-4cec-a955-c27f2f9900ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_ShawtyIsBad-ib/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ShawtyIsBad-ib", - "id": "tinycompany/ShawtyIsBad-ib", - "developer": "tinycompany", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.436 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2565 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2581 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic-moe/30637c5d-1bc0-49dc-8afd-335a9a66f196.json b/data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic-moe/30637c5d-1bc0-49dc-8afd-335a9a66f196.json deleted file mode 100644 index 27d6d6ef9..000000000 --- a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic-moe/30637c5d-1bc0-49dc-8afd-335a9a66f196.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_ShawtyIsBad-nomic-moe/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ShawtyIsBad-nomic-moe", - "id": "tinycompany/ShawtyIsBad-nomic-moe", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.436 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2608 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3878 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3747 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic1.5/169e29b6-50d8-456d-aa20-3fe2f3b19a1e.json b/data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic1.5/169e29b6-50d8-456d-aa20-3fe2f3b19a1e.json deleted file mode 100644 index 354caa8ce..000000000 --- a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic1.5/169e29b6-50d8-456d-aa20-3fe2f3b19a1e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_ShawtyIsBad-nomic1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ShawtyIsBad-nomic1.5", - "id": "tinycompany/ShawtyIsBad-nomic1.5", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.436 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2544 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3628 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/SigmaBoi-base/427d32f7-190b-4005-b02c-6a8ce089dbbf.json b/data/hfopenllm_v2/tinycompany/SigmaBoi-base/427d32f7-190b-4005-b02c-6a8ce089dbbf.json deleted file mode 100644 index 13015d742..000000000 --- a/data/hfopenllm_v2/tinycompany/SigmaBoi-base/427d32f7-190b-4005-b02c-6a8ce089dbbf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_SigmaBoi-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SigmaBoi-base", - "id": "tinycompany/SigmaBoi-base", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/SigmaBoi-bge-m3/de7551a8-63b1-4de3-899f-9d98cb985005.json b/data/hfopenllm_v2/tinycompany/SigmaBoi-bge-m3/de7551a8-63b1-4de3-899f-9d98cb985005.json deleted file mode 100644 index e2bc5d9e6..000000000 --- a/data/hfopenllm_v2/tinycompany/SigmaBoi-bge-m3/de7551a8-63b1-4de3-899f-9d98cb985005.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_SigmaBoi-bge-m3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SigmaBoi-bge-m3", - "id": "tinycompany/SigmaBoi-bge-m3", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/SigmaBoi-bgem3/eff6f456-906d-4320-8e6f-667fbbf0574a.json b/data/hfopenllm_v2/tinycompany/SigmaBoi-bgem3/eff6f456-906d-4320-8e6f-667fbbf0574a.json deleted file mode 100644 index 942fd8495..000000000 --- a/data/hfopenllm_v2/tinycompany/SigmaBoi-bgem3/eff6f456-906d-4320-8e6f-667fbbf0574a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_SigmaBoi-bgem3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SigmaBoi-bgem3", - "id": "tinycompany/SigmaBoi-bgem3", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/SigmaBoi-ib/6cbd9a3a-7e06-4eee-af9e-6db4ff35c36a.json b/data/hfopenllm_v2/tinycompany/SigmaBoi-ib/6cbd9a3a-7e06-4eee-af9e-6db4ff35c36a.json deleted file mode 100644 index 4fc792af1..000000000 --- a/data/hfopenllm_v2/tinycompany/SigmaBoi-ib/6cbd9a3a-7e06-4eee-af9e-6db4ff35c36a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_SigmaBoi-ib/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SigmaBoi-ib", - "id": "tinycompany/SigmaBoi-ib", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4344 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2824 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic-moe/7e3d3803-c8d4-4025-8d12-c4c29c49c059.json b/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic-moe/7e3d3803-c8d4-4025-8d12-c4c29c49c059.json deleted file mode 100644 index 954ab673e..000000000 --- a/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic-moe/7e3d3803-c8d4-4025-8d12-c4c29c49c059.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_SigmaBoi-nomic-moe/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SigmaBoi-nomic-moe", - "id": "tinycompany/SigmaBoi-nomic-moe", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2474 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4334 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2837 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5-fp32/a43a6ca9-3543-44bc-8511-ee5c45552070.json b/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5-fp32/a43a6ca9-3543-44bc-8511-ee5c45552070.json deleted file mode 100644 index 5ad5da701..000000000 --- a/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5-fp32/a43a6ca9-3543-44bc-8511-ee5c45552070.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_SigmaBoi-nomic1.5-fp32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SigmaBoi-nomic1.5-fp32", - "id": "tinycompany/SigmaBoi-nomic1.5-fp32", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2841 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5/83f6fdec-9592-45a1-acdf-0ebbb400c8a4.json b/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5/83f6fdec-9592-45a1-acdf-0ebbb400c8a4.json deleted file mode 100644 index 323eb5e1d..000000000 --- a/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5/83f6fdec-9592-45a1-acdf-0ebbb400c8a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_SigmaBoi-nomic1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SigmaBoi-nomic1.5", - "id": "tinycompany/SigmaBoi-nomic1.5", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2841 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/Tamed-Shawty/6e2d4174-303f-437b-9abb-26667b1dd04c.json b/data/hfopenllm_v2/tinycompany/Tamed-Shawty/6e2d4174-303f-437b-9abb-26667b1dd04c.json deleted file mode 100644 index 7acd05352..000000000 --- a/data/hfopenllm_v2/tinycompany/Tamed-Shawty/6e2d4174-303f-437b-9abb-26667b1dd04c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_Tamed-Shawty/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tamed-Shawty", - "id": "tinycompany/Tamed-Shawty", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.562 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3837 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/tklohj/WindyFloLLM/955e93d0-bec1-483c-b3f0-258e13d5cb16.json b/data/hfopenllm_v2/tklohj/WindyFloLLM/955e93d0-bec1-483c-b3f0-258e13d5cb16.json deleted file mode 100644 index 7110c6cac..000000000 --- a/data/hfopenllm_v2/tklohj/WindyFloLLM/955e93d0-bec1-483c-b3f0-258e13d5cb16.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tklohj_WindyFloLLM/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WindyFloLLM", - "id": "tklohj/WindyFloLLM", - "developer": "tklohj", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2669 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2581 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/GPT-JT-6B-v1/3065ca79-c5e9-4875-9f81-4231e971d818.json b/data/hfopenllm_v2/togethercomputer/GPT-JT-6B-v1/3065ca79-c5e9-4875-9f81-4231e971d818.json deleted file mode 100644 index 3b3a90f5a..000000000 --- 
a/data/hfopenllm_v2/togethercomputer/GPT-JT-6B-v1/3065ca79-c5e9-4875-9f81-4231e971d818.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_GPT-JT-6B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-JT-6B-v1", - "id": "togethercomputer/GPT-JT-6B-v1", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTJForCausalLM", - "params_billions": 6.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2061 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3303 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1626 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/GPT-NeoXT-Chat-Base-20B/fc7e485f-a416-420b-b43c-e45e502c4a8f.json b/data/hfopenllm_v2/togethercomputer/GPT-NeoXT-Chat-Base-20B/fc7e485f-a416-420b-b43c-e45e502c4a8f.json deleted file mode 100644 index c6293a07a..000000000 --- a/data/hfopenllm_v2/togethercomputer/GPT-NeoXT-Chat-Base-20B/fc7e485f-a416-420b-b43c-e45e502c4a8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/togethercomputer_GPT-NeoXT-Chat-Base-20B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-NeoXT-Chat-Base-20B", - "id": "togethercomputer/GPT-NeoXT-Chat-Base-20B", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 20.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.183 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3321 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/LLaMA-2-7B-32K/53e882c6-6eb5-4202-a8d0-3a313556c9f4.json b/data/hfopenllm_v2/togethercomputer/LLaMA-2-7B-32K/53e882c6-6eb5-4202-a8d0-3a313556c9f4.json deleted file mode 100644 index f637076fd..000000000 --- a/data/hfopenllm_v2/togethercomputer/LLaMA-2-7B-32K/53e882c6-6eb5-4202-a8d0-3a313556c9f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_LLaMA-2-7B-32K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA-2-7B-32K", - "id": "togethercomputer/LLaMA-2-7B-32K", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1865 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1768 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/Llama-2-7B-32K-Instruct/ba715669-c0ed-471f-80a6-b67453fb4930.json b/data/hfopenllm_v2/togethercomputer/Llama-2-7B-32K-Instruct/ba715669-c0ed-471f-80a6-b67453fb4930.json deleted file mode 100644 index 37237a2f5..000000000 --- a/data/hfopenllm_v2/togethercomputer/Llama-2-7B-32K-Instruct/ba715669-c0ed-471f-80a6-b67453fb4930.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_Llama-2-7B-32K-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-2-7B-32K-Instruct", - "id": "togethercomputer/Llama-2-7B-32K-Instruct", - 
"developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.213 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3443 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4056 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1781 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Base/316cab27-5cac-4d26-90ae-05d1fc3bd14a.json b/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Base/316cab27-5cac-4d26-90ae-05d1fc3bd14a.json deleted file mode 100644 index 26801a949..000000000 --- a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Base/316cab27-5cac-4d26-90ae-05d1fc3bd14a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_RedPajama-INCITE-7B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-7B-Base", - "id": "togethercomputer/RedPajama-INCITE-7B-Base", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - 
"params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.362 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Chat/d2b0a35a-ea72-42f4-9f71-fffa1480bc22.json b/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Chat/d2b0a35a-ea72-42f4-9f71-fffa1480bc22.json deleted file mode 100644 index 56a01fe9b..000000000 --- a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Chat/d2b0a35a-ea72-42f4-9f71-fffa1480bc22.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_RedPajama-INCITE-7B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-7B-Chat", - "id": "togethercomputer/RedPajama-INCITE-7B-Chat", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1558 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3175 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Instruct/bf3eabff-fbf7-421c-9e04-548accc7678c.json b/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Instruct/bf3eabff-fbf7-421c-9e04-548accc7678c.json deleted file mode 100644 index ac9fb012b..000000000 --- a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Instruct/bf3eabff-fbf7-421c-9e04-548accc7678c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_RedPajama-INCITE-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-7B-Instruct", - "id": "togethercomputer/RedPajama-INCITE-7B-Instruct", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2055 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3377 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3685 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1272 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Base-3B-v1/b7eeedd8-33ef-46b3-a3fb-6ac87247bc4e.json b/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Base-3B-v1/b7eeedd8-33ef-46b3-a3fb-6ac87247bc4e.json deleted file mode 100644 index dff22f907..000000000 --- a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Base-3B-v1/b7eeedd8-33ef-46b3-a3fb-6ac87247bc4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_RedPajama-INCITE-Base-3B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-Base-3B-v1", - "id": "togethercomputer/RedPajama-INCITE-Base-3B-v1", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2294 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Chat-3B-v1/b1c41abe-e7f6-4229-b776-8ed0b5f91bd4.json b/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Chat-3B-v1/b1c41abe-e7f6-4229-b776-8ed0b5f91bd4.json deleted file mode 100644 index 28081c01d..000000000 --- a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Chat-3B-v1/b1c41abe-e7f6-4229-b776-8ed0b5f91bd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_RedPajama-INCITE-Chat-3B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-Chat-3B-v1", - "id": "togethercomputer/RedPajama-INCITE-Chat-3B-v1", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3217 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Instruct-3B-v1/5b769770-3b63-4863-a723-95212e2be40e.json b/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Instruct-3B-v1/5b769770-3b63-4863-a723-95212e2be40e.json deleted file mode 100644 index 503ea0a24..000000000 --- a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Instruct-3B-v1/5b769770-3b63-4863-a723-95212e2be40e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_RedPajama-INCITE-Instruct-3B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-Instruct-3B-v1", - "id": "togethercomputer/RedPajama-INCITE-Instruct-3B-v1", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3146 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tokyotech-llm/Llama-3-Swallow-8B-Instruct-v0.1/f2264b41-efa5-4278-91fd-2f454aa91c61.json b/data/hfopenllm_v2/tokyotech-llm/Llama-3-Swallow-8B-Instruct-v0.1/f2264b41-efa5-4278-91fd-2f454aa91c61.json deleted file mode 100644 index d7ea093eb..000000000 --- a/data/hfopenllm_v2/tokyotech-llm/Llama-3-Swallow-8B-Instruct-v0.1/f2264b41-efa5-4278-91fd-2f454aa91c61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tokyotech-llm_Llama-3-Swallow-8B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Swallow-8B-Instruct-v0.1", - "id": "tokyotech-llm/Llama-3-Swallow-8B-Instruct-v0.1", - "developer": "tokyotech-llm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5508 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5009 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4357 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3088 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tomasmcm/sky-t1-coder-32b-flash/5c3484b4-6faa-47fd-a1a2-881898450f79.json b/data/hfopenllm_v2/tomasmcm/sky-t1-coder-32b-flash/5c3484b4-6faa-47fd-a1a2-881898450f79.json deleted file mode 100644 index 49195f496..000000000 --- a/data/hfopenllm_v2/tomasmcm/sky-t1-coder-32b-flash/5c3484b4-6faa-47fd-a1a2-881898450f79.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tomasmcm_sky-t1-coder-32b-flash/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sky-t1-coder-32b-flash", - "id": "tomasmcm/sky-t1-coder-32b-flash", - "developer": "tomasmcm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6822 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.5423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/trthminh1112/autotrain-llama32-1b-finetune/326b95f8-9eae-4064-a261-077a957e233c.json b/data/hfopenllm_v2/trthminh1112/autotrain-llama32-1b-finetune/326b95f8-9eae-4064-a261-077a957e233c.json deleted file mode 100644 index 2a5e33192..000000000 --- a/data/hfopenllm_v2/trthminh1112/autotrain-llama32-1b-finetune/326b95f8-9eae-4064-a261-077a957e233c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/trthminh1112_autotrain-llama32-1b-finetune/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "autotrain-llama32-1b-finetune", - "id": "trthminh1112/autotrain-llama32-1b-finetune", - "developer": "trthminh1112", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1769 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2996 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3513 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1099 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tugstugi/Qwen2.5-7B-Instruct-QwQ-v0.1/c1c7336e-b8bf-4a69-a586-c1a224ba8a65.json b/data/hfopenllm_v2/tugstugi/Qwen2.5-7B-Instruct-QwQ-v0.1/c1c7336e-b8bf-4a69-a586-c1a224ba8a65.json deleted file mode 100644 index 7f871c221..000000000 --- a/data/hfopenllm_v2/tugstugi/Qwen2.5-7B-Instruct-QwQ-v0.1/c1c7336e-b8bf-4a69-a586-c1a224ba8a65.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tugstugi_Qwen2.5-7B-Instruct-QwQ-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-QwQ-v0.1", - "id": "tugstugi/Qwen2.5-7B-Instruct-QwQ-v0.1", - "developer": "tugstugi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5101 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/universalml/NepaliGPT-2.0/89e55482-b762-4f5d-a021-211048719bdc.json b/data/hfopenllm_v2/universalml/NepaliGPT-2.0/89e55482-b762-4f5d-a021-211048719bdc.json deleted file mode 100644 index ae1d50c1b..000000000 --- a/data/hfopenllm_v2/universalml/NepaliGPT-2.0/89e55482-b762-4f5d-a021-211048719bdc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/universalml_NepaliGPT-2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NepaliGPT-2.0", - "id": "universalml/NepaliGPT-2.0", - "developer": "universalml", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0365 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4657 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct-no-system-message/81018e12-63f8-4ad8-87c4-181a13202497.json b/data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct-no-system-message/81018e12-63f8-4ad8-87c4-181a13202497.json deleted file mode 100644 index 71f6ae6c9..000000000 --- a/data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct-no-system-message/81018e12-63f8-4ad8-87c4-181a13202497.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/unsloth_Llama-3.2-1B-Instruct-no-system-message/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-Instruct-no-system-message", - "id": "unsloth/Llama-3.2-1B-Instruct-no-system-message", - "developer": "unsloth", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } 
- }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct/5b09e8cb-aaf1-48fd-a2f4-11a8d4bc9a4d.json b/data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct/5b09e8cb-aaf1-48fd-a2f4-11a8d4bc9a4d.json deleted file mode 100644 index 1bdf24114..000000000 --- a/data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct/5b09e8cb-aaf1-48fd-a2f4-11a8d4bc9a4d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/unsloth_Llama-3.2-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-Instruct", - "id": "unsloth/Llama-3.2-1B-Instruct", - "developer": "unsloth", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3485 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1742 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/unsloth/Phi-3-mini-4k-instruct/8b344f21-9038-4b15-aba8-308aa62e4b39.json b/data/hfopenllm_v2/unsloth/Phi-3-mini-4k-instruct/8b344f21-9038-4b15-aba8-308aa62e4b39.json deleted file mode 100644 index 0e0bd0c1a..000000000 --- a/data/hfopenllm_v2/unsloth/Phi-3-mini-4k-instruct/8b344f21-9038-4b15-aba8-308aa62e4b39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/unsloth_Phi-3-mini-4k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-mini-4k-instruct", - "id": "unsloth/Phi-3-mini-4k-instruct", - "developer": "unsloth", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.544 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/unsloth/phi-4-bnb-4bit/68ca8f7c-88c2-4ede-bcb7-d4ae23429d8f.json b/data/hfopenllm_v2/unsloth/phi-4-bnb-4bit/68ca8f7c-88c2-4ede-bcb7-d4ae23429d8f.json deleted file mode 100644 index 04ac53e78..000000000 --- a/data/hfopenllm_v2/unsloth/phi-4-bnb-4bit/68ca8f7c-88c2-4ede-bcb7-d4ae23429d8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/unsloth_phi-4-bnb-4bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-bnb-4bit", - "id": "unsloth/phi-4-bnb-4bit", - "developer": "unsloth", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.058 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4607 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5256 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/unsloth/phi-4-unsloth-bnb-4bit/df557f25-5505-49dd-a0cb-88fff601c6e2.json b/data/hfopenllm_v2/unsloth/phi-4-unsloth-bnb-4bit/df557f25-5505-49dd-a0cb-88fff601c6e2.json deleted file mode 100644 index 5a64d5d96..000000000 --- 
a/data/hfopenllm_v2/unsloth/phi-4-unsloth-bnb-4bit/df557f25-5505-49dd-a0cb-88fff601c6e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/unsloth_phi-4-unsloth-bnb-4bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-unsloth-bnb-4bit", - "id": "unsloth/phi-4-unsloth-bnb-4bit", - "developer": "unsloth", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.483 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6794 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6791 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5286 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/unsloth/phi-4/a50bf387-bf34-490f-979a-b6217a85a1bd.json b/data/hfopenllm_v2/unsloth/phi-4/a50bf387-bf34-490f-979a-b6217a85a1bd.json deleted file mode 100644 index cbdc7e8b4..000000000 --- a/data/hfopenllm_v2/unsloth/phi-4/a50bf387-bf34-490f-979a-b6217a85a1bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/unsloth_phi-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { 
- "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4", - "id": "unsloth/phi-4", - "developer": "unsloth", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6882 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6886 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/upstage/SOLAR-10.7B-Instruct-v1.0/89264aa0-3bed-41d3-b171-2a5434cc990f.json b/data/hfopenllm_v2/upstage/SOLAR-10.7B-Instruct-v1.0/89264aa0-3bed-41d3-b171-2a5434cc990f.json deleted file mode 100644 index efc06b49f..000000000 --- a/data/hfopenllm_v2/upstage/SOLAR-10.7B-Instruct-v1.0/89264aa0-3bed-41d3-b171-2a5434cc990f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/upstage_SOLAR-10.7B-Instruct-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SOLAR-10.7B-Instruct-v1.0", - "id": "upstage/SOLAR-10.7B-Instruct-v1.0", - 
"developer": "upstage", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4737 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5162 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3899 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/upstage/SOLAR-10.7B-v1.0/a3272caf-a292-4dc7-8932-636a4099ca6b.json b/data/hfopenllm_v2/upstage/SOLAR-10.7B-v1.0/a3272caf-a292-4dc7-8932-636a4099ca6b.json deleted file mode 100644 index 711e8ad03..000000000 --- a/data/hfopenllm_v2/upstage/SOLAR-10.7B-v1.0/a3272caf-a292-4dc7-8932-636a4099ca6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/upstage_SOLAR-10.7B-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SOLAR-10.7B-v1.0", - "id": "upstage/SOLAR-10.7B-v1.0", - "developer": "upstage", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2421 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/upstage/solar-pro-preview-instruct/c4ade77e-628f-457d-bbe1-3e5a0cb19d04.json b/data/hfopenllm_v2/upstage/solar-pro-preview-instruct/c4ade77e-628f-457d-bbe1-3e5a0cb19d04.json deleted file mode 100644 index 86a94ac3f..000000000 --- a/data/hfopenllm_v2/upstage/solar-pro-preview-instruct/c4ade77e-628f-457d-bbe1-3e5a0cb19d04.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/upstage_solar-pro-preview-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "solar-pro-preview-instruct", - "id": "upstage/solar-pro-preview-instruct", - "developer": "upstage", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "SolarForCausalLM", - "params_billions": 22.14 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6817 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5273 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/utkmst/chimera-beta-test2-lora-merged/b030646c-5f5c-43ab-bbc4-405f82992265.json b/data/hfopenllm_v2/utkmst/chimera-beta-test2-lora-merged/b030646c-5f5c-43ab-bbc4-405f82992265.json deleted file mode 100644 index 794273d51..000000000 --- a/data/hfopenllm_v2/utkmst/chimera-beta-test2-lora-merged/b030646c-5f5c-43ab-bbc4-405f82992265.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/utkmst_chimera-beta-test2-lora-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "chimera-beta-test2-lora-merged", - "id": "utkmst/chimera-beta-test2-lora-merged", - "developer": "utkmst", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6054 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4796 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0952 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2992 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/uukuguy/speechless-code-mistral-7b-v1.0/399e516c-d8c8-4511-a746-76c81f72b36a.json b/data/hfopenllm_v2/uukuguy/speechless-code-mistral-7b-v1.0/399e516c-d8c8-4511-a746-76c81f72b36a.json deleted file mode 100644 index 8a74f1d3d..000000000 --- a/data/hfopenllm_v2/uukuguy/speechless-code-mistral-7b-v1.0/399e516c-d8c8-4511-a746-76c81f72b36a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/uukuguy_speechless-code-mistral-7b-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "speechless-code-mistral-7b-v1.0", - "id": "uukuguy/speechless-code-mistral-7b-v1.0", - "developer": "uukuguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3665 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/uukuguy/speechless-codellama-34b-v2.0/bd8e4424-7903-43e7-8105-269de734582e.json b/data/hfopenllm_v2/uukuguy/speechless-codellama-34b-v2.0/bd8e4424-7903-43e7-8105-269de734582e.json deleted file mode 100644 index eea43c67b..000000000 --- a/data/hfopenllm_v2/uukuguy/speechless-codellama-34b-v2.0/bd8e4424-7903-43e7-8105-269de734582e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/uukuguy_speechless-codellama-34b-v2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "speechless-codellama-34b-v2.0", - "id": "uukuguy/speechless-codellama-34b-v2.0", - "developer": "uukuguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4813 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/uukuguy/speechless-coder-ds-6.7b/9126e939-3a87-4774-9606-084c5b56e933.json b/data/hfopenllm_v2/uukuguy/speechless-coder-ds-6.7b/9126e939-3a87-4774-9606-084c5b56e933.json deleted file mode 100644 index 9ccbd840a..000000000 --- a/data/hfopenllm_v2/uukuguy/speechless-coder-ds-6.7b/9126e939-3a87-4774-9606-084c5b56e933.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/uukuguy_speechless-coder-ds-6.7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "speechless-coder-ds-6.7b", - "id": "uukuguy/speechless-coder-ds-6.7b", - "developer": "uukuguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.7 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4036 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1719 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/uukuguy/speechless-instruct-mistral-7b-v0.2/be2ef197-738e-422d-9a88-cafd124584b7.json b/data/hfopenllm_v2/uukuguy/speechless-instruct-mistral-7b-v0.2/be2ef197-738e-422d-9a88-cafd124584b7.json deleted file mode 100644 index 06f2df151..000000000 --- a/data/hfopenllm_v2/uukuguy/speechless-instruct-mistral-7b-v0.2/be2ef197-738e-422d-9a88-cafd124584b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/uukuguy_speechless-instruct-mistral-7b-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "speechless-instruct-mistral-7b-v0.2", - "id": "uukuguy/speechless-instruct-mistral-7b-v0.2", - "developer": "uukuguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3261 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2902 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/ee22e6c5-8529-4987-86d0-4abf3b525f90.json b/data/hfopenllm_v2/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/ee22e6c5-8529-4987-86d0-4abf3b525f90.json deleted file mode 100644 index 776e410d9..000000000 --- a/data/hfopenllm_v2/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/ee22e6c5-8529-4987-86d0-4abf3b525f90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/uukuguy_speechless-llama2-hermes-orca-platypus-wizardlm-13b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "speechless-llama2-hermes-orca-platypus-wizardlm-13b", - "id": "uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b", - "developer": "uukuguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4846 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4655 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b/50f0ddc2-fccd-447c-ab50-a086ccb4cd3a.json b/data/hfopenllm_v2/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b/50f0ddc2-fccd-447c-ab50-a086ccb4cd3a.json deleted file mode 100644 index c79782429..000000000 --- a/data/hfopenllm_v2/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b/50f0ddc2-fccd-447c-ab50-a086ccb4cd3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/uukuguy_speechless-mistral-dolphin-orca-platypus-samantha-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "speechless-mistral-dolphin-orca-platypus-samantha-7b", - "id": "uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b", - "developer": "uukuguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/uukuguy/speechless-zephyr-code-functionary-7b/83294141-a70f-40da-b3f8-21b367098cce.json b/data/hfopenllm_v2/uukuguy/speechless-zephyr-code-functionary-7b/83294141-a70f-40da-b3f8-21b367098cce.json deleted file mode 100644 index 947584201..000000000 --- a/data/hfopenllm_v2/uukuguy/speechless-zephyr-code-functionary-7b/83294141-a70f-40da-b3f8-21b367098cce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/uukuguy_speechless-zephyr-code-functionary-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "speechless-zephyr-code-functionary-7b", - "id": "uukuguy/speechless-zephyr-code-functionary-7b", - "developer": "uukuguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2696 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4664 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - 
{ - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/v000000/L3-8B-Stheno-v3.2-abliterated/303ae3d2-fdf5-404d-83ca-8e6071e13e6b.json b/data/hfopenllm_v2/v000000/L3-8B-Stheno-v3.2-abliterated/303ae3d2-fdf5-404d-83ca-8e6071e13e6b.json deleted file mode 100644 index ef4193f8a..000000000 --- a/data/hfopenllm_v2/v000000/L3-8B-Stheno-v3.2-abliterated/303ae3d2-fdf5-404d-83ca-8e6071e13e6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/v000000_L3-8B-Stheno-v3.2-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-Stheno-v3.2-abliterated", - "id": "v000000/L3-8B-Stheno-v3.2-abliterated", - "developer": "v000000", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6718 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.362 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/v000000/L3.1-Niitorm-8B-DPO-t0.0001/1b13d76d-259f-41f2-baba-ce96ef0cb937.json b/data/hfopenllm_v2/v000000/L3.1-Niitorm-8B-DPO-t0.0001/1b13d76d-259f-41f2-baba-ce96ef0cb937.json deleted file mode 100644 index 4b7d05df3..000000000 --- a/data/hfopenllm_v2/v000000/L3.1-Niitorm-8B-DPO-t0.0001/1b13d76d-259f-41f2-baba-ce96ef0cb937.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/v000000_L3.1-Niitorm-8B-DPO-t0.0001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Niitorm-8B-DPO-t0.0001", - "id": "v000000/L3.1-Niitorm-8B-DPO-t0.0001", - "developer": "v000000", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7689 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5134 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1624 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { 
- "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/v000000/L3.1-Storniitova-8B/b644a420-0a70-4b3d-9a5a-ff91911c857b.json b/data/hfopenllm_v2/v000000/L3.1-Storniitova-8B/b644a420-0a70-4b3d-9a5a-ff91911c857b.json deleted file mode 100644 index 45240285c..000000000 --- a/data/hfopenllm_v2/v000000/L3.1-Storniitova-8B/b644a420-0a70-4b3d-9a5a-ff91911c857b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/v000000_L3.1-Storniitova-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Storniitova-8B", - "id": "v000000/L3.1-Storniitova-8B", - "developer": "v000000", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7817 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5151 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1465 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-1e-Delta/33aaa60f-eb69-4d36-917c-6862121a223e.json b/data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-1e-Delta/33aaa60f-eb69-4d36-917c-6862121a223e.json deleted file mode 100644 index c7a731adf..000000000 --- a/data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-1e-Delta/33aaa60f-eb69-4d36-917c-6862121a223e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/v000000_Qwen2.5-14B-Gutenberg-1e-Delta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Gutenberg-1e-Delta", - "id": "v000000/Qwen2.5-14B-Gutenberg-1e-Delta", - "developer": "v000000", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.493 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-Instruct-Slerpeno/a1d2e571-6de0-4bd7-bdcf-8b3921b450f6.json b/data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-Instruct-Slerpeno/a1d2e571-6de0-4bd7-bdcf-8b3921b450f6.json deleted file mode 100644 index 98637a74d..000000000 --- a/data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-Instruct-Slerpeno/a1d2e571-6de0-4bd7-bdcf-8b3921b450f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/v000000_Qwen2.5-14B-Gutenberg-Instruct-Slerpeno/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Gutenberg-Instruct-Slerpeno", - "id": "v000000/Qwen2.5-14B-Gutenberg-Instruct-Slerpeno", - "developer": "v000000", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8197 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.639 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/v000000/Qwen2.5-Lumen-14B/ad93274e-3ca0-40cb-9f65-e6e6c66a8008.json 
b/data/hfopenllm_v2/v000000/Qwen2.5-Lumen-14B/ad93274e-3ca0-40cb-9f65-e6e6c66a8008.json deleted file mode 100644 index 13c590e0a..000000000 --- a/data/hfopenllm_v2/v000000/Qwen2.5-Lumen-14B/ad93274e-3ca0-40cb-9f65-e6e6c66a8008.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/v000000_Qwen2.5-Lumen-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Lumen-14B", - "id": "v000000/Qwen2.5-Lumen-14B", - "developer": "v000000", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6391 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4903 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vhab10/Llama-3.1-8B-Base-Instruct-SLERP/b8043d04-c3ab-4d6a-97eb-44b195a52710.json b/data/hfopenllm_v2/vhab10/Llama-3.1-8B-Base-Instruct-SLERP/b8043d04-c3ab-4d6a-97eb-44b195a52710.json deleted file mode 100644 index 84ee49bc2..000000000 --- 
a/data/hfopenllm_v2/vhab10/Llama-3.1-8B-Base-Instruct-SLERP/b8043d04-c3ab-4d6a-97eb-44b195a52710.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vhab10_Llama-3.1-8B-Base-Instruct-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Base-Instruct-SLERP", - "id": "vhab10/Llama-3.1-8B-Base-Instruct-SLERP", - "developer": "vhab10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2907 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5057 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4011 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vhab10/Llama-3.2-Instruct-3B-TIES/c6bff6da-382f-4423-ba3a-d987839132e0.json b/data/hfopenllm_v2/vhab10/Llama-3.2-Instruct-3B-TIES/c6bff6da-382f-4423-ba3a-d987839132e0.json deleted file mode 100644 index 44b89e714..000000000 --- a/data/hfopenllm_v2/vhab10/Llama-3.2-Instruct-3B-TIES/c6bff6da-382f-4423-ba3a-d987839132e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/vhab10_Llama-3.2-Instruct-3B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-Instruct-3B-TIES", - "id": "vhab10/Llama-3.2-Instruct-3B-TIES", - "developer": "vhab10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.848 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2916 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vhab10/llama-3-8b-merged-linear/f3574ad1-a6d7-47fb-86e7-69c256452dea.json b/data/hfopenllm_v2/vhab10/llama-3-8b-merged-linear/f3574ad1-a6d7-47fb-86e7-69c256452dea.json deleted file mode 100644 index a70dfa80f..000000000 --- a/data/hfopenllm_v2/vhab10/llama-3-8b-merged-linear/f3574ad1-a6d7-47fb-86e7-69c256452dea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vhab10_llama-3-8b-merged-linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-merged-linear", - "id": "vhab10/llama-3-8b-merged-linear", - "developer": "vhab10", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.65 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5917 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3704 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/CarbonBeagle-11B-truthy/f2e47267-6c40-4d70-8420-295c95b318f3.json b/data/hfopenllm_v2/vicgalle/CarbonBeagle-11B-truthy/f2e47267-6c40-4d70-8420-295c95b318f3.json deleted file mode 100644 index 702223e17..000000000 --- a/data/hfopenllm_v2/vicgalle/CarbonBeagle-11B-truthy/f2e47267-6c40-4d70-8420-295c95b318f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_CarbonBeagle-11B-truthy/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CarbonBeagle-11B-truthy", - "id": "vicgalle/CarbonBeagle-11B-truthy", - "developer": "vicgalle", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5348 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/CarbonBeagle-11B/395f246e-34c6-40e6-bfeb-b047aa12cf90.json b/data/hfopenllm_v2/vicgalle/CarbonBeagle-11B/395f246e-34c6-40e6-bfeb-b047aa12cf90.json deleted file mode 100644 index d8ca9e059..000000000 --- a/data/hfopenllm_v2/vicgalle/CarbonBeagle-11B/395f246e-34c6-40e6-bfeb-b047aa12cf90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_CarbonBeagle-11B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CarbonBeagle-11B", - "id": "vicgalle/CarbonBeagle-11B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", 
- "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5294 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3276 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/Configurable-Hermes-2-Pro-Llama-3-8B/3a91f8bb-c132-45b3-b8b4-d2ecc9f03f3a.json b/data/hfopenllm_v2/vicgalle/Configurable-Hermes-2-Pro-Llama-3-8B/3a91f8bb-c132-45b3-b8b4-d2ecc9f03f3a.json deleted file mode 100644 index ea5259667..000000000 --- a/data/hfopenllm_v2/vicgalle/Configurable-Hermes-2-Pro-Llama-3-8B/3a91f8bb-c132-45b3-b8b4-d2ecc9f03f3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_Configurable-Hermes-2-Pro-Llama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Configurable-Hermes-2-Pro-Llama-3-8B", - "id": "vicgalle/Configurable-Hermes-2-Pro-Llama-3-8B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5055 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/Configurable-Llama-3.1-8B-Instruct/97c92043-9bed-460a-8d7b-70ab3584c75b.json b/data/hfopenllm_v2/vicgalle/Configurable-Llama-3.1-8B-Instruct/97c92043-9bed-460a-8d7b-70ab3584c75b.json deleted file mode 100644 index 86af2304f..000000000 --- a/data/hfopenllm_v2/vicgalle/Configurable-Llama-3.1-8B-Instruct/97c92043-9bed-460a-8d7b-70ab3584c75b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_Configurable-Llama-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Configurable-Llama-3.1-8B-Instruct", - "id": "vicgalle/Configurable-Llama-3.1-8B-Instruct", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8312 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3845 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3592 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/Configurable-Yi-1.5-9B-Chat/ab2ce171-bfcf-49ea-a341-2a52b2bd803a.json b/data/hfopenllm_v2/vicgalle/Configurable-Yi-1.5-9B-Chat/ab2ce171-bfcf-49ea-a341-2a52b2bd803a.json deleted file mode 100644 index 263e7fc5d..000000000 --- a/data/hfopenllm_v2/vicgalle/Configurable-Yi-1.5-9B-Chat/ab2ce171-bfcf-49ea-a341-2a52b2bd803a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_Configurable-Yi-1.5-9B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Configurable-Yi-1.5-9B-Chat", - "id": "vicgalle/Configurable-Yi-1.5-9B-Chat", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2047 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4271 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/ConfigurableBeagle-11B/f9bbd9cc-dc6a-466f-b777-eaea4a15b874.json b/data/hfopenllm_v2/vicgalle/ConfigurableBeagle-11B/f9bbd9cc-dc6a-466f-b777-eaea4a15b874.json deleted file mode 100644 index 562396548..000000000 --- a/data/hfopenllm_v2/vicgalle/ConfigurableBeagle-11B/f9bbd9cc-dc6a-466f-b777-eaea4a15b874.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_ConfigurableBeagle-11B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ConfigurableBeagle-11B", - "id": "vicgalle/ConfigurableBeagle-11B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5834 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH 
Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3953 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/ConfigurableHermes-7B/cd0aefa3-b0c9-4683-872f-f9f9d285e6c3.json b/data/hfopenllm_v2/vicgalle/ConfigurableHermes-7B/cd0aefa3-b0c9-4683-872f-f9f9d285e6c3.json deleted file mode 100644 index 4374da041..000000000 --- a/data/hfopenllm_v2/vicgalle/ConfigurableHermes-7B/cd0aefa3-b0c9-4683-872f-f9f9d285e6c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_ConfigurableHermes-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ConfigurableHermes-7B", - "id": "vicgalle/ConfigurableHermes-7B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4573 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4057 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3025 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/ConfigurableSOLAR-10.7B/c42db2ab-dbc4-48e4-9c16-7b8a5f8492c3.json b/data/hfopenllm_v2/vicgalle/ConfigurableSOLAR-10.7B/c42db2ab-dbc4-48e4-9c16-7b8a5f8492c3.json deleted file mode 100644 index 3fecdf2c2..000000000 --- a/data/hfopenllm_v2/vicgalle/ConfigurableSOLAR-10.7B/c42db2ab-dbc4-48e4-9c16-7b8a5f8492c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_ConfigurableSOLAR-10.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ConfigurableSOLAR-10.7B", - "id": "vicgalle/ConfigurableSOLAR-10.7B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4867 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3805 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/Humanish-RP-Llama-3.1-8B/1b32c387-97a7-42ff-892c-d3bacebbf050.json b/data/hfopenllm_v2/vicgalle/Humanish-RP-Llama-3.1-8B/1b32c387-97a7-42ff-892c-d3bacebbf050.json deleted file mode 100644 index 99c2ecb52..000000000 --- a/data/hfopenllm_v2/vicgalle/Humanish-RP-Llama-3.1-8B/1b32c387-97a7-42ff-892c-d3bacebbf050.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_Humanish-RP-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Humanish-RP-Llama-3.1-8B", - "id": "vicgalle/Humanish-RP-Llama-3.1-8B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6669 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3952 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3477 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/Merge-Mistral-Prometheus-7B/cbea057c-b0f9-48ac-a075-eb28ebbaf358.json b/data/hfopenllm_v2/vicgalle/Merge-Mistral-Prometheus-7B/cbea057c-b0f9-48ac-a075-eb28ebbaf358.json deleted file mode 100644 index f60131bd5..000000000 --- a/data/hfopenllm_v2/vicgalle/Merge-Mistral-Prometheus-7B/cbea057c-b0f9-48ac-a075-eb28ebbaf358.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_Merge-Mistral-Prometheus-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Merge-Mistral-Prometheus-7B", - "id": "vicgalle/Merge-Mistral-Prometheus-7B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4848 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4201 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.41 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2717 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/Merge-Mixtral-Prometheus-8x7B/0b1bb876-9dc7-47d5-855a-f028fb7f2df6.json b/data/hfopenllm_v2/vicgalle/Merge-Mixtral-Prometheus-8x7B/0b1bb876-9dc7-47d5-855a-f028fb7f2df6.json deleted file mode 100644 index 956f520a3..000000000 --- a/data/hfopenllm_v2/vicgalle/Merge-Mixtral-Prometheus-8x7B/0b1bb876-9dc7-47d5-855a-f028fb7f2df6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_Merge-Mixtral-Prometheus-8x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Merge-Mixtral-Prometheus-8x7B", - "id": "vicgalle/Merge-Mixtral-Prometheus-8x7B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5744 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5351 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/Roleplay-Llama-3-8B/a86678ad-344c-430f-80c7-02d634b0cd5b.json b/data/hfopenllm_v2/vicgalle/Roleplay-Llama-3-8B/a86678ad-344c-430f-80c7-02d634b0cd5b.json deleted file mode 100644 index 3b3b5675b..000000000 --- a/data/hfopenllm_v2/vicgalle/Roleplay-Llama-3-8B/a86678ad-344c-430f-80c7-02d634b0cd5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_Roleplay-Llama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Roleplay-Llama-3-8B", - "id": "vicgalle/Roleplay-Llama-3-8B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.732 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5012 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/viettelsecurity-ai/security-llama3.2-3b/827f3236-74fa-432b-8177-8785ac25ad76.json b/data/hfopenllm_v2/viettelsecurity-ai/security-llama3.2-3b/827f3236-74fa-432b-8177-8785ac25ad76.json deleted file mode 100644 index 088a8c8aa..000000000 --- a/data/hfopenllm_v2/viettelsecurity-ai/security-llama3.2-3b/827f3236-74fa-432b-8177-8785ac25ad76.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/viettelsecurity-ai_security-llama3.2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "security-llama3.2-3b", - "id": "viettelsecurity-ai/security-llama3.2-3b", - "developer": "viettelsecurity-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5909 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2837 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/vihangd/smart-dan-sft-v0.1/7f694687-77e5-41d2-923b-f2d5f231729b.json b/data/hfopenllm_v2/vihangd/smart-dan-sft-v0.1/7f694687-77e5-41d2-923b-f2d5f231729b.json deleted file mode 100644 index ee3f7a212..000000000 --- a/data/hfopenllm_v2/vihangd/smart-dan-sft-v0.1/7f694687-77e5-41d2-923b-f2d5f231729b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vihangd_smart-dan-sft-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smart-dan-sft-v0.1", - "id": "vihangd/smart-dan-sft-v0.1", - "developer": "vihangd", - "inference_platform": "unknown", - "additional_details": { - "precision": "4bit", - "architecture": "LlamaForCausalLM", - "params_billions": 0.379 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1576 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/voidful/smol-360m-ft/daa9d03e-63b0-4c08-ae72-e11041200ac7.json b/data/hfopenllm_v2/voidful/smol-360m-ft/daa9d03e-63b0-4c08-ae72-e11041200ac7.json deleted file mode 100644 index b88c7efbe..000000000 --- 
a/data/hfopenllm_v2/voidful/smol-360m-ft/daa9d03e-63b0-4c08-ae72-e11041200ac7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/voidful_smol-360m-ft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smol-360m-ft", - "id": "voidful/smol-360m-ft", - "developer": "voidful", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2013 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3714 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1087 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vonjack/MobileLLM-125M-HF/1539822f-acc4-4dae-9e61-133da97ebcbe.json b/data/hfopenllm_v2/vonjack/MobileLLM-125M-HF/1539822f-acc4-4dae-9e61-133da97ebcbe.json deleted file mode 100644 index 3d05087f3..000000000 --- a/data/hfopenllm_v2/vonjack/MobileLLM-125M-HF/1539822f-acc4-4dae-9e61-133da97ebcbe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vonjack_MobileLLM-125M-HF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MobileLLM-125M-HF", - "id": "vonjack/MobileLLM-125M-HF", - "developer": "vonjack", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.125 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3027 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vonjack/Phi-3-mini-4k-instruct-LLaMAfied/eec80fda-ce2f-4ef4-94d3-9e7b90f7f2e5.json b/data/hfopenllm_v2/vonjack/Phi-3-mini-4k-instruct-LLaMAfied/eec80fda-ce2f-4ef4-94d3-9e7b90f7f2e5.json deleted file mode 100644 index adf71cb9a..000000000 --- a/data/hfopenllm_v2/vonjack/Phi-3-mini-4k-instruct-LLaMAfied/eec80fda-ce2f-4ef4-94d3-9e7b90f7f2e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vonjack_Phi-3-mini-4k-instruct-LLaMAfied/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Phi-3-mini-4k-instruct-LLaMAfied", - "id": "vonjack/Phi-3-mini-4k-instruct-LLaMAfied", - "developer": "vonjack", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5741 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1382 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vonjack/Phi-3.5-mini-instruct-hermes-fc-json/448cac5f-a7d3-41fb-9b49-666758037eb4.json b/data/hfopenllm_v2/vonjack/Phi-3.5-mini-instruct-hermes-fc-json/448cac5f-a7d3-41fb-9b49-666758037eb4.json deleted file mode 100644 index 745062dbb..000000000 --- a/data/hfopenllm_v2/vonjack/Phi-3.5-mini-instruct-hermes-fc-json/448cac5f-a7d3-41fb-9b49-666758037eb4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vonjack_Phi-3.5-mini-instruct-hermes-fc-json/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3.5-mini-instruct-hermes-fc-json", - "id": "vonjack/Phi-3.5-mini-instruct-hermes-fc-json", - "developer": "vonjack", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 4.132 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2975 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4041 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vonjack/Qwen2.5-Coder-0.5B-Merged/5d7c5ac1-84c3-4fd1-ac51-4c00ed8c59c7.json b/data/hfopenllm_v2/vonjack/Qwen2.5-Coder-0.5B-Merged/5d7c5ac1-84c3-4fd1-ac51-4c00ed8c59c7.json deleted file mode 100644 index 7da1c30b9..000000000 --- a/data/hfopenllm_v2/vonjack/Qwen2.5-Coder-0.5B-Merged/5d7c5ac1-84c3-4fd1-ac51-4c00ed8c59c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vonjack_Qwen2.5-Coder-0.5B-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-0.5B-Merged", - "id": "vonjack/Qwen2.5-Coder-0.5B-Merged", - "developer": "vonjack", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3076 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1202 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vonjack/SmolLM2-1.7B-Merged/7e1741cc-f9ea-4940-9b6b-d7a515cfce31.json b/data/hfopenllm_v2/vonjack/SmolLM2-1.7B-Merged/7e1741cc-f9ea-4940-9b6b-d7a515cfce31.json deleted file mode 100644 index a43b36871..000000000 --- a/data/hfopenllm_v2/vonjack/SmolLM2-1.7B-Merged/7e1741cc-f9ea-4940-9b6b-d7a515cfce31.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vonjack_SmolLM2-1.7B-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-1.7B-Merged", - "id": "vonjack/SmolLM2-1.7B-Merged", - "developer": "vonjack", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3698 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3587 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2048 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vonjack/SmolLM2-135M-Merged/ec4d21be-b1a6-47a9-84a4-1a25249c1768.json b/data/hfopenllm_v2/vonjack/SmolLM2-135M-Merged/ec4d21be-b1a6-47a9-84a4-1a25249c1768.json deleted file mode 100644 index 409a63ac3..000000000 --- a/data/hfopenllm_v2/vonjack/SmolLM2-135M-Merged/ec4d21be-b1a6-47a9-84a4-1a25249c1768.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vonjack_SmolLM2-135M-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-135M-Merged", - "id": "vonjack/SmolLM2-135M-Merged", - "developer": "vonjack", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3662 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vonjack/SmolLM2-360M-Merged/c6b03539-04b3-4ef2-909d-8036a7ea2ae1.json b/data/hfopenllm_v2/vonjack/SmolLM2-360M-Merged/c6b03539-04b3-4ef2-909d-8036a7ea2ae1.json deleted file mode 100644 index 0d45953b7..000000000 --- a/data/hfopenllm_v2/vonjack/SmolLM2-360M-Merged/c6b03539-04b3-4ef2-909d-8036a7ea2ae1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vonjack_SmolLM2-360M-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-360M-Merged", - "id": "vonjack/SmolLM2-360M-Merged", - "developer": "vonjack", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3206 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3155 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/w4r10ck/SOLAR-10.7B-Instruct-v1.0-uncensored/f156ac38-056e-4ef1-bdbe-e83c299a683b.json b/data/hfopenllm_v2/w4r10ck/SOLAR-10.7B-Instruct-v1.0-uncensored/f156ac38-056e-4ef1-bdbe-e83c299a683b.json deleted file mode 100644 index 1c4c50f83..000000000 --- a/data/hfopenllm_v2/w4r10ck/SOLAR-10.7B-Instruct-v1.0-uncensored/f156ac38-056e-4ef1-bdbe-e83c299a683b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/w4r10ck_SOLAR-10.7B-Instruct-v1.0-uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SOLAR-10.7B-Instruct-v1.0-uncensored", - "id": "w4r10ck/SOLAR-10.7B-Instruct-v1.0-uncensored", - "developer": "w4r10ck", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4639 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3344 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp/11d3c8db-300c-4e02-b729-7adba6844ad2.json b/data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp/11d3c8db-300c-4e02-b729-7adba6844ad2.json deleted file mode 100644 index 11b77c784..000000000 --- a/data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp/11d3c8db-300c-4e02-b729-7adba6844ad2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/wanlige_li-14b-v0.4-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "li-14b-v0.4-slerp", - "id": "wanlige/li-14b-v0.4-slerp", - "developer": "wanlige", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6587 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4192 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4002 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp0.1/fc75a820-fc0b-4e50-9304-61f0e93795c0.json b/data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp0.1/fc75a820-fc0b-4e50-9304-61f0e93795c0.json deleted file mode 100644 index c132d7290..000000000 --- a/data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp0.1/fc75a820-fc0b-4e50-9304-61f0e93795c0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/wanlige_li-14b-v0.4-slerp0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "li-14b-v0.4-slerp0.1", - "id": "wanlige/li-14b-v0.4-slerp0.1", - "developer": "wanlige", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7923 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6572 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": 
"MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5294 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/wanlige/li-14b-v0.4/bb66896f-799c-4e17-8b54-af5e795699fa.json b/data/hfopenllm_v2/wanlige/li-14b-v0.4/bb66896f-799c-4e17-8b54-af5e795699fa.json deleted file mode 100644 index 3c975e261..000000000 --- a/data/hfopenllm_v2/wanlige/li-14b-v0.4/bb66896f-799c-4e17-8b54-af5e795699fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/wanlige_li-14b-v0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "li-14b-v0.4", - "id": "wanlige/li-14b-v0.4", - "developer": "wanlige", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8133 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/wannaphong/KhanomTanLLM-Instruct/30a1a786-7478-401f-85ae-57037ada3d32.json b/data/hfopenllm_v2/wannaphong/KhanomTanLLM-Instruct/30a1a786-7478-401f-85ae-57037ada3d32.json deleted file mode 100644 index 2276ead59..000000000 --- a/data/hfopenllm_v2/wannaphong/KhanomTanLLM-Instruct/30a1a786-7478-401f-85ae-57037ada3d32.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/wannaphong_KhanomTanLLM-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KhanomTanLLM-Instruct", - "id": "wannaphong/KhanomTanLLM-Instruct", - "developer": "wannaphong", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.447 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1119 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/waqasali1707/Beast-Soul-new/05430b16-07b6-41a1-ade9-6211cdf8ccf1.json b/data/hfopenllm_v2/waqasali1707/Beast-Soul-new/05430b16-07b6-41a1-ade9-6211cdf8ccf1.json deleted file mode 100644 index 7fbd3aeba..000000000 --- a/data/hfopenllm_v2/waqasali1707/Beast-Soul-new/05430b16-07b6-41a1-ade9-6211cdf8ccf1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/waqasali1707_Beast-Soul-new/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Beast-Soul-new", - "id": "waqasali1707/Beast-Soul-new", - "developer": "waqasali1707", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5225 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3108 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/wave-on-discord/qwent-7b/09bc4d5a-f104-4a36-999c-11e2532eef1e.json b/data/hfopenllm_v2/wave-on-discord/qwent-7b/09bc4d5a-f104-4a36-999c-11e2532eef1e.json deleted file mode 100644 index 3152ab264..000000000 --- a/data/hfopenllm_v2/wave-on-discord/qwent-7b/09bc4d5a-f104-4a36-999c-11e2532eef1e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/wave-on-discord_qwent-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwent-7b", - "id": "wave-on-discord/qwent-7b", - "developer": "wave-on-discord", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2015 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1603 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/weathermanj/Menda-3B-500/a92cfff6-6caf-4bf1-913a-9d7dd2d8d449.json b/data/hfopenllm_v2/weathermanj/Menda-3B-500/a92cfff6-6caf-4bf1-913a-9d7dd2d8d449.json deleted file mode 100644 index cc66f0e58..000000000 --- 
a/data/hfopenllm_v2/weathermanj/Menda-3B-500/a92cfff6-6caf-4bf1-913a-9d7dd2d8d449.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/weathermanj_Menda-3B-500/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Menda-3B-500", - "id": "weathermanj/Menda-3B-500", - "developer": "weathermanj", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6353 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/weathermanj/Menda-3b-750/8972e92c-ebbe-4dc4-8a8c-6f7a42ab5c11.json b/data/hfopenllm_v2/weathermanj/Menda-3b-750/8972e92c-ebbe-4dc4-8a8c-6f7a42ab5c11.json deleted file mode 100644 index ed1d22611..000000000 --- a/data/hfopenllm_v2/weathermanj/Menda-3b-750/8972e92c-ebbe-4dc4-8a8c-6f7a42ab5c11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/weathermanj_Menda-3b-750/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Menda-3b-750", - "id": "weathermanj/Menda-3b-750", - "developer": "weathermanj", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6335 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4737 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3942 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3506 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/weathermanj/Menda-3b-Optim-100/e4f39815-9704-4d0a-8d9b-39359367adcc.json b/data/hfopenllm_v2/weathermanj/Menda-3b-Optim-100/e4f39815-9704-4d0a-8d9b-39359367adcc.json deleted file mode 100644 index d3fdfc367..000000000 --- a/data/hfopenllm_v2/weathermanj/Menda-3b-Optim-100/e4f39815-9704-4d0a-8d9b-39359367adcc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/weathermanj_Menda-3b-Optim-100/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Menda-3b-Optim-100", - "id": 
"weathermanj/Menda-3b-Optim-100", - "developer": "weathermanj", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6398 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4735 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/weathermanj/Menda-3b-Optim-200/f40df456-eb9a-46f8-8fb0-b6ad2748f3c2.json b/data/hfopenllm_v2/weathermanj/Menda-3b-Optim-200/f40df456-eb9a-46f8-8fb0-b6ad2748f3c2.json deleted file mode 100644 index dc2afd5e8..000000000 --- a/data/hfopenllm_v2/weathermanj/Menda-3b-Optim-200/f40df456-eb9a-46f8-8fb0-b6ad2748f3c2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/weathermanj_Menda-3b-Optim-200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Menda-3b-Optim-200", - "id": "weathermanj/Menda-3b-Optim-200", - "developer": "weathermanj", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4746 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3731 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3484 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/ArliAI-RPMax-v1.3-merge-13.3B/398996d9-299b-4120-a757-e2fe14e779ee.json b/data/hfopenllm_v2/win10/ArliAI-RPMax-v1.3-merge-13.3B/398996d9-299b-4120-a757-e2fe14e779ee.json deleted file mode 100644 index 8283795f0..000000000 --- a/data/hfopenllm_v2/win10/ArliAI-RPMax-v1.3-merge-13.3B/398996d9-299b-4120-a757-e2fe14e779ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_ArliAI-RPMax-v1.3-merge-13.3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ArliAI-RPMax-v1.3-merge-13.3B", - "id": "win10/ArliAI-RPMax-v1.3-merge-13.3B", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.265 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3038 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4325 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.32 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/Breeze-13B-32k-Instruct-v1_0/4398633e-77b0-4b61-ae85-29b0e5aad38b.json b/data/hfopenllm_v2/win10/Breeze-13B-32k-Instruct-v1_0/4398633e-77b0-4b61-ae85-29b0e5aad38b.json deleted file mode 100644 index 4e29ec855..000000000 --- a/data/hfopenllm_v2/win10/Breeze-13B-32k-Instruct-v1_0/4398633e-77b0-4b61-ae85-29b0e5aad38b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_Breeze-13B-32k-Instruct-v1_0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Breeze-13B-32k-Instruct-v1_0", - "id": "win10/Breeze-13B-32k-Instruct-v1_0", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.726 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3584 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2568 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/EVA-Norns-Qwen2.5-v0.1/1bc60148-512f-4830-b541-f30535cf74bf.json b/data/hfopenllm_v2/win10/EVA-Norns-Qwen2.5-v0.1/1bc60148-512f-4830-b541-f30535cf74bf.json deleted file mode 100644 index 2e96decff..000000000 --- a/data/hfopenllm_v2/win10/EVA-Norns-Qwen2.5-v0.1/1bc60148-512f-4830-b541-f30535cf74bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_EVA-Norns-Qwen2.5-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EVA-Norns-Qwen2.5-v0.1", - "id": "win10/EVA-Norns-Qwen2.5-v0.1", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.622 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5072 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2613 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3425 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/Llama-3.2-3B-Instruct-24-9-29/a9dfb20a-13e0-4419-a747-7c001b2e9435.json b/data/hfopenllm_v2/win10/Llama-3.2-3B-Instruct-24-9-29/a9dfb20a-13e0-4419-a747-7c001b2e9435.json deleted file mode 100644 index e291ffe11..000000000 --- a/data/hfopenllm_v2/win10/Llama-3.2-3B-Instruct-24-9-29/a9dfb20a-13e0-4419-a747-7c001b2e9435.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_Llama-3.2-3B-Instruct-24-9-29/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Instruct-24-9-29", - "id": "win10/Llama-3.2-3B-Instruct-24-9-29", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7332 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4614 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3228 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/Norns-Qwen2.5-12B/388e3559-a3b6-4738-9843-9bdd048bae09.json b/data/hfopenllm_v2/win10/Norns-Qwen2.5-12B/388e3559-a3b6-4738-9843-9bdd048bae09.json deleted file mode 100644 index f5e98a734..000000000 --- a/data/hfopenllm_v2/win10/Norns-Qwen2.5-12B/388e3559-a3b6-4738-9843-9bdd048bae09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_Norns-Qwen2.5-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Norns-Qwen2.5-12B", - "id": "win10/Norns-Qwen2.5-12B", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 12.277 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4897 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4619 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/Norns-Qwen2.5-7B/994a6930-42d5-463a-9e7c-0a3070144211.json b/data/hfopenllm_v2/win10/Norns-Qwen2.5-7B/994a6930-42d5-463a-9e7c-0a3070144211.json deleted file mode 100644 index a0a55bfe7..000000000 --- a/data/hfopenllm_v2/win10/Norns-Qwen2.5-7B/994a6930-42d5-463a-9e7c-0a3070144211.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_Norns-Qwen2.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Norns-Qwen2.5-7B", - "id": "win10/Norns-Qwen2.5-7B", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6122 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2628 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4085 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/Qwen2.5-2B-Instruct/cce46320-9794-443a-831a-92e2a21515b0.json b/data/hfopenllm_v2/win10/Qwen2.5-2B-Instruct/cce46320-9794-443a-831a-92e2a21515b0.json deleted file mode 100644 index 0e10fdee0..000000000 --- a/data/hfopenllm_v2/win10/Qwen2.5-2B-Instruct/cce46320-9794-443a-831a-92e2a21515b0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_Qwen2.5-2B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-2B-Instruct", - "id": "win10/Qwen2.5-2B-Instruct", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.9 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/llama3-13.45b-Instruct/988f4cc0-ebfb-43a9-8a7f-3dd1f1c1e342.json b/data/hfopenllm_v2/win10/llama3-13.45b-Instruct/988f4cc0-ebfb-43a9-8a7f-3dd1f1c1e342.json deleted file mode 100644 index 4c991522b..000000000 --- a/data/hfopenllm_v2/win10/llama3-13.45b-Instruct/988f4cc0-ebfb-43a9-8a7f-3dd1f1c1e342.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_llama3-13.45b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3-13.45b-Instruct", - "id": "win10/llama3-13.45b-Instruct", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.265 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4144 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3848 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/miscii-14b-1M-0128/3c675148-5d09-4778-baad-9295ef8cfc79.json b/data/hfopenllm_v2/win10/miscii-14b-1M-0128/3c675148-5d09-4778-baad-9295ef8cfc79.json deleted file mode 100644 index be52a6670..000000000 --- a/data/hfopenllm_v2/win10/miscii-14b-1M-0128/3c675148-5d09-4778-baad-9295ef8cfc79.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_miscii-14b-1M-0128/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miscii-14b-1M-0128", - "id": "win10/miscii-14b-1M-0128", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4181 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5742 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4773 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4491 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/winglian/Llama-3-8b-64k-PoSE/620b80ba-81ab-4504-9f42-4965014f3cd1.json b/data/hfopenllm_v2/winglian/Llama-3-8b-64k-PoSE/620b80ba-81ab-4504-9f42-4965014f3cd1.json deleted file mode 100644 index 6c33d53ee..000000000 --- a/data/hfopenllm_v2/winglian/Llama-3-8b-64k-PoSE/620b80ba-81ab-4504-9f42-4965014f3cd1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/winglian_Llama-3-8b-64k-PoSE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8b-64k-PoSE", - "id": "winglian/Llama-3-8b-64k-PoSE", - "developer": "winglian", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2857 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2467 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/winglian/llama-3-8b-256k-PoSE/b6c68fc1-c2c1-4cdf-91ef-2007becd7ade.json 
b/data/hfopenllm_v2/winglian/llama-3-8b-256k-PoSE/b6c68fc1-c2c1-4cdf-91ef-2007becd7ade.json deleted file mode 100644 index e3d0f5b20..000000000 --- a/data/hfopenllm_v2/winglian/llama-3-8b-256k-PoSE/b6c68fc1-c2c1-4cdf-91ef-2007becd7ade.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/winglian_llama-3-8b-256k-PoSE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-256k-PoSE", - "id": "winglian/llama-3-8b-256k-PoSE", - "developer": "winglian", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2909 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/wzhouad/gemma-2-9b-it-WPO-HB/19279c18-c2f7-4f75-a9c5-a121b2d4bcff.json b/data/hfopenllm_v2/wzhouad/gemma-2-9b-it-WPO-HB/19279c18-c2f7-4f75-a9c5-a121b2d4bcff.json deleted file mode 100644 index 7af55e4ae..000000000 --- a/data/hfopenllm_v2/wzhouad/gemma-2-9b-it-WPO-HB/19279c18-c2f7-4f75-a9c5-a121b2d4bcff.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/wzhouad_gemma-2-9b-it-WPO-HB/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it-WPO-HB", - "id": "wzhouad/gemma-2-9b-it-WPO-HB", - "developer": "wzhouad", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5437 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5629 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3675 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/x0000001/Deepseek-Lumen-R1-Qwen2.5-14B/7966789d-8ace-4b39-9093-96bbb8e641d8.json b/data/hfopenllm_v2/x0000001/Deepseek-Lumen-R1-Qwen2.5-14B/7966789d-8ace-4b39-9093-96bbb8e641d8.json deleted file mode 100644 index 1d76659b4..000000000 --- a/data/hfopenllm_v2/x0000001/Deepseek-Lumen-R1-Qwen2.5-14B/7966789d-8ace-4b39-9093-96bbb8e641d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/x0000001_Deepseek-Lumen-R1-Qwen2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Deepseek-Lumen-R1-Qwen2.5-14B", - "id": "x0000001/Deepseek-Lumen-R1-Qwen2.5-14B", - "developer": "x0000001", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2779 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xMaulana/FinMatcha-3B-Instruct/5e1d849d-0342-4de9-a7d8-dd5cd5960fac.json b/data/hfopenllm_v2/xMaulana/FinMatcha-3B-Instruct/5e1d849d-0342-4de9-a7d8-dd5cd5960fac.json deleted file mode 100644 index ef5a60545..000000000 --- a/data/hfopenllm_v2/xMaulana/FinMatcha-3B-Instruct/5e1d849d-0342-4de9-a7d8-dd5cd5960fac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xMaulana_FinMatcha-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FinMatcha-3B-Instruct", - "id": 
"xMaulana/FinMatcha-3B-Instruct", - "developer": "xMaulana", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7548 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xinchen9/Llama3.1_8B_Instruct_CoT/a17563e3-0369-4042-8006-2ec781653f63.json b/data/hfopenllm_v2/xinchen9/Llama3.1_8B_Instruct_CoT/a17563e3-0369-4042-8006-2ec781653f63.json deleted file mode 100644 index 6fe27572a..000000000 --- a/data/hfopenllm_v2/xinchen9/Llama3.1_8B_Instruct_CoT/a17563e3-0369-4042-8006-2ec781653f63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xinchen9_Llama3.1_8B_Instruct_CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1_8B_Instruct_CoT", - "id": "xinchen9/Llama3.1_8B_Instruct_CoT", - "developer": "xinchen9", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2974 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2879 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xinchen9/Llama3.1_CoT/68369110-e371-4112-ae0a-14f7fe9fc40f.json b/data/hfopenllm_v2/xinchen9/Llama3.1_CoT/68369110-e371-4112-ae0a-14f7fe9fc40f.json deleted file mode 100644 index 9bdeb1364..000000000 --- a/data/hfopenllm_v2/xinchen9/Llama3.1_CoT/68369110-e371-4112-ae0a-14f7fe9fc40f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xinchen9_Llama3.1_CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1_CoT", - "id": "xinchen9/Llama3.1_CoT", - "developer": "xinchen9", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2246 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4341 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4305 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2739 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xinchen9/Llama3.1_CoT_V1/2a6925d3-992f-4c4f-a57b-3eb41062743b.json b/data/hfopenllm_v2/xinchen9/Llama3.1_CoT_V1/2a6925d3-992f-4c4f-a57b-3eb41062743b.json deleted file mode 100644 index cd826b22d..000000000 --- a/data/hfopenllm_v2/xinchen9/Llama3.1_CoT_V1/2a6925d3-992f-4c4f-a57b-3eb41062743b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xinchen9_Llama3.1_CoT_V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1_CoT_V1", - "id": "xinchen9/Llama3.1_CoT_V1", - "developer": "xinchen9", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2453 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2805 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xinchen9/Mistral-7B-CoT/28290ea9-9ce5-4605-ac5b-aa2d606994d8.json b/data/hfopenllm_v2/xinchen9/Mistral-7B-CoT/28290ea9-9ce5-4605-ac5b-aa2d606994d8.json deleted file mode 100644 index 678ee6b2d..000000000 --- a/data/hfopenllm_v2/xinchen9/Mistral-7B-CoT/28290ea9-9ce5-4605-ac5b-aa2d606994d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xinchen9_Mistral-7B-CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-CoT", - "id": "xinchen9/Mistral-7B-CoT", - "developer": "xinchen9", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2783 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH 
Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2284 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xinchen9/llama3-b8-ft-dis/eb2ed6eb-4789-400d-aea5-841547a20cd7.json b/data/hfopenllm_v2/xinchen9/llama3-b8-ft-dis/eb2ed6eb-4789-400d-aea5-841547a20cd7.json deleted file mode 100644 index bb558ab87..000000000 --- a/data/hfopenllm_v2/xinchen9/llama3-b8-ft-dis/eb2ed6eb-4789-400d-aea5-841547a20cd7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xinchen9_llama3-b8-ft-dis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3-b8-ft-dis", - "id": "xinchen9/llama3-b8-ft-dis", - "developer": "xinchen9", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1546 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3654 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3244 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table/873218a0-7ddb-4287-88ce-8c8214e85c85.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table/873218a0-7ddb-4287-88ce-8c8214e85c85.json deleted file mode 100644 index 9463f18e7..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table/873218a0-7ddb-4287-88ce-8c8214e85c85.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table/e4c32b92-46b4-431a-83f2-11499f587534.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table/e4c32b92-46b4-431a-83f2-11499f587534.json deleted file mode 100644 index ea401b3b0..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table/e4c32b92-46b4-431a-83f2-11499f587534.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5057 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table/a05681a0-07e4-4206-ae89-dee4e9706467.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table/a05681a0-07e4-4206-ae89-dee4e9706467.json deleted file mode 100644 index a2cc2910a..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table/a05681a0-07e4-4206-ae89-dee4e9706467.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6569 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4952 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table/b078f823-d603-4030-81a2-a3ca1a1117f9.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table/b078f823-d603-4030-81a2-a3ca1a1117f9.json deleted file mode 100644 index 0aa1684a6..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table/b078f823-d603-4030-81a2-a3ca1a1117f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5004 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3805 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001/26625158-6720-47c7-8c28-46ca7b4b947e.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001/26625158-6720-47c7-8c28-46ca7b4b947e.json deleted file mode 100644 index 296a0d099..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001/26625158-6720-47c7-8c28-46ca7b4b947e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6042 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002/5e3e8dec-f14b-4b7a-ace1-1e1728395e84.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002/5e3e8dec-f14b-4b7a-ace1-1e1728395e84.json deleted file mode 100644 index ca115a61a..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002/5e3e8dec-f14b-4b7a-ace1-1e1728395e84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7132 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4996 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001/35b4378e-52cd-4ae1-985b-c8e2c00dc61a.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001/35b4378e-52cd-4ae1-985b-c8e2c00dc61a.json deleted file mode 100644 index fbbc12e22..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001/35b4378e-52cd-4ae1-985b-c8e2c00dc61a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5947 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4899 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3581 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3704 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002/4d99a55e-39c0-41c7-9ef0-494f739ceaec.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002/4d99a55e-39c0-41c7-9ef0-494f739ceaec.json deleted file mode 100644 index 60810c2c1..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002/4d99a55e-39c0-41c7-9ef0-494f739ceaec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6453 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4951 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.3939 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table/f3c7bacd-e231-45fd-b503-ee4d34caf4e8.json b/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table/f3c7bacd-e231-45fd-b503-ee4d34caf4e8.json deleted file mode 100644 index 9670d371e..000000000 --- a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table/f3c7bacd-e231-45fd-b503-ee4d34caf4e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table", - "id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5756 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4901 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3659 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table/1bb87d8f-2d66-42b2-a744-1a7cbc2c17dc.json b/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table/1bb87d8f-2d66-42b2-a744-1a7cbc2c17dc.json deleted file mode 100644 index 527102943..000000000 --- a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table/1bb87d8f-2d66-42b2-a744-1a7cbc2c17dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table", - "id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7034 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table/ae10fd26-e648-4fa0-ae24-dfaaf4ff510d.json b/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table/ae10fd26-e648-4fa0-ae24-dfaaf4ff510d.json deleted file mode 100644 index dd8c0822b..000000000 --- a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table/ae10fd26-e648-4fa0-ae24-dfaaf4ff510d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table", - "id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6024 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table/0af58746-0492-4ba7-8a17-c0a5c43d0700.json b/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table/0af58746-0492-4ba7-8a17-c0a5c43d0700.json deleted file mode 100644 index 27ec692c2..000000000 --- a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table/0af58746-0492-4ba7-8a17-c0a5c43d0700.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table", - "id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001/88fff9f5-7aa7-463a-87e0-5fd2f5bacf09.json b/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001/88fff9f5-7aa7-463a-87e0-5fd2f5bacf09.json deleted file mode 100644 index 0f5aeda25..000000000 --- a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001/88fff9f5-7aa7-463a-87e0-5fd2f5bacf09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001", - "id": "xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4915 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002/bc79527d-ae58-4b17-afd8-df931562dbf3.json b/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002/bc79527d-ae58-4b17-afd8-df931562dbf3.json deleted file mode 100644 index 8d52c8dfa..000000000 --- a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002/bc79527d-ae58-4b17-afd8-df931562dbf3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002", - "id": "xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6852 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3832 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001/3e7423d5-ad7e-48e2-bd25-a4946d443c24.json b/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001/3e7423d5-ad7e-48e2-bd25-a4946d443c24.json deleted file mode 100644 index 922c5078c..000000000 --- a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001/3e7423d5-ad7e-48e2-bd25-a4946d443c24.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001", - "id": "xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4887 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/xukp20/llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table/7979fd6a-a886-41cc-987b-356b7c452bff.json b/data/hfopenllm_v2/xukp20/llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table/7979fd6a-a886-41cc-987b-356b7c452bff.json deleted file mode 100644 index 4feb8bd7f..000000000 --- a/data/hfopenllm_v2/xukp20/llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table/7979fd6a-a886-41cc-987b-356b7c452bff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table", - "id": "xukp20/llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xwen-team/Xwen-7B-Chat/2be6bc34-1e61-426f-b963-6e096b5418fb.json 
b/data/hfopenllm_v2/xwen-team/Xwen-7B-Chat/2be6bc34-1e61-426f-b963-6e096b5418fb.json deleted file mode 100644 index fb936d3e3..000000000 --- a/data/hfopenllm_v2/xwen-team/Xwen-7B-Chat/2be6bc34-1e61-426f-b963-6e096b5418fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xwen-team_Xwen-7B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Xwen-7B-Chat", - "id": "xwen-team/Xwen-7B-Chat", - "developer": "xwen-team", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6864 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5068 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xxx777xxxASD/L3.1-ClaudeMaid-4x8B/c4f69339-be6b-4bb4-8faf-a1f40e73d4b0.json b/data/hfopenllm_v2/xxx777xxxASD/L3.1-ClaudeMaid-4x8B/c4f69339-be6b-4bb4-8faf-a1f40e73d4b0.json deleted file mode 100644 index e7afb7649..000000000 --- a/data/hfopenllm_v2/xxx777xxxASD/L3.1-ClaudeMaid-4x8B/c4f69339-be6b-4bb4-8faf-a1f40e73d4b0.json +++ /dev/null @@ -1,132 +0,0 @@ 
-{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xxx777xxxASD_L3.1-ClaudeMaid-4x8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-ClaudeMaid-4x8B", - "id": "xxx777xxxASD/L3.1-ClaudeMaid-4x8B", - "developer": "xxx777xxxASD", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.942 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6696 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yam-peleg/Hebrew-Gemma-11B-Instruct/c845eb10-a028-4cc2-8f64-25d75480c0d5.json b/data/hfopenllm_v2/yam-peleg/Hebrew-Gemma-11B-Instruct/c845eb10-a028-4cc2-8f64-25d75480c0d5.json deleted file mode 100644 index cb76ed21b..000000000 --- a/data/hfopenllm_v2/yam-peleg/Hebrew-Gemma-11B-Instruct/c845eb10-a028-4cc2-8f64-25d75480c0d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yam-peleg_Hebrew-Gemma-11B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open 
LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hebrew-Gemma-11B-Instruct", - "id": "yam-peleg/Hebrew-Gemma-11B-Instruct", - "developer": "yam-peleg", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaForCausalLM", - "params_billions": 10.475 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4036 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4089 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2554 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/377e7223-4876-49b6-8057-b1831d7f129b.json b/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/377e7223-4876-49b6-8057-b1831d7f129b.json deleted file mode 100644 index 7e2ef6384..000000000 --- a/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/377e7223-4876-49b6-8057-b1831d7f129b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yam-peleg_Hebrew-Mistral-7B-200K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hebrew-Mistral-7B-200K", - "id": 
"yam-peleg/Hebrew-Mistral-7B-200K", - "developer": "yam-peleg", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.504 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4149 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3765 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2573 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/4ddb9ed6-0599-482e-b12e-bcb01975cc85.json b/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/4ddb9ed6-0599-482e-b12e-bcb01975cc85.json deleted file mode 100644 index d83bce8f2..000000000 --- a/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/4ddb9ed6-0599-482e-b12e-bcb01975cc85.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yam-peleg_Hebrew-Mistral-7B-200K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hebrew-Mistral-7B-200K", - "id": "yam-peleg/Hebrew-Mistral-7B-200K", - "developer": "yam-peleg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.504 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.177 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3411 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2529 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B/9d5af106-be69-4b62-99c1-fcfb6091d080.json b/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B/9d5af106-be69-4b62-99c1-fcfb6091d080.json deleted file mode 100644 index 93e4556ea..000000000 --- a/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B/9d5af106-be69-4b62-99c1-fcfb6091d080.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yam-peleg_Hebrew-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hebrew-Mistral-7B", - "id": "yam-peleg/Hebrew-Mistral-7B", - "developer": "yam-peleg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.504 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4334 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.278 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yanng1242/Marcoro14-7B-slerp/2f2d7a55-2838-446d-9487-a6cfa0c03356.json b/data/hfopenllm_v2/yanng1242/Marcoro14-7B-slerp/2f2d7a55-2838-446d-9487-a6cfa0c03356.json deleted file mode 100644 index 348a5fe37..000000000 --- a/data/hfopenllm_v2/yanng1242/Marcoro14-7B-slerp/2f2d7a55-2838-446d-9487-a6cfa0c03356.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yanng1242_Marcoro14-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Marcoro14-7B-slerp", - "id": "yanng1242/Marcoro14-7B-slerp", - "developer": "yanng1242", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", 
- "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5252 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yasserrmd/Coder-GRPO-3B/65d20d45-f63b-4b09-b66d-5f53297c0c20.json b/data/hfopenllm_v2/yasserrmd/Coder-GRPO-3B/65d20d45-f63b-4b09-b66d-5f53297c0c20.json deleted file mode 100644 index f3af930c7..000000000 --- a/data/hfopenllm_v2/yasserrmd/Coder-GRPO-3B/65d20d45-f63b-4b09-b66d-5f53297c0c20.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yasserrmd_Coder-GRPO-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Coder-GRPO-3B", - "id": "yasserrmd/Coder-GRPO-3B", - "developer": "yasserrmd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4469 - } - }, - { - "evaluation_name": "MATH 
Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3202 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4115 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yasserrmd/Text2SQL-1.5B/4712953f-0777-4b97-8f13-f7309f19f0dc.json b/data/hfopenllm_v2/yasserrmd/Text2SQL-1.5B/4712953f-0777-4b97-8f13-f7309f19f0dc.json deleted file mode 100644 index eca8489e8..000000000 --- a/data/hfopenllm_v2/yasserrmd/Text2SQL-1.5B/4712953f-0777-4b97-8f13-f7309f19f0dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yasserrmd_Text2SQL-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Text2SQL-1.5B", - "id": "yasserrmd/Text2SQL-1.5B", - "developer": "yasserrmd", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2857 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3858 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3942 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/84382308-04b5-439f-b486-b26d20da605a.json b/data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/84382308-04b5-439f-b486-b26d20da605a.json deleted file mode 100644 index e45b7fe88..000000000 --- a/data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/84382308-04b5-439f-b486-b26d20da605a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ycros_BagelMIsteryTour-v2-8x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BagelMIsteryTour-v2-8x7B", - "id": "ycros/BagelMIsteryTour-v2-8x7B", - "developer": "ycros", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6262 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3481 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/e82be06f-14ed-45e8-a273-d28c50f5212b.json b/data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/e82be06f-14ed-45e8-a273-d28c50f5212b.json deleted file mode 100644 index 7ac95c1dd..000000000 --- a/data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/e82be06f-14ed-45e8-a273-d28c50f5212b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ycros_BagelMIsteryTour-v2-8x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BagelMIsteryTour-v2-8x7B", - "id": "ycros/BagelMIsteryTour-v2-8x7B", - "developer": "ycros", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5994 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5159 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - 
}, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table/5815ba55-40fc-4f8e-ae0b-b329c42fd503.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table/5815ba55-40fc-4f8e-ae0b-b329c42fd503.json deleted file mode 100644 index fd37fa6d3..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table/5815ba55-40fc-4f8e-ae0b-b329c42fd503.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6709 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4987 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table/e58eceb3-b501-4924-9d0d-98d7da3c16c5.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table/e58eceb3-b501-4924-9d0d-98d7da3c16c5.json deleted file mode 100644 index 3a674e52d..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table/e58eceb3-b501-4924-9d0d-98d7da3c16c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7333 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3806 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3748 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table/5a88455c-7699-4c49-8a12-76cda15d878c.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table/5a88455c-7699-4c49-8a12-76cda15d878c.json deleted file mode 100644 index 6b39085c6..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table/5a88455c-7699-4c49-8a12-76cda15d878c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6785 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4941 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3718 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table/122b4c1e-6e6c-4db5-8991-b091361c3ecf.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table/122b4c1e-6e6c-4db5-8991-b091361c3ecf.json deleted file mode 100644 index f983fdb6a..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table/122b4c1e-6e6c-4db5-8991-b091361c3ecf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7132 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5025 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001/6abeb0e4-32ee-4dbb-9902-b19cc96a2aa7.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001/6abeb0e4-32ee-4dbb-9902-b19cc96a2aa7.json deleted file mode 100644 index 269e54be1..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001/6abeb0e4-32ee-4dbb-9902-b19cc96a2aa7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002/679f214f-e03f-47a9-8a11-91adbf1c4880.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002/679f214f-e03f-47a9-8a11-91adbf1c4880.json deleted file mode 100644 index 7cceda8ce..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002/679f214f-e03f-47a9-8a11-91adbf1c4880.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7196 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001/680e77b8-9c64-4c52-aa83-55236039cef1.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001/680e77b8-9c64-4c52-aa83-55236039cef1.json deleted file mode 100644 index 78466338e..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001/680e77b8-9c64-4c52-aa83-55236039cef1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6504 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4958 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3703 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002/c24c471c-14b3-462e-8b81-6548b27e5ffc.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002/c24c471c-14b3-462e-8b81-6548b27e5ffc.json deleted file mode 100644 index 3d58afca0..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002/c24c471c-14b3-462e-8b81-6548b27e5ffc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7016 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/yifAI/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002/efa7fa62-2e8b-403c-b345-eef876b48dbd.json b/data/hfopenllm_v2/yifAI/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002/efa7fa62-2e8b-403c-b345-eef876b48dbd.json deleted file mode 100644 index 183ccc754..000000000 --- a/data/hfopenllm_v2/yifAI/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002/efa7fa62-2e8b-403c-b345-eef876b48dbd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yifAI_Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002", - "id": "yifAI/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002", - "developer": "yifAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4915 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3899 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.352 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/ylalain/ECE-PRYMMAL-YL-1B-SLERP-V8/40bae762-65bd-4b4c-b422-ffd0fd3790a9.json b/data/hfopenllm_v2/ylalain/ECE-PRYMMAL-YL-1B-SLERP-V8/40bae762-65bd-4b4c-b422-ffd0fd3790a9.json deleted file mode 100644 index 1a855481e..000000000 --- a/data/hfopenllm_v2/ylalain/ECE-PRYMMAL-YL-1B-SLERP-V8/40bae762-65bd-4b4c-b422-ffd0fd3790a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ylalain_ECE-PRYMMAL-YL-1B-SLERP-V8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-1B-SLERP-V8", - "id": "ylalain/ECE-PRYMMAL-YL-1B-SLERP-V8", - "developer": "ylalain", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.357 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3976 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/Llama-3.1-8B-GRPO-Instruct/596957cc-719c-44c7-8284-06a9ba0d1a30.json b/data/hfopenllm_v2/ymcki/Llama-3.1-8B-GRPO-Instruct/596957cc-719c-44c7-8284-06a9ba0d1a30.json deleted file mode 100644 
index a68709211..000000000 --- a/data/hfopenllm_v2/ymcki/Llama-3.1-8B-GRPO-Instruct/596957cc-719c-44c7-8284-06a9ba0d1a30.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_Llama-3.1-8B-GRPO-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-GRPO-Instruct", - "id": "ymcki/Llama-3.1-8B-GRPO-Instruct", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5132 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/Llama-3.1-8B-SFT-GRPO-Instruct/706bbc09-f867-4327-bc4d-b5ede41ebd93.json b/data/hfopenllm_v2/ymcki/Llama-3.1-8B-SFT-GRPO-Instruct/706bbc09-f867-4327-bc4d-b5ede41ebd93.json deleted file mode 100644 index 3032b3ce7..000000000 --- a/data/hfopenllm_v2/ymcki/Llama-3.1-8B-SFT-GRPO-Instruct/706bbc09-f867-4327-bc4d-b5ede41ebd93.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/ymcki_Llama-3.1-8B-SFT-GRPO-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-SFT-GRPO-Instruct", - "id": "ymcki/Llama-3.1-8B-SFT-GRPO-Instruct", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3526 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18-merge/8962e9be-75bf-4f57-8ce2-b29523740851.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18-merge/8962e9be-75bf-4f57-8ce2-b29523740851.json deleted file mode 100644 index 58ab0bf05..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18-merge/8962e9be-75bf-4f57-8ce2-b29523740851.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-ORPO-jpn-it-abliterated-18-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": 
"HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-ORPO-jpn-it-abliterated-18-merge", - "id": "ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18-merge", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5218 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4147 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2461 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18/014f4838-22ff-4802-a887-4d2de01a9256.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18/014f4838-22ff-4802-a887-4d2de01a9256.json deleted file mode 100644 index ff7346956..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18/014f4838-22ff-4802-a887-4d2de01a9256.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-ORPO-jpn-it-abliterated-18/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "gemma-2-2b-ORPO-jpn-it-abliterated-18", - "id": "ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4631 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4053 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-18-24/5c6eac9c-0ec6-4364-a86b-dcd894d69f0b.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-18-24/5c6eac9c-0ec6-4364-a86b-dcd894d69f0b.json deleted file mode 100644 index 758147784..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-18-24/5c6eac9c-0ec6-4364-a86b-dcd894d69f0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-jpn-it-abliterated-17-18-24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it-abliterated-17-18-24", - "id": "ymcki/gemma-2-2b-jpn-it-abliterated-17-18-24", - "developer": "ymcki", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5055 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2282 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca/09b81cf2-3b79-448c-ab8e-87e378c804bb.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca/09b81cf2-3b79-448c-ab8e-87e378c804bb.json deleted file mode 100644 index d9039ff8d..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca/09b81cf2-3b79-448c-ab8e-87e378c804bb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca", - "id": "ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3065 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4072 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3969 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2249 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO/28b9977a-db3d-4f38-b1f7-bd0cdcab5504.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO/28b9977a-db3d-4f38-b1f7-bd0cdcab5504.json deleted file mode 100644 index 89ebbecdf..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO/28b9977a-db3d-4f38-b1f7-bd0cdcab5504.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-jpn-it-abliterated-17-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it-abliterated-17-ORPO", - "id": "ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4748 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3898 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2191 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17/845ea162-cfa1-47f4-8914-d81d9bf1bb7d.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17/845ea162-cfa1-47f4-8914-d81d9bf1bb7d.json deleted file mode 100644 index b0dc22bac..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17/845ea162-cfa1-47f4-8914-d81d9bf1bb7d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-jpn-it-abliterated-17/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it-abliterated-17", - "id": "ymcki/gemma-2-2b-jpn-it-abliterated-17", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4076 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2455 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18-ORPO/706737c7-cd1a-4958-9ffc-2655f0b50178.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18-ORPO/706737c7-cd1a-4958-9ffc-2655f0b50178.json deleted file mode 100644 index 1d4acd4a2..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18-ORPO/706737c7-cd1a-4958-9ffc-2655f0b50178.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-jpn-it-abliterated-18-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it-abliterated-18-ORPO", - "id": "ymcki/gemma-2-2b-jpn-it-abliterated-18-ORPO", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4742 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3953 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2185 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18/5acd58cd-8dfb-4fb7-8832-6bc151e0b1a1.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18/5acd58cd-8dfb-4fb7-8832-6bc151e0b1a1.json deleted file mode 100644 index 902673fd5..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18/5acd58cd-8dfb-4fb7-8832-6bc151e0b1a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-jpn-it-abliterated-18/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it-abliterated-18", - "id": "ymcki/gemma-2-2b-jpn-it-abliterated-18", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5175 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4132 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-24/d374a68d-b985-47c2-b087-500bffa93c80.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-24/d374a68d-b985-47c2-b087-500bffa93c80.json deleted file mode 100644 index 7ae15b60f..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-24/d374a68d-b985-47c2-b087-500bffa93c80.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-jpn-it-abliterated-24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it-abliterated-24", - "id": "ymcki/gemma-2-2b-jpn-it-abliterated-24", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3915 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yuchenxie/ArlowGPT-3B-Multilingual/23fbceb0-b646-4945-b17f-66dde24a0e43.json b/data/hfopenllm_v2/yuchenxie/ArlowGPT-3B-Multilingual/23fbceb0-b646-4945-b17f-66dde24a0e43.json deleted file mode 100644 index 266d9bf50..000000000 --- a/data/hfopenllm_v2/yuchenxie/ArlowGPT-3B-Multilingual/23fbceb0-b646-4945-b17f-66dde24a0e43.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yuchenxie_ArlowGPT-3B-Multilingual/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ArlowGPT-3B-Multilingual", - "id": "yuchenxie/ArlowGPT-3B-Multilingual", - "developer": "yuchenxie", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4301 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yuchenxie/ArlowGPT-8B/73d9e204-e829-4159-b340-6d9581c6f0e1.json b/data/hfopenllm_v2/yuchenxie/ArlowGPT-8B/73d9e204-e829-4159-b340-6d9581c6f0e1.json deleted file mode 100644 index 4cc6ee856..000000000 --- a/data/hfopenllm_v2/yuchenxie/ArlowGPT-8B/73d9e204-e829-4159-b340-6d9581c6f0e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yuchenxie_ArlowGPT-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ArlowGPT-8B", - "id": "yuchenxie/ArlowGPT-8B", - "developer": "yuchenxie", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7847 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2039 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO/a6979dda-fba6-4104-b153-3b0a89de8585.json b/data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO/a6979dda-fba6-4104-b153-3b0a89de8585.json deleted file mode 100644 index 9cefceabb..000000000 --- a/data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO/a6979dda-fba6-4104-b153-3b0a89de8585.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yuvraj17_Llama3-8B-SuperNova-Spectrum-Hermes-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-8B-SuperNova-Spectrum-Hermes-DPO", - "id": "yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO", - "developer": "yuvraj17", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4691 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4012 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2635 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-dare_ties/62e04968-0c5c-4aad-a434-d9d24bccbdb8.json b/data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-dare_ties/62e04968-0c5c-4aad-a434-d9d24bccbdb8.json deleted file mode 100644 index 76030c29d..000000000 --- a/data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-dare_ties/62e04968-0c5c-4aad-a434-d9d24bccbdb8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yuvraj17_Llama3-8B-SuperNova-Spectrum-dare_ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-8B-SuperNova-Spectrum-dare_ties", - "id": "yuvraj17/Llama3-8B-SuperNova-Spectrum-dare_ties", - "developer": "yuvraj17", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4013 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4616 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yuvraj17/Llama3-8B-abliterated-Spectrum-slerp/bae4064e-b10f-4082-876d-e4168ca1a8cc.json b/data/hfopenllm_v2/yuvraj17/Llama3-8B-abliterated-Spectrum-slerp/bae4064e-b10f-4082-876d-e4168ca1a8cc.json deleted file mode 100644 index 95dd6c5ed..000000000 --- a/data/hfopenllm_v2/yuvraj17/Llama3-8B-abliterated-Spectrum-slerp/bae4064e-b10f-4082-876d-e4168ca1a8cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yuvraj17_Llama3-8B-abliterated-Spectrum-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-8B-abliterated-Spectrum-slerp", - "id": "yuvraj17/Llama3-8B-abliterated-Spectrum-slerp", - "developer": "yuvraj17", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2885 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3257 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zake7749/gemma-2-2b-it-chinese-kyara-dpo/0040b48c-0f54-4c9b-97ee-1ca833c68e36.json b/data/hfopenllm_v2/zake7749/gemma-2-2b-it-chinese-kyara-dpo/0040b48c-0f54-4c9b-97ee-1ca833c68e36.json deleted file mode 100644 index c07d9eb08..000000000 --- a/data/hfopenllm_v2/zake7749/gemma-2-2b-it-chinese-kyara-dpo/0040b48c-0f54-4c9b-97ee-1ca833c68e36.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zake7749_gemma-2-2b-it-chinese-kyara-dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-it-chinese-kyara-dpo", - "id": "zake7749/gemma-2-2b-it-chinese-kyara-dpo", - "developer": "zake7749", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4257 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4576 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2573 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zake7749/gemma-2-9b-it-chinese-kyara/6050e969-bcde-4594-8e53-05fa74c7287d.json b/data/hfopenllm_v2/zake7749/gemma-2-9b-it-chinese-kyara/6050e969-bcde-4594-8e53-05fa74c7287d.json deleted file mode 100644 index 0bf944e0e..000000000 --- a/data/hfopenllm_v2/zake7749/gemma-2-9b-it-chinese-kyara/6050e969-bcde-4594-8e53-05fa74c7287d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zake7749_gemma-2-9b-it-chinese-kyara/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it-chinese-kyara", - "id": "zake7749/gemma-2-9b-it-chinese-kyara", - "developer": "zake7749", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1764 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5954 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/Gemma-2-TM-9B/3aaee358-bf3e-4d91-91bf-bd42e0a7c61e.json b/data/hfopenllm_v2/zelk12/Gemma-2-TM-9B/3aaee358-bf3e-4d91-91bf-bd42e0a7c61e.json deleted file mode 100644 index c972fae87..000000000 --- a/data/hfopenllm_v2/zelk12/Gemma-2-TM-9B/3aaee358-bf3e-4d91-91bf-bd42e0a7c61e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_Gemma-2-TM-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-TM-9B", - "id": "zelk12/Gemma-2-TM-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5987 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4152 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4088 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen1-gemma-2-9B/ef5f4fb2-f409-49dc-b3f0-f3e19585cd8a.json 
b/data/hfopenllm_v2/zelk12/MT-Gen1-gemma-2-9B/ef5f4fb2-f409-49dc-b3f0-f3e19585cd8a.json deleted file mode 100644 index 14766f899..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen1-gemma-2-9B/ef5f4fb2-f409-49dc-b3f0-f3e19585cd8a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen1-gemma-2-9B", - "id": "zelk12/MT-Gen1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7886 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2221 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen2-GI-gemma-2-9B/4048fa60-7427-4f7e-9939-e270aa5e8b51.json b/data/hfopenllm_v2/zelk12/MT-Gen2-GI-gemma-2-9B/4048fa60-7427-4f7e-9939-e270aa5e8b51.json deleted file mode 100644 index aab0e6abb..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen2-GI-gemma-2-9B/4048fa60-7427-4f7e-9939-e270aa5e8b51.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen2-GI-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen2-GI-gemma-2-9B", - "id": "zelk12/MT-Gen2-GI-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7914 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4283 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen2-gemma-2-9B/f5c9baea-f2cf-414a-937a-6a43f55a1c1d.json b/data/hfopenllm_v2/zelk12/MT-Gen2-gemma-2-9B/f5c9baea-f2cf-414a-937a-6a43f55a1c1d.json deleted file mode 100644 index 1b3fd4238..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen2-gemma-2-9B/f5c9baea-f2cf-414a-937a-6a43f55a1c1d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen2-gemma-2-9B", - "id": "zelk12/MT-Gen2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7907 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen3-gemma-2-9B/1da70796-d40b-4f2a-8ce3-b304f414a6d5.json b/data/hfopenllm_v2/zelk12/MT-Gen3-gemma-2-9B/1da70796-d40b-4f2a-8ce3-b304f414a6d5.json deleted file mode 100644 index 9a41654cb..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen3-gemma-2-9B/1da70796-d40b-4f2a-8ce3-b304f414a6d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen3-gemma-2-9B", - "id": "zelk12/MT-Gen3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6097 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2296 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen4-gemma-2-9B/de476f79-2539-4f9e-a1d2-901c6c4342d4.json b/data/hfopenllm_v2/zelk12/MT-Gen4-gemma-2-9B/de476f79-2539-4f9e-a1d2-901c6c4342d4.json deleted file mode 100644 index 6c3bf327c..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen4-gemma-2-9B/de476f79-2539-4f9e-a1d2-901c6c4342d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen4-gemma-2-9B", - "id": "zelk12/MT-Gen4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2236 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen5-gemma-2-9B/80aee542-c894-46b6-a6ed-9f3400aefa9e.json b/data/hfopenllm_v2/zelk12/MT-Gen5-gemma-2-9B/80aee542-c894-46b6-a6ed-9f3400aefa9e.json deleted file mode 100644 index 758fecb7b..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen5-gemma-2-9B/80aee542-c894-46b6-a6ed-9f3400aefa9e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen5-gemma-2-9B", - "id": "zelk12/MT-Gen5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7923 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6133 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen6-gemma-2-9B/5c9d4eaf-0985-4f9e-8007-08b4081bb19d.json b/data/hfopenllm_v2/zelk12/MT-Gen6-gemma-2-9B/5c9d4eaf-0985-4f9e-8007-08b4081bb19d.json deleted file mode 100644 index d42787489..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen6-gemma-2-9B/5c9d4eaf-0985-4f9e-8007-08b4081bb19d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen6-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen6-gemma-2-9B", - "id": "zelk12/MT-Gen6-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.5845 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen6fix-gemma-2-9B/4b019824-8454-4ce8-aa49-d122a2491f9c.json b/data/hfopenllm_v2/zelk12/MT-Gen6fix-gemma-2-9B/4b019824-8454-4ce8-aa49-d122a2491f9c.json deleted file mode 100644 index ea018b3ba..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen6fix-gemma-2-9B/4b019824-8454-4ce8-aa49-d122a2491f9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen6fix-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen6fix-gemma-2-9B", - "id": "zelk12/MT-Gen6fix-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1576 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5917 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen7-gemma-2-9B/0dfcd13c-f057-4aec-82ad-b5cf2b266502.json b/data/hfopenllm_v2/zelk12/MT-Gen7-gemma-2-9B/0dfcd13c-f057-4aec-82ad-b5cf2b266502.json deleted file mode 100644 index dd5c80820..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen7-gemma-2-9B/0dfcd13c-f057-4aec-82ad-b5cf2b266502.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen7-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen7-gemma-2-9B", - "id": "zelk12/MT-Gen7-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1664 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Max-Merge_02012025163610-gemma-2-9B/927589bf-f6a0-4155-a24b-120231bbf029.json b/data/hfopenllm_v2/zelk12/MT-Max-Merge_02012025163610-gemma-2-9B/927589bf-f6a0-4155-a24b-120231bbf029.json deleted file mode 100644 index 57eea20ea..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Max-Merge_02012025163610-gemma-2-9B/927589bf-f6a0-4155-a24b-120231bbf029.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Max-Merge_02012025163610-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Max-Merge_02012025163610-gemma-2-9B", - "id": "zelk12/MT-Max-Merge_02012025163610-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7907 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge-gemma-2-9B/1a2740cb-c541-434e-89a1-7a9fd2c4cabd.json b/data/hfopenllm_v2/zelk12/MT-Merge-gemma-2-9B/1a2740cb-c541-434e-89a1-7a9fd2c4cabd.json deleted file mode 100644 index 3dbd1e532..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge-gemma-2-9B/1a2740cb-c541-434e-89a1-7a9fd2c4cabd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Merge-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge-gemma-2-9B", - "id": "zelk12/MT-Merge-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8035 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6118 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4362 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge1-gemma-2-9B/0110d1c9-755e-4f09-888b-0c9c1a263639.json b/data/hfopenllm_v2/zelk12/MT-Merge1-gemma-2-9B/0110d1c9-755e-4f09-888b-0c9c1a263639.json deleted file mode 100644 index d496666d2..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge1-gemma-2-9B/0110d1c9-755e-4f09-888b-0c9c1a263639.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Merge1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge1-gemma-2-9B", - "id": "zelk12/MT-Merge1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7901 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2289 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge2-MU-gemma-2-MTg2MT1g2-9B/cda65781-494c-45bd-8c32-7b1fe987f31c.json b/data/hfopenllm_v2/zelk12/MT-Merge2-MU-gemma-2-MTg2MT1g2-9B/cda65781-494c-45bd-8c32-7b1fe987f31c.json deleted file mode 100644 index 66adad61e..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge2-MU-gemma-2-MTg2MT1g2-9B/cda65781-494c-45bd-8c32-7b1fe987f31c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Merge2-MU-gemma-2-MTg2MT1g2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge2-MU-gemma-2-MTg2MT1g2-9B", - "id": "zelk12/MT-Merge2-MU-gemma-2-MTg2MT1g2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7956 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4322 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge2-gemma-2-9B/2fd7de02-f8d9-45c1-9bb5-db5134bd4862.json b/data/hfopenllm_v2/zelk12/MT-Merge2-gemma-2-9B/2fd7de02-f8d9-45c1-9bb5-db5134bd4862.json deleted file mode 100644 index 6d090a24c..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge2-gemma-2-9B/2fd7de02-f8d9-45c1-9bb5-db5134bd4862.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Merge2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge2-gemma-2-9B", - "id": "zelk12/MT-Merge2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7877 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - } - ] -} \ 
No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge3-gemma-2-9B/acf07f51-5acd-4375-bafa-7a1a244db3c6.json b/data/hfopenllm_v2/zelk12/MT-Merge3-gemma-2-9B/acf07f51-5acd-4375-bafa-7a1a244db3c6.json deleted file mode 100644 index 78c6f3258..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge3-gemma-2-9B/acf07f51-5acd-4375-bafa-7a1a244db3c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Merge3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge3-gemma-2-9B", - "id": "zelk12/MT-Merge3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7859 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6102 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4258 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge4-gemma-2-9B/ff985193-ba26-45d3-97be-b7d3b17ab4d7.json b/data/hfopenllm_v2/zelk12/MT-Merge4-gemma-2-9B/ff985193-ba26-45d3-97be-b7d3b17ab4d7.json deleted file mode 100644 index 
f73843da3..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge4-gemma-2-9B/ff985193-ba26-45d3-97be-b7d3b17ab4d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Merge4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge4-gemma-2-9B", - "id": "zelk12/MT-Merge4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7807 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6118 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2168 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4294 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge5-gemma-2-9B/21dbea2c-5cb1-431c-a496-af9b932b3440.json b/data/hfopenllm_v2/zelk12/MT-Merge5-gemma-2-9B/21dbea2c-5cb1-431c-a496-af9b932b3440.json deleted file mode 100644 index 9e45d9643..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge5-gemma-2-9B/21dbea2c-5cb1-431c-a496-af9b932b3440.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/zelk12_MT-Merge5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge5-gemma-2-9B", - "id": "zelk12/MT-Merge5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7844 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6123 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4281 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge6-gemma-2-9B/1143955c-c32c-4b41-8484-2c77e72f4946.json b/data/hfopenllm_v2/zelk12/MT-Merge6-gemma-2-9B/1143955c-c32c-4b41-8484-2c77e72f4946.json deleted file mode 100644 index 348e60c56..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge6-gemma-2-9B/1143955c-c32c-4b41-8484-2c77e72f4946.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Merge6-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge6-gemma-2-9B", - "id": "zelk12/MT-Merge6-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5949 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-gemma-2-9B/94824ceb-08c3-415c-8003-b70a0d9af09d.json b/data/hfopenllm_v2/zelk12/MT-gemma-2-9B/94824ceb-08c3-415c-8003-b70a0d9af09d.json deleted file mode 100644 index 2c62b725d..000000000 --- a/data/hfopenllm_v2/zelk12/MT-gemma-2-9B/94824ceb-08c3-415c-8003-b70a0d9af09d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-gemma-2-9B", - "id": "zelk12/MT-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - 
"params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7968 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen1-gemma-2-9B/bf2903cb-b954-4870-98c3-116a96aa49fb.json b/data/hfopenllm_v2/zelk12/MT1-Gen1-gemma-2-9B/bf2903cb-b954-4870-98c3-116a96aa49fb.json deleted file mode 100644 index 8e7b2f927..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen1-gemma-2-9B/bf2903cb-b954-4870-98c3-116a96aa49fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen1-gemma-2-9B", - "id": "zelk12/MT1-Gen1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7974 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6118 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen2-gemma-2-9B/b089c439-a38c-438d-bdad-1c68a1265d95.json b/data/hfopenllm_v2/zelk12/MT1-Gen2-gemma-2-9B/b089c439-a38c-438d-bdad-1c68a1265d95.json deleted file mode 100644 index 714f19ac3..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen2-gemma-2-9B/b089c439-a38c-438d-bdad-1c68a1265d95.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen2-gemma-2-9B", - "id": "zelk12/MT1-Gen2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7984 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2251 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4355 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen3-gemma-2-9B/c988815b-50e5-47e4-a418-bbbcdf1eb4a0.json b/data/hfopenllm_v2/zelk12/MT1-Gen3-gemma-2-9B/c988815b-50e5-47e4-a418-bbbcdf1eb4a0.json deleted file mode 100644 index e7e79fa7b..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen3-gemma-2-9B/c988815b-50e5-47e4-a418-bbbcdf1eb4a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen3-gemma-2-9B", - "id": "zelk12/MT1-Gen3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6102 - } - 
}, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4349 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen4-gemma-2-9B/fa11d66c-7ebc-4b81-83b7-d35a4ff23d3f.json b/data/hfopenllm_v2/zelk12/MT1-Gen4-gemma-2-9B/fa11d66c-7ebc-4b81-83b7-d35a4ff23d3f.json deleted file mode 100644 index b6d5e1c4b..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen4-gemma-2-9B/fa11d66c-7ebc-4b81-83b7-d35a4ff23d3f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen4-gemma-2-9B", - "id": "zelk12/MT1-Gen4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6058 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4286 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen5-IF-gemma-2-S2DMv1-9B/1c81787b-594e-4bb6-aee1-7f193a628b16.json b/data/hfopenllm_v2/zelk12/MT1-Gen5-IF-gemma-2-S2DMv1-9B/1c81787b-594e-4bb6-aee1-7f193a628b16.json deleted file mode 100644 index c2e0e4354..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen5-IF-gemma-2-S2DMv1-9B/1c81787b-594e-4bb6-aee1-7f193a628b16.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen5-IF-gemma-2-S2DMv1-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen5-IF-gemma-2-S2DMv1-9B", - "id": "zelk12/MT1-Gen5-IF-gemma-2-S2DMv1-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7929 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4218 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen5-gemma-2-9B/fd9ce37e-d43d-4ec2-94ec-0eb42e3cc685.json b/data/hfopenllm_v2/zelk12/MT1-Gen5-gemma-2-9B/fd9ce37e-d43d-4ec2-94ec-0eb42e3cc685.json deleted file mode 100644 index c299e1f9b..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen5-gemma-2-9B/fd9ce37e-d43d-4ec2-94ec-0eb42e3cc685.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen5-gemma-2-9B", - "id": "zelk12/MT1-Gen5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6017 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4222 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen6-gemma-2-9B/0625f09a-3e02-410b-963b-49b83dfc5c8f.json b/data/hfopenllm_v2/zelk12/MT1-Gen6-gemma-2-9B/0625f09a-3e02-410b-963b-49b83dfc5c8f.json deleted file mode 100644 index 550cda232..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen6-gemma-2-9B/0625f09a-3e02-410b-963b-49b83dfc5c8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen6-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen6-gemma-2-9B", - "id": "zelk12/MT1-Gen6-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1634 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5944 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4044 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4133 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen7-gemma-2-9B/50c1399e-b409-4dff-b4d6-9be01dbb02c7.json b/data/hfopenllm_v2/zelk12/MT1-Gen7-gemma-2-9B/50c1399e-b409-4dff-b4d6-9be01dbb02c7.json deleted file mode 100644 index 2e79a59e4..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen7-gemma-2-9B/50c1399e-b409-4dff-b4d6-9be01dbb02c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen7-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen7-gemma-2-9B", - "id": "zelk12/MT1-Gen7-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1634 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5938 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", 
- "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Max-Merge_02012025163610-gemma-2-9B/402bdb4a-b258-40a4-ac9f-de74026c02f3.json b/data/hfopenllm_v2/zelk12/MT1-Max-Merge_02012025163610-gemma-2-9B/402bdb4a-b258-40a4-ac9f-de74026c02f3.json deleted file mode 100644 index 09acf9468..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Max-Merge_02012025163610-gemma-2-9B/402bdb4a-b258-40a4-ac9f-de74026c02f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Max-Merge_02012025163610-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Max-Merge_02012025163610-gemma-2-9B", - "id": "zelk12/MT1-Max-Merge_02012025163610-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7929 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6123 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4255 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-gemma-2-9B/65dcf458-db0f-45cd-a8a4-e16108e51161.json b/data/hfopenllm_v2/zelk12/MT1-gemma-2-9B/65dcf458-db0f-45cd-a8a4-e16108e51161.json deleted file mode 100644 index f7d6bc79e..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-gemma-2-9B/65dcf458-db0f-45cd-a8a4-e16108e51161.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-gemma-2-9B", - "id": "zelk12/MT1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7947 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2236 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4322 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Gen1-gemma-2-9B/f1346b1a-0e66-4d80-bfad-ccbe0a8e2abf.json 
b/data/hfopenllm_v2/zelk12/MT2-Gen1-gemma-2-9B/f1346b1a-0e66-4d80-bfad-ccbe0a8e2abf.json deleted file mode 100644 index 44374c9d5..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Gen1-gemma-2-9B/f1346b1a-0e66-4d80-bfad-ccbe0a8e2abf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Gen1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Gen1-gemma-2-9B", - "id": "zelk12/MT2-Gen1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4377 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Gen2-gemma-2-9B/11e7b55a-d872-474a-98a6-fc82ce5a863e.json b/data/hfopenllm_v2/zelk12/MT2-Gen2-gemma-2-9B/11e7b55a-d872-474a-98a6-fc82ce5a863e.json deleted file mode 100644 index 3faf801d3..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Gen2-gemma-2-9B/11e7b55a-d872-474a-98a6-fc82ce5a863e.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Gen2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Gen2-gemma-2-9B", - "id": "zelk12/MT2-Gen2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.427 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Gen3-gemma-2-9B/19688633-fa6c-412a-8dbc-c16fc49b3276.json b/data/hfopenllm_v2/zelk12/MT2-Gen3-gemma-2-9B/19688633-fa6c-412a-8dbc-c16fc49b3276.json deleted file mode 100644 index b70f067a8..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Gen3-gemma-2-9B/19688633-fa6c-412a-8dbc-c16fc49b3276.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Gen3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Gen3-gemma-2-9B", - "id": "zelk12/MT2-Gen3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.781 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Gen4-gemma-2-9B/7d67eb9c-a4d8-4b86-8c24-928ebbe58de7.json b/data/hfopenllm_v2/zelk12/MT2-Gen4-gemma-2-9B/7d67eb9c-a4d8-4b86-8c24-928ebbe58de7.json deleted file mode 100644 index 70dd1549f..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Gen4-gemma-2-9B/7d67eb9c-a4d8-4b86-8c24-928ebbe58de7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Gen4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Gen4-gemma-2-9B", - "id": "zelk12/MT2-Gen4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": 
{ - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7896 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6097 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2236 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4321 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Gen5-gemma-2-9B/447f880c-643f-4041-8cdb-87697d798085.json b/data/hfopenllm_v2/zelk12/MT2-Gen5-gemma-2-9B/447f880c-643f-4041-8cdb-87697d798085.json deleted file mode 100644 index 2ef223d58..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Gen5-gemma-2-9B/447f880c-643f-4041-8cdb-87697d798085.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Gen5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Gen5-gemma-2-9B", - "id": "zelk12/MT2-Gen5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7749 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4302 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Gen6-gemma-2-9B/653d459e-f8b7-48bc-a9db-779e515532cf.json b/data/hfopenllm_v2/zelk12/MT2-Gen6-gemma-2-9B/653d459e-f8b7-48bc-a9db-779e515532cf.json deleted file mode 100644 index 828f27f8f..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Gen6-gemma-2-9B/653d459e-f8b7-48bc-a9db-779e515532cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Gen6-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Gen6-gemma-2-9B", - "id": "zelk12/MT2-Gen6-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1664 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4137 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Gen7-gemma-2-9B/4e56faf6-dbde-4059-b502-32c76bdbed2d.json b/data/hfopenllm_v2/zelk12/MT2-Gen7-gemma-2-9B/4e56faf6-dbde-4059-b502-32c76bdbed2d.json deleted file mode 100644 index 64fc4a08f..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Gen7-gemma-2-9B/4e56faf6-dbde-4059-b502-32c76bdbed2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Gen7-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Gen7-gemma-2-9B", - "id": "zelk12/MT2-Gen7-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6079 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Max-Merge_02012025163610-gemma-2-9B/f161df97-3cc6-48d3-bfc5-d3f01108ecbb.json b/data/hfopenllm_v2/zelk12/MT2-Max-Merge_02012025163610-gemma-2-9B/f161df97-3cc6-48d3-bfc5-d3f01108ecbb.json deleted file mode 100644 index 565a40eb8..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Max-Merge_02012025163610-gemma-2-9B/f161df97-3cc6-48d3-bfc5-d3f01108ecbb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Max-Merge_02012025163610-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Max-Merge_02012025163610-gemma-2-9B", - "id": "zelk12/MT2-Max-Merge_02012025163610-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7901 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6108 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-gemma-2-9B/7d08412d-e987-497f-a6ec-ce0affe0f80f.json b/data/hfopenllm_v2/zelk12/MT2-gemma-2-9B/7d08412d-e987-497f-a6ec-ce0affe0f80f.json deleted file mode 100644 index adfbad27f..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-gemma-2-9B/7d08412d-e987-497f-a6ec-ce0affe0f80f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-gemma-2-9B", - "id": "zelk12/MT2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7886 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6115 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-Gen1-gemma-2-9B/f042f897-cfe8-4d8c-b75b-bbfca44505ea.json b/data/hfopenllm_v2/zelk12/MT3-Gen1-gemma-2-9B/f042f897-cfe8-4d8c-b75b-bbfca44505ea.json deleted file mode 100644 index 4eb9a7a77..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-Gen1-gemma-2-9B/f042f897-cfe8-4d8c-b75b-bbfca44505ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-Gen1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Gen1-gemma-2-9B", - "id": "zelk12/MT3-Gen1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7838 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4151 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4327 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-Gen2-gemma-2-9B/f24ab334-c022-4e34-a930-3fed6ee18793.json b/data/hfopenllm_v2/zelk12/MT3-Gen2-gemma-2-9B/f24ab334-c022-4e34-a930-3fed6ee18793.json deleted file mode 100644 index 84a45fd26..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-Gen2-gemma-2-9B/f24ab334-c022-4e34-a930-3fed6ee18793.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-Gen2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Gen2-gemma-2-9B", - "id": "zelk12/MT3-Gen2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7843 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6091 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2236 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4333 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-Gen3-gemma-2-9B/2bd3c620-780f-452d-92d7-d01a04539939.json b/data/hfopenllm_v2/zelk12/MT3-Gen3-gemma-2-9B/2bd3c620-780f-452d-92d7-d01a04539939.json deleted file mode 100644 index 03053bdd1..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-Gen3-gemma-2-9B/2bd3c620-780f-452d-92d7-d01a04539939.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-Gen3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Gen3-gemma-2-9B", - "id": "zelk12/MT3-Gen3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6089 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4258 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4303 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-Gen4-gemma-2-9B/234042bd-237f-4cc5-8c5d-1eacd2e8bfaa.json b/data/hfopenllm_v2/zelk12/MT3-Gen4-gemma-2-9B/234042bd-237f-4cc5-8c5d-1eacd2e8bfaa.json deleted file mode 100644 index 283c8ee00..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-Gen4-gemma-2-9B/234042bd-237f-4cc5-8c5d-1eacd2e8bfaa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-Gen4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Gen4-gemma-2-9B", - "id": "zelk12/MT3-Gen4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7737 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2062 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B/d8e0a32e-f307-4056-b450-47a12a0a7b15.json b/data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B/d8e0a32e-f307-4056-b450-47a12a0a7b15.json deleted file mode 100644 index e636eb2e9..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B/d8e0a32e-f307-4056-b450-47a12a0a7b15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-Gen5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Gen5-gemma-2-9B", - "id": "zelk12/MT3-Gen5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.799 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6099 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B_v1/9dc3c4f5-8974-4496-8a6e-daa4fe3e3c2a.json b/data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B_v1/9dc3c4f5-8974-4496-8a6e-daa4fe3e3c2a.json deleted file mode 100644 index 90b69beb0..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B_v1/9dc3c4f5-8974-4496-8a6e-daa4fe3e3c2a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-Gen5-gemma-2-9B_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Gen5-gemma-2-9B_v1", - "id": "zelk12/MT3-Gen5-gemma-2-9B_v1", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7996 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6113 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4204 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4359 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-Gen6-gemma-2-9B/037787fb-9c61-4c56-a7fc-704c04b519f7.json b/data/hfopenllm_v2/zelk12/MT3-Gen6-gemma-2-9B/037787fb-9c61-4c56-a7fc-704c04b519f7.json deleted file mode 100644 index bff72f6ad..000000000 --- 
a/data/hfopenllm_v2/zelk12/MT3-Gen6-gemma-2-9B/037787fb-9c61-4c56-a7fc-704c04b519f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-Gen6-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Gen6-gemma-2-9B", - "id": "zelk12/MT3-Gen6-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0884 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4126 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4102 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-Max-Merge_02012025163610-gemma-2-9B/5df3dd8f-4921-4916-8163-8651b796e478.json b/data/hfopenllm_v2/zelk12/MT3-Max-Merge_02012025163610-gemma-2-9B/5df3dd8f-4921-4916-8163-8651b796e478.json deleted file mode 100644 index 71138b3be..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-Max-Merge_02012025163610-gemma-2-9B/5df3dd8f-4921-4916-8163-8651b796e478.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/zelk12_MT3-Max-Merge_02012025163610-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Max-Merge_02012025163610-gemma-2-9B", - "id": "zelk12/MT3-Max-Merge_02012025163610-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6123 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4255 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-gemma-2-9B/50463593-3a53-4b3f-9621-d05670309b7e.json b/data/hfopenllm_v2/zelk12/MT3-gemma-2-9B/50463593-3a53-4b3f-9621-d05670309b7e.json deleted file mode 100644 index 16582b121..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-gemma-2-9B/50463593-3a53-4b3f-9621-d05670309b7e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-gemma-2-9B", - "id": "zelk12/MT3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6131 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2168 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4327 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT4-Gen1-gemma-2-9B/d7fef356-36c7-488f-8f49-997682a2c01a.json b/data/hfopenllm_v2/zelk12/MT4-Gen1-gemma-2-9B/d7fef356-36c7-488f-8f49-997682a2c01a.json deleted file mode 100644 index 7bdf9dc8f..000000000 --- a/data/hfopenllm_v2/zelk12/MT4-Gen1-gemma-2-9B/d7fef356-36c7-488f-8f49-997682a2c01a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT4-Gen1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT4-Gen1-gemma-2-9B", - "id": "zelk12/MT4-Gen1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7895 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4322 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT4-Gen2-gemma-2-9B/42e7abc6-eaa2-4971-90ee-e4d9dbb97ddb.json b/data/hfopenllm_v2/zelk12/MT4-Gen2-gemma-2-9B/42e7abc6-eaa2-4971-90ee-e4d9dbb97ddb.json deleted file mode 100644 index 3b4ba860e..000000000 --- a/data/hfopenllm_v2/zelk12/MT4-Gen2-gemma-2-9B/42e7abc6-eaa2-4971-90ee-e4d9dbb97ddb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT4-Gen2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT4-Gen2-gemma-2-9B", - "id": "zelk12/MT4-Gen2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8051 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6108 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2326 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4257 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT4-Gen3-gemma-2-9B/b1cf06a6-d270-41ae-bb9b-443bdc5446f3.json b/data/hfopenllm_v2/zelk12/MT4-Gen3-gemma-2-9B/b1cf06a6-d270-41ae-bb9b-443bdc5446f3.json deleted file mode 100644 index 5a8bcf436..000000000 --- a/data/hfopenllm_v2/zelk12/MT4-Gen3-gemma-2-9B/b1cf06a6-d270-41ae-bb9b-443bdc5446f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT4-Gen3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT4-Gen3-gemma-2-9B", - "id": "zelk12/MT4-Gen3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7841 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6087 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT4-Gen4-gemma-2-9B/e40ea476-bcc5-4d3b-bf8e-e5048d9cbe42.json b/data/hfopenllm_v2/zelk12/MT4-Gen4-gemma-2-9B/e40ea476-bcc5-4d3b-bf8e-e5048d9cbe42.json deleted file mode 100644 index fd69e6724..000000000 --- a/data/hfopenllm_v2/zelk12/MT4-Gen4-gemma-2-9B/e40ea476-bcc5-4d3b-bf8e-e5048d9cbe42.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT4-Gen4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT4-Gen4-gemma-2-9B", - "id": "zelk12/MT4-Gen4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7874 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6076 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT4-Gen5-gemma-2-9B/731a5f85-a59e-40af-870c-00e519ca0e7e.json b/data/hfopenllm_v2/zelk12/MT4-Gen5-gemma-2-9B/731a5f85-a59e-40af-870c-00e519ca0e7e.json deleted file mode 100644 index bc85cdb46..000000000 --- a/data/hfopenllm_v2/zelk12/MT4-Gen5-gemma-2-9B/731a5f85-a59e-40af-870c-00e519ca0e7e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT4-Gen5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT4-Gen5-gemma-2-9B", - "id": "zelk12/MT4-Gen5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7789 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT4-Max-Merge_02012025163610-gemma-2-9B/38d93ae8-90ec-473c-8570-33d52c46770b.json b/data/hfopenllm_v2/zelk12/MT4-Max-Merge_02012025163610-gemma-2-9B/38d93ae8-90ec-473c-8570-33d52c46770b.json deleted file mode 100644 index 197b1a326..000000000 --- a/data/hfopenllm_v2/zelk12/MT4-Max-Merge_02012025163610-gemma-2-9B/38d93ae8-90ec-473c-8570-33d52c46770b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT4-Max-Merge_02012025163610-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT4-Max-Merge_02012025163610-gemma-2-9B", - "id": "zelk12/MT4-Max-Merge_02012025163610-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1771 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0952 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT4-gemma-2-9B/9072fd28-040b-44df-bd58-6e3f59398189.json b/data/hfopenllm_v2/zelk12/MT4-gemma-2-9B/9072fd28-040b-44df-bd58-6e3f59398189.json deleted file mode 100644 index 7380725c6..000000000 --- a/data/hfopenllm_v2/zelk12/MT4-gemma-2-9B/9072fd28-040b-44df-bd58-6e3f59398189.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT4-gemma-2-9B", - "id": "zelk12/MT4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2085 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4309 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT5-Gen1-gemma-2-9B/14827e00-09c5-4ebd-93cb-8e026ac73d20.json b/data/hfopenllm_v2/zelk12/MT5-Gen1-gemma-2-9B/14827e00-09c5-4ebd-93cb-8e026ac73d20.json deleted file mode 100644 index 045a5d93e..000000000 --- a/data/hfopenllm_v2/zelk12/MT5-Gen1-gemma-2-9B/14827e00-09c5-4ebd-93cb-8e026ac73d20.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT5-Gen1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT5-Gen1-gemma-2-9B", - "id": "zelk12/MT5-Gen1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7831 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4204 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT5-Gen2-gemma-2-9B/11e76d74-b8e0-408f-b429-566faa5d60a2.json b/data/hfopenllm_v2/zelk12/MT5-Gen2-gemma-2-9B/11e76d74-b8e0-408f-b429-566faa5d60a2.json deleted file mode 100644 index 0398ea5dd..000000000 --- a/data/hfopenllm_v2/zelk12/MT5-Gen2-gemma-2-9B/11e76d74-b8e0-408f-b429-566faa5d60a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT5-Gen2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT5-Gen2-gemma-2-9B", - "id": "zelk12/MT5-Gen2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4163 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT5-Gen3-gemma-2-9B/944c84d8-231d-47ef-85f4-23c0286a4a02.json b/data/hfopenllm_v2/zelk12/MT5-Gen3-gemma-2-9B/944c84d8-231d-47ef-85f4-23c0286a4a02.json deleted file mode 100644 index 1735bd1eb..000000000 --- a/data/hfopenllm_v2/zelk12/MT5-Gen3-gemma-2-9B/944c84d8-231d-47ef-85f4-23c0286a4a02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT5-Gen3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT5-Gen3-gemma-2-9B", - "id": "zelk12/MT5-Gen3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2168 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT5-Gen4-gemma-2-9B/47c8da1d-8ce3-4d19-b8b8-6b5e68e2e8ab.json b/data/hfopenllm_v2/zelk12/MT5-Gen4-gemma-2-9B/47c8da1d-8ce3-4d19-b8b8-6b5e68e2e8ab.json deleted file mode 100644 index 4db3a4fe8..000000000 --- a/data/hfopenllm_v2/zelk12/MT5-Gen4-gemma-2-9B/47c8da1d-8ce3-4d19-b8b8-6b5e68e2e8ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT5-Gen4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT5-Gen4-gemma-2-9B", - "id": "zelk12/MT5-Gen4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7835 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6131 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4397 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/zelk12/MT5-Gen5-gemma-2-9B/ca54a8d4-153b-4169-b6ee-133461a9bedd.json b/data/hfopenllm_v2/zelk12/MT5-Gen5-gemma-2-9B/ca54a8d4-153b-4169-b6ee-133461a9bedd.json deleted file mode 100644 index 08cfd2a70..000000000 --- a/data/hfopenllm_v2/zelk12/MT5-Gen5-gemma-2-9B/ca54a8d4-153b-4169-b6ee-133461a9bedd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT5-Gen5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT5-Gen5-gemma-2-9B", - "id": "zelk12/MT5-Gen5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7947 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT5-Max-Merge_02012025163610-gemma-2-9B/652359ec-14f2-4f94-a694-b7dc98819bfc.json b/data/hfopenllm_v2/zelk12/MT5-Max-Merge_02012025163610-gemma-2-9B/652359ec-14f2-4f94-a694-b7dc98819bfc.json deleted file mode 100644 index 
5dfae6c49..000000000 --- a/data/hfopenllm_v2/zelk12/MT5-Max-Merge_02012025163610-gemma-2-9B/652359ec-14f2-4f94-a694-b7dc98819bfc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT5-Max-Merge_02012025163610-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT5-Max-Merge_02012025163610-gemma-2-9B", - "id": "zelk12/MT5-Max-Merge_02012025163610-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6127 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT5-gemma-2-9B/b34f3335-c7a3-431f-b2c8-6f0731a81378.json b/data/hfopenllm_v2/zelk12/MT5-gemma-2-9B/b34f3335-c7a3-431f-b2c8-6f0731a81378.json deleted file mode 100644 index f6eb0dcef..000000000 --- a/data/hfopenllm_v2/zelk12/MT5-gemma-2-9B/b34f3335-c7a3-431f-b2c8-6f0731a81378.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/zelk12_MT5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT5-gemma-2-9B", - "id": "zelk12/MT5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8048 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4204 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MTM-Merge-gemma-2-9B/077306f9-5d40-40dc-9df4-b5ca559af5c7.json b/data/hfopenllm_v2/zelk12/MTM-Merge-gemma-2-9B/077306f9-5d40-40dc-9df4-b5ca559af5c7.json deleted file mode 100644 index 5b5954b79..000000000 --- a/data/hfopenllm_v2/zelk12/MTM-Merge-gemma-2-9B/077306f9-5d40-40dc-9df4-b5ca559af5c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MTM-Merge-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "MTM-Merge-gemma-2-9B", - "id": "zelk12/MTM-Merge-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7798 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6133 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2175 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MTMaMe-Merge_02012025163610-gemma-2-9B/e0f0fe87-8ed3-4398-8683-65aa042d01d9.json b/data/hfopenllm_v2/zelk12/MTMaMe-Merge_02012025163610-gemma-2-9B/e0f0fe87-8ed3-4398-8683-65aa042d01d9.json deleted file mode 100644 index 15b286068..000000000 --- a/data/hfopenllm_v2/zelk12/MTMaMe-Merge_02012025163610-gemma-2-9B/e0f0fe87-8ed3-4398-8683-65aa042d01d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MTMaMe-Merge_02012025163610-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MTMaMe-Merge_02012025163610-gemma-2-9B", - "id": "zelk12/MTMaMe-Merge_02012025163610-gemma-2-9B", - "developer": "zelk12", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6117 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25-gemma-2-9B/2d968d3e-a3df-4bdf-86a4-034087c0d7fc.json b/data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25-gemma-2-9B/2d968d3e-a3df-4bdf-86a4-034087c0d7fc.json deleted file mode 100644 index eeccbef87..000000000 --- a/data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25-gemma-2-9B/2d968d3e-a3df-4bdf-86a4-034087c0d7fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_Rv0.4DMv1t0.25-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rv0.4DMv1t0.25-gemma-2-9B", - "id": "zelk12/Rv0.4DMv1t0.25-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7497 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4309 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25Tt0.25-gemma-2-9B/db476911-87fb-433f-b164-4435718dab46.json b/data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25Tt0.25-gemma-2-9B/db476911-87fb-433f-b164-4435718dab46.json deleted file mode 100644 index 48c1ee534..000000000 --- a/data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25Tt0.25-gemma-2-9B/db476911-87fb-433f-b164-4435718dab46.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_Rv0.4DMv1t0.25Tt0.25-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rv0.4DMv1t0.25Tt0.25-gemma-2-9B", - "id": "zelk12/Rv0.4DMv1t0.25Tt0.25-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7646 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2069 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4283 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4347 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/Rv0.4MT4g2-gemma-2-9B/75a967f6-a8ab-435f-999b-4889e8217dce.json b/data/hfopenllm_v2/zelk12/Rv0.4MT4g2-gemma-2-9B/75a967f6-a8ab-435f-999b-4889e8217dce.json deleted file mode 100644 index eb1701501..000000000 --- a/data/hfopenllm_v2/zelk12/Rv0.4MT4g2-gemma-2-9B/75a967f6-a8ab-435f-999b-4889e8217dce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_Rv0.4MT4g2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rv0.4MT4g2-gemma-2-9B", - "id": "zelk12/Rv0.4MT4g2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.732 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6041 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4417 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/T31122024203920-gemma-2-9B/e072997b-2f79-4d25-b8dc-ebf15ac311e1.json b/data/hfopenllm_v2/zelk12/T31122024203920-gemma-2-9B/e072997b-2f79-4d25-b8dc-ebf15ac311e1.json deleted file mode 100644 index 2fd27ce58..000000000 --- a/data/hfopenllm_v2/zelk12/T31122024203920-gemma-2-9B/e072997b-2f79-4d25-b8dc-ebf15ac311e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_T31122024203920-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T31122024203920-gemma-2-9B", - "id": "zelk12/T31122024203920-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.6096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4322 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/Test01012025155054/6d681a29-0d1a-4054-8250-5246993509f8.json b/data/hfopenllm_v2/zelk12/Test01012025155054/6d681a29-0d1a-4054-8250-5246993509f8.json deleted file mode 100644 index 5d2d4db3f..000000000 --- a/data/hfopenllm_v2/zelk12/Test01012025155054/6d681a29-0d1a-4054-8250-5246993509f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_Test01012025155054/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Test01012025155054", - "id": "zelk12/Test01012025155054", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 3.817 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.283 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/Test01012025155054t0.5_gemma-2/2a6af4ce-e45c-4721-a23c-03071a5e774f.json b/data/hfopenllm_v2/zelk12/Test01012025155054t0.5_gemma-2/2a6af4ce-e45c-4721-a23c-03071a5e774f.json deleted file mode 100644 index e5cfc1c87..000000000 --- a/data/hfopenllm_v2/zelk12/Test01012025155054t0.5_gemma-2/2a6af4ce-e45c-4721-a23c-03071a5e774f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_Test01012025155054t0.5_gemma-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Test01012025155054t0.5_gemma-2", - "id": "zelk12/Test01012025155054t0.5_gemma-2", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 3.817 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.283 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, 
- { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/gemma-2-S2MTM-9B/5ae5ddff-714d-4a20-b1d3-3eeb95fd858c.json b/data/hfopenllm_v2/zelk12/gemma-2-S2MTM-9B/5ae5ddff-714d-4a20-b1d3-3eeb95fd858c.json deleted file mode 100644 index 18e9a3529..000000000 --- a/data/hfopenllm_v2/zelk12/gemma-2-S2MTM-9B/5ae5ddff-714d-4a20-b1d3-3eeb95fd858c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_gemma-2-S2MTM-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-S2MTM-9B", - "id": "zelk12/gemma-2-S2MTM-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7823 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6061 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2047 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4218 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4297 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25/60052d34-f6a7-4204-baea-532f5ba29880.json b/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25/60052d34-f6a7-4204-baea-532f5ba29880.json deleted file mode 100644 index 2e6a12b87..000000000 --- a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25/60052d34-f6a7-4204-baea-532f5ba29880.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25", - "id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7707 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75/e1ddd882-f8a1-48d0-bb2a-878f43095895.json b/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75/e1ddd882-f8a1-48d0-bb2a-878f43095895.json deleted file mode 100644 index c498d42d2..000000000 --- a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75/e1ddd882-f8a1-48d0-bb2a-878f43095895.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75", - "id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5995 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1/d2c3edec-38d8-48e3-9f6d-e26a63442af8.json b/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1/d2c3edec-38d8-48e3-9f6d-e26a63442af8.json deleted file mode 100644 index eeaa69af5..000000000 --- a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1/d2c3edec-38d8-48e3-9f6d-e26a63442af8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-Ataraxy-9B-v0.1", - "id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4321 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.2/dcfafe94-dacb-4e7a-9365-8bb39ecb79ec.json b/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.2/dcfafe94-dacb-4e7a-9365-8bb39ecb79ec.json deleted file mode 100644 index d481e65cf..000000000 --- a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.2/dcfafe94-dacb-4e7a-9365-8bb39ecb79ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-Ataraxy-9B-v0.2", - "id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.2", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1/8ca0e602-bf6b-4d15-95c2-a0d47e78ded0.json b/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1/8ca0e602-bf6b-4d15-95c2-a0d47e78ded0.json deleted file mode 100644 index 93f6e8d52..000000000 --- a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1/8ca0e602-bf6b-4d15-95c2-a0d47e78ded0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1", - "id": "zelk12/recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7615 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6099 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4315 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ifable-9B-v0.1/fc262523-dcde-4b45-80ba-2922e66d42c4.json b/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ifable-9B-v0.1/fc262523-dcde-4b45-80ba-2922e66d42c4.json deleted file mode 100644 index 5aa27b1eb..000000000 --- a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ifable-9B-v0.1/fc262523-dcde-4b45-80ba-2922e66d42c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_recoilme-gemma-2-Ifable-9B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-Ifable-9B-v0.1", - "id": "zelk12/recoilme-gemma-2-Ifable-9B-v0.1", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7944 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-psy10k-mental_healt-9B-v0.1/f8d745da-9867-4348-bace-d8052c3b4025.json b/data/hfopenllm_v2/zelk12/recoilme-gemma-2-psy10k-mental_healt-9B-v0.1/f8d745da-9867-4348-bace-d8052c3b4025.json deleted file mode 100644 index 283796808..000000000 --- a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-psy10k-mental_healt-9B-v0.1/f8d745da-9867-4348-bace-d8052c3b4025.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_recoilme-gemma-2-psy10k-mental_healt-9B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-psy10k-mental_healt-9B-v0.1", - "id": "zelk12/recoilme-gemma-2-psy10k-mental_healt-9B-v0.1", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5978 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4295 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4181 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zetasepic/Qwen2.5-32B-Instruct-abliterated-v2/3d410f0f-6b24-4e86-a353-6142c51b1ecc.json 
b/data/hfopenllm_v2/zetasepic/Qwen2.5-32B-Instruct-abliterated-v2/3d410f0f-6b24-4e86-a353-6142c51b1ecc.json deleted file mode 100644 index 165cc965d..000000000 --- a/data/hfopenllm_v2/zetasepic/Qwen2.5-32B-Instruct-abliterated-v2/3d410f0f-6b24-4e86-a353-6142c51b1ecc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zetasepic_Qwen2.5-32B-Instruct-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-32B-Instruct-abliterated-v2", - "id": "zetasepic/Qwen2.5-32B-Instruct-abliterated-v2", - "developer": "zetasepic", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8334 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6934 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5952 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zetasepic/Qwen2.5-72B-Instruct-abliterated/46329fc3-974f-4d04-be9e-ba85b3816efc.json b/data/hfopenllm_v2/zetasepic/Qwen2.5-72B-Instruct-abliterated/46329fc3-974f-4d04-be9e-ba85b3816efc.json deleted file mode 100644 index 
5b3165605..000000000 --- a/data/hfopenllm_v2/zetasepic/Qwen2.5-72B-Instruct-abliterated/46329fc3-974f-4d04-be9e-ba85b3816efc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zetasepic_Qwen2.5-72B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-72B-Instruct-abliterated", - "id": "zetasepic/Qwen2.5-72B-Instruct-abliterated", - "developer": "zetasepic", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7153 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7152 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4719 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5872 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zhengr/MixTAO-7Bx2-MoE-v8.1/b964d0a4-7c44-4ea2-894e-3e1ca30321e0.json b/data/hfopenllm_v2/zhengr/MixTAO-7Bx2-MoE-v8.1/b964d0a4-7c44-4ea2-894e-3e1ca30321e0.json deleted file mode 100644 index b0791265c..000000000 --- a/data/hfopenllm_v2/zhengr/MixTAO-7Bx2-MoE-v8.1/b964d0a4-7c44-4ea2-894e-3e1ca30321e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/zhengr_MixTAO-7Bx2-MoE-v8.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MixTAO-7Bx2-MoE-v8.1", - "id": "zhengr/MixTAO-7Bx2-MoE-v8.1", - "developer": "zhengr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3976 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2847 - } - } - ] -} \ No newline at end of file diff --git a/data/livecodebenchpro/alibaba/qwen3-235b-a22b-thinking-2507/126326f3-6521-45d1-aa14-5c51335c1929.json b/data/livecodebenchpro/alibaba/qwen3-235b-a22b-thinking-2507/126326f3-6521-45d1-aa14-5c51335c1929.json deleted file mode 100644 index f3d8ed859..000000000 --- a/data/livecodebenchpro/alibaba/qwen3-235b-a22b-thinking-2507/126326f3-6521-45d1-aa14-5c51335c1929.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/qwen3-235b-a22b-thinking-2507/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, 
University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "qwen3-235b-a22b-thinking-2507", - "developer": "Alibaba", - "inference_platform": "aliyun", - "id": "alibaba/qwen3-235b-a22b-thinking-2507" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.1267605633802817 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.7605633802816901 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/alibaba/qwen3-30b-a3b/b3f5937a-1489-417b-8162-6c62dea0703d.json b/data/livecodebenchpro/alibaba/qwen3-30b-a3b/b3f5937a-1489-417b-8162-6c62dea0703d.json deleted file mode 100644 index 86221fbb2..000000000 --- a/data/livecodebenchpro/alibaba/qwen3-30b-a3b/b3f5937a-1489-417b-8162-6c62dea0703d.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/qwen3-30b-a3b/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "qwen3-30b-a3b", - "developer": "Alibaba", - "inference_platform": "aliyun", - "id": "alibaba/qwen3-30b-a3b" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 
0.028169014084507043 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.5774647887323944 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/alibaba/qwen3-max/f06d6c4c-b2c4-4c48-9702-f0bf08af62c4.json b/data/livecodebenchpro/alibaba/qwen3-max/f06d6c4c-b2c4-4c48-9702-f0bf08af62c4.json deleted file mode 100644 index f517719a6..000000000 --- a/data/livecodebenchpro/alibaba/qwen3-max/f06d6c4c-b2c4-4c48-9702-f0bf08af62c4.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/alibaba/qwen3-max/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "alibaba/qwen3-max", - "developer": "Alibaba", - "inference_platform": "openrouter", - "id": "alibaba/qwen3-max" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.04225352112676056 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.36619718309859156 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/alibaba/qwen3-next-80b-a3b-thinking/809a1503-a161-4532-afd3-fdbd6551eb63.json b/data/livecodebenchpro/alibaba/qwen3-next-80b-a3b-thinking/809a1503-a161-4532-afd3-fdbd6551eb63.json deleted file mode 100644 index 3255f5bca..000000000 --- a/data/livecodebenchpro/alibaba/qwen3-next-80b-a3b-thinking/809a1503-a161-4532-afd3-fdbd6551eb63.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "livecodebenchpro/qwen3-next-80b-a3b-thinking/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "qwen3-next-80b-a3b-thinking", - "developer": "Alibaba", - "inference_platform": "aliyun", - "id": "alibaba/qwen3-next-80b-a3b-thinking" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.14084507042253522 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.7464788732394366 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/aliyun/qwen3-next-80b-a3b-thinking/808ca8e4-9b14-48ba-bb39-e3b6a5672c80.json b/data/livecodebenchpro/aliyun/qwen3-next-80b-a3b-thinking/808ca8e4-9b14-48ba-bb39-e3b6a5672c80.json deleted file mode 100644 index 78b03c308..000000000 --- a/data/livecodebenchpro/aliyun/qwen3-next-80b-a3b-thinking/808ca8e4-9b14-48ba-bb39-e3b6a5672c80.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/qwen3-next-80b-a3b-thinking/1770683238.099205", - "retrieved_timestamp": "1770683238.099205", - "source_metadata": { - "source_name": "Live Code Bench Pro", - "source_type": "documentation", - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen3-next-80b-a3b-thinking", - "id": "aliyun/qwen3-next-80b-a3b-thinking", - "developer": "aliyun", - "inference_platform": "aliyun" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "Medium Problems", - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0704 - } - }, - { - "evaluation_name": "Easy Problems", - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6901 - } - } - ] -} \ No newline at end of file diff --git a/data/livecodebenchpro/anthropic/claude-3-7-sonnet-20250219/be076445-eb88-49b0-a855-2e0cb1551bab.json b/data/livecodebenchpro/anthropic/claude-3-7-sonnet-20250219/be076445-eb88-49b0-a855-2e0cb1551bab.json deleted file mode 100644 index 6816347c8..000000000 --- a/data/livecodebenchpro/anthropic/claude-3-7-sonnet-20250219/be076445-eb88-49b0-a855-2e0cb1551bab.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/claude-3-7-sonnet-20250219/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "claude-3-7-sonnet-20250219", - "developer": "Anthropic", - "inference_platform": "anthropic", - "id": "anthropic/claude-3-7-sonnet-20250219" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.28169014084507044 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git 
a/data/livecodebenchpro/anthropic/claude-3.7-sonnet/69210faf-04a8-46d4-b92b-94f2ca521c09.json b/data/livecodebenchpro/anthropic/claude-3.7-sonnet/69210faf-04a8-46d4-b92b-94f2ca521c09.json deleted file mode 100644 index 586366674..000000000 --- a/data/livecodebenchpro/anthropic/claude-3.7-sonnet/69210faf-04a8-46d4-b92b-94f2ca521c09.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/anthropic/claude-3.7-sonnet/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "anthropic/claude-3.7-sonnet", - "developer": "Anthropic", - "inference_platform": "openrouter", - "id": "anthropic/claude-3.7-sonnet" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.014084507042253521 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.15492957746478872 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/anthropic/claude-sonnet-4-5-20250929/ed293aa1-f64e-429d-bddf-91a35a4203d1.json b/data/livecodebenchpro/anthropic/claude-sonnet-4-5-20250929/ed293aa1-f64e-429d-bddf-91a35a4203d1.json deleted file mode 100644 index 304dcadde..000000000 --- a/data/livecodebenchpro/anthropic/claude-sonnet-4-5-20250929/ed293aa1-f64e-429d-bddf-91a35a4203d1.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/claude-sonnet-4-5-20250929/1770683238.099205", - "retrieved_timestamp": "1770683238.099205", - "source_metadata": { - "source_name": "Live Code Bench Pro", - "source_type": "documentation", - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "claude-sonnet-4-5-20250929", - "id": "anthropic/claude-sonnet-4-5-20250929", - "developer": "anthropic", - "inference_platform": "anthropic" - }, - "evaluation_results": [ - { - 
"evaluation_name": "Hard Problems", - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "Medium Problems", - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "Easy Problems", - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5352 - } - } - ] -} \ No newline at end of file diff --git a/data/livecodebenchpro/ark/ep-20250603132404-cgpjm/2bddd388-5e9a-423e-8767-37d6f9f69032.json b/data/livecodebenchpro/ark/ep-20250603132404-cgpjm/2bddd388-5e9a-423e-8767-37d6f9f69032.json deleted file mode 100644 index 5c18e44af..000000000 --- a/data/livecodebenchpro/ark/ep-20250603132404-cgpjm/2bddd388-5e9a-423e-8767-37d6f9f69032.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/ep-20250603132404-cgpjm/1770683238.099205", - "retrieved_timestamp": "1770683238.099205", - "source_metadata": { - "source_name": "Live Code Bench Pro", - "source_type": "documentation", - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ep-20250603132404-cgpjm", - "id": "ark/ep-20250603132404-cgpjm", - "developer": "ark", - "inference_platform": "ark" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "Medium Problems", - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0141 - } - }, - { - "evaluation_name": "Easy Problems", - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - 
"https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - } - ] -} \ No newline at end of file diff --git a/data/livecodebenchpro/bytedance/doubao-seed-1-6-thinking-250615/bfd991ca-13e9-4716-b389-11e0d2afe286.json b/data/livecodebenchpro/bytedance/doubao-seed-1-6-thinking-250615/bfd991ca-13e9-4716-b389-11e0d2afe286.json deleted file mode 100644 index 14a043adb..000000000 --- a/data/livecodebenchpro/bytedance/doubao-seed-1-6-thinking-250615/bfd991ca-13e9-4716-b389-11e0d2afe286.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/doubao-seed-1-6-thinking-250615/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "doubao-seed-1-6-thinking-250615", - "developer": "ByteDance", - "inference_platform": "ark", - "id": "bytedance/doubao-seed-1-6-thinking-250615" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.07042253521126761 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.5774647887323944 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/deepseek/chat-v3-0324/b29b7c8e-759e-45fe-a9d3-1054f19af617.json b/data/livecodebenchpro/deepseek/chat-v3-0324/b29b7c8e-759e-45fe-a9d3-1054f19af617.json deleted file mode 100644 index 88ca0a5d0..000000000 --- a/data/livecodebenchpro/deepseek/chat-v3-0324/b29b7c8e-759e-45fe-a9d3-1054f19af617.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/deepseek/chat-v3-0324/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, 
University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "deepseek/chat-v3-0324", - "developer": "DeepSeek", - "inference_platform": "openrouter", - "id": "deepseek/chat-v3-0324" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.19718309859154928 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/deepseek/ep-20250214004308-p7n89/801d2dc6-17e7-47f1-a54f-87b94a59b508.json b/data/livecodebenchpro/deepseek/ep-20250214004308-p7n89/801d2dc6-17e7-47f1-a54f-87b94a59b508.json deleted file mode 100644 index 078e0a459..000000000 --- a/data/livecodebenchpro/deepseek/ep-20250214004308-p7n89/801d2dc6-17e7-47f1-a54f-87b94a59b508.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/ep-20250214004308-p7n89/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "ep-20250214004308-p7n89", - "developer": "DeepSeek", - "inference_platform": "ark", - "id": "deepseek/ep-20250214004308-p7n89" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.014084507042253521 - }, 
- "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.4225352112676056 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/deepseek/ep-20250228232227-z44x5/def0b2e3-cf5f-4dfd-8f1c-827f98d1626a.json b/data/livecodebenchpro/deepseek/ep-20250228232227-z44x5/def0b2e3-cf5f-4dfd-8f1c-827f98d1626a.json deleted file mode 100644 index 4ea07e2df..000000000 --- a/data/livecodebenchpro/deepseek/ep-20250228232227-z44x5/def0b2e3-cf5f-4dfd-8f1c-827f98d1626a.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/ep-20250228232227-z44x5/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "ep-20250228232227-z44x5", - "developer": "DeepSeek", - "inference_platform": "ark", - "id": "deepseek/ep-20250228232227-z44x5" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.1267605633802817 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/deepseek/ep-20250603132404-cgpjm/157dd68b-fcc2-416f-a2c0-c9781020e6af.json b/data/livecodebenchpro/deepseek/ep-20250603132404-cgpjm/157dd68b-fcc2-416f-a2c0-c9781020e6af.json deleted file mode 100644 index 114e45638..000000000 --- a/data/livecodebenchpro/deepseek/ep-20250603132404-cgpjm/157dd68b-fcc2-416f-a2c0-c9781020e6af.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/ep-20250603132404-cgpjm/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "ep-20250603132404-cgpjm", - "developer": "DeepSeek", - "inference_platform": "ark", - "id": "deepseek/ep-20250603132404-cgpjm" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.08450704225352113 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.5774647887323944 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/google/gemini-2.5-flash/174f0e23-84f1-43d0-bcdf-11b83c37025a.json b/data/livecodebenchpro/google/gemini-2.5-flash/174f0e23-84f1-43d0-bcdf-11b83c37025a.json deleted file mode 100644 index 57f7f41bd..000000000 --- a/data/livecodebenchpro/google/gemini-2.5-flash/174f0e23-84f1-43d0-bcdf-11b83c37025a.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/google/gemini-2.5-flash/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "google/gemini-2.5-flash", - "developer": "Google", - "inference_platform": "openrouter", - "id": "google/gemini-2.5-flash" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - 
}, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.028169014084507043 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.38028169014084506 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/google/gemini-2.5-pro/bef7254b-549f-4e6b-b5c8-31b84dc6acda.json b/data/livecodebenchpro/google/gemini-2.5-pro/bef7254b-549f-4e6b-b5c8-31b84dc6acda.json deleted file mode 100644 index a5be78bce..000000000 --- a/data/livecodebenchpro/google/gemini-2.5-pro/bef7254b-549f-4e6b-b5c8-31b84dc6acda.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/gemini-2.5-pro/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "gemini-2.5-pro", - "developer": "Google", - "inference_platform": "google", - "id": "google/gemini-2.5-pro" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.014084507042253521 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.2112676056338028 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.7183098591549296 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/kuaishou/kwaipilot-40b-0604/aa236b03-b81f-431b-b049-7101cea165f2.json 
b/data/livecodebenchpro/kuaishou/kwaipilot-40b-0604/aa236b03-b81f-431b-b049-7101cea165f2.json deleted file mode 100644 index 2cbd5d730..000000000 --- a/data/livecodebenchpro/kuaishou/kwaipilot-40b-0604/aa236b03-b81f-431b-b049-7101cea165f2.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/kwaipilot-40b-0604/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "kwaipilot-40b-0604", - "developer": "Kuaishou", - "inference_platform": "kuaishou", - "id": "kuaishou/kwaipilot-40b-0604" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.07042253521126761 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.056338028169014086 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/meta/llama-4-maverick/abc37028-a362-4e02-8499-1bb7497e0293.json b/data/livecodebenchpro/meta/llama-4-maverick/abc37028-a362-4e02-8499-1bb7497e0293.json deleted file mode 100644 index 949352df3..000000000 --- a/data/livecodebenchpro/meta/llama-4-maverick/abc37028-a362-4e02-8499-1bb7497e0293.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/meta/llama-4-maverick/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "meta/llama-4-maverick", - "developer": "Meta", - "inference_platform": "openrouter", - "id": "meta/llama-4-maverick" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - 
"max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.09859154929577464 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/openai/gpt-4.1/ba46ef91-d157-4984-b3df-ce33d8d97f8e.json b/data/livecodebenchpro/openai/gpt-4.1/ba46ef91-d157-4984-b3df-ce33d8d97f8e.json deleted file mode 100644 index 28d6a0f6c..000000000 --- a/data/livecodebenchpro/openai/gpt-4.1/ba46ef91-d157-4984-b3df-ce33d8d97f8e.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/openai/gpt-4.1/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "openai/gpt-4.1", - "developer": "OpenAI", - "inference_platform": "openrouter", - "id": "openai/gpt-4.1" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.19718309859154928 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - 
"https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/openai/gpt-4o-2024-11-20/e70acf51-30ef-4c20-b7cc-51704d114d70.json b/data/livecodebenchpro/openai/gpt-4o-2024-11-20/e70acf51-30ef-4c20-b7cc-51704d114d70.json deleted file mode 100644 index e67250be3..000000000 --- a/data/livecodebenchpro/openai/gpt-4o-2024-11-20/e70acf51-30ef-4c20-b7cc-51704d114d70.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/openai/gpt-4o-2024-11-20/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "openai/gpt-4o-2024-11-20", - "developer": "OpenAI", - "inference_platform": "openrouter", - "id": "openai/gpt-4o-2024-11-20" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.07042253521126761 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/openai/gpt-5-2025-08-07/0e57aa1f-48c6-42b7-9aee-43a29d21b83f.json b/data/livecodebenchpro/openai/gpt-5-2025-08-07/0e57aa1f-48c6-42b7-9aee-43a29d21b83f.json deleted file mode 100644 index cf3bb7a63..000000000 --- a/data/livecodebenchpro/openai/gpt-5-2025-08-07/0e57aa1f-48c6-42b7-9aee-43a29d21b83f.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/gpt-5-2025-08-07/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "gpt-5-2025-08-07", - "developer": "OpenAI", - "inference_platform": "openai", - "id": "openai/gpt-5-2025-08-07" - }, - "evaluation_results": 
[ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.04225352112676056 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.4084507042253521 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.8873239436619719 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/openai/gpt-5-2025-08-07/de66cc70-b456-4165-a827-5193dd77e84d.json b/data/livecodebenchpro/openai/gpt-5-2025-08-07/de66cc70-b456-4165-a827-5193dd77e84d.json deleted file mode 100644 index 348bade22..000000000 --- a/data/livecodebenchpro/openai/gpt-5-2025-08-07/de66cc70-b456-4165-a827-5193dd77e84d.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/gpt-5-2025-08-07/1770683238.099205", - "retrieved_timestamp": "1770683238.099205", - "source_metadata": { - "source_name": "Live Code Bench Pro", - "source_type": "documentation", - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-5-2025-08-07", - "id": "openai/gpt-5-2025-08-07", - "developer": "openai", - "inference_platform": "openai" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "Medium Problems", - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4085 - } - }, - { - "evaluation_name": "Easy Problems", - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - 
"https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9014 - } - } - ] -} \ No newline at end of file diff --git a/data/livecodebenchpro/openai/gpt-5.2-2025-12-11/e9139c52-ada0-4d1c-ae82-7852aacdb6ea.json b/data/livecodebenchpro/openai/gpt-5.2-2025-12-11/e9139c52-ada0-4d1c-ae82-7852aacdb6ea.json deleted file mode 100644 index 8996fcf9e..000000000 --- a/data/livecodebenchpro/openai/gpt-5.2-2025-12-11/e9139c52-ada0-4d1c-ae82-7852aacdb6ea.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/gpt-5.2-2025-12-11/1770683238.099205", - "retrieved_timestamp": "1770683238.099205", - "source_metadata": { - "source_name": "Live Code Bench Pro", - "source_type": "documentation", - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-5.2-2025-12-11", - "id": "openai/gpt-5.2-2025-12-11", - "developer": "openai", - "inference_platform": "openai" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "Medium Problems", - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5211 - } - }, - { - "evaluation_name": "Easy Problems", - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9014 - } - } - ] -} \ No newline at end of file diff --git a/data/livecodebenchpro/openai/gpt-oss-120b/1dd8c827-72af-4c8f-9ead-989de7105590.json b/data/livecodebenchpro/openai/gpt-oss-120b/1dd8c827-72af-4c8f-9ead-989de7105590.json deleted file mode 100644 index d9a8cbc70..000000000 --- a/data/livecodebenchpro/openai/gpt-oss-120b/1dd8c827-72af-4c8f-9ead-989de7105590.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/openai/gpt-oss-120b/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": 
"third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "openai/gpt-oss-120b", - "developer": "OpenAI", - "inference_platform": "openrouter", - "id": "openai/gpt-oss-120b" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.11267605633802817 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.6619718309859155 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/openai/gpt-oss-20b/ead39f61-b408-42b2-808f-8421a3200c89.json b/data/livecodebenchpro/openai/gpt-oss-20b/ead39f61-b408-42b2-808f-8421a3200c89.json deleted file mode 100644 index fd7123119..000000000 --- a/data/livecodebenchpro/openai/gpt-oss-20b/ead39f61-b408-42b2-808f-8421a3200c89.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/openai/gpt-oss-20b/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "openai/gpt-oss-20b", - "developer": "OpenAI", - "inference_platform": "openrouter", - "id": "openai/gpt-oss-20b" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.056338028169014086 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - 
"https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.5070422535211268 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/openai/o3-2025-04-16/f96bdb35-4d61-4fde-8d91-edf55f13dc03.json b/data/livecodebenchpro/openai/o3-2025-04-16/f96bdb35-4d61-4fde-8d91-edf55f13dc03.json deleted file mode 100644 index 5fc307953..000000000 --- a/data/livecodebenchpro/openai/o3-2025-04-16/f96bdb35-4d61-4fde-8d91-edf55f13dc03.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/o3-2025-04-16/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "o3-2025-04-16", - "developer": "OpenAI", - "inference_platform": "openai", - "id": "openai/o3-2025-04-16" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.22535211267605634 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.7183098591549296 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/openai/o4-mini-2025-04-16/5516f77c-932a-4eaa-ac31-dda9260ce82d.json b/data/livecodebenchpro/openai/o4-mini-2025-04-16/5516f77c-932a-4eaa-ac31-dda9260ce82d.json deleted file mode 100644 index 21df96195..000000000 --- a/data/livecodebenchpro/openai/o4-mini-2025-04-16/5516f77c-932a-4eaa-ac31-dda9260ce82d.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/o4-mini-2025-04-16/1770683238.099205", - "retrieved_timestamp": "1770683238.099205", - 
"source_metadata": { - "source_name": "Live Code Bench Pro", - "source_type": "documentation", - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "o4-mini-2025-04-16", - "id": "openai/o4-mini-2025-04-16", - "developer": "openai", - "inference_platform": "openai" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0143 - } - }, - { - "evaluation_name": "Medium Problems", - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - }, - { - "evaluation_name": "Easy Problems", - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8571 - } - } - ] -} \ No newline at end of file diff --git a/data/livecodebenchpro/openai/o4-mini-2025-04-16/8992cef5-df7e-40a1-b099-331532c3deb0.json b/data/livecodebenchpro/openai/o4-mini-2025-04-16/8992cef5-df7e-40a1-b099-331532c3deb0.json deleted file mode 100644 index 824e5dc57..000000000 --- a/data/livecodebenchpro/openai/o4-mini-2025-04-16/8992cef5-df7e-40a1-b099-331532c3deb0.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/o4-mini-2025-04-16/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "o4-mini-2025-04-16", - "developer": "OpenAI", - "inference_platform": "openai", - "id": "openai/o4-mini-2025-04-16" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.014084507042253521 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.30985915492957744 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.8873239436619719 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/z-ai/glm-4.5/a77c08d6-a782-440c-b545-c60b6169712d.json b/data/livecodebenchpro/z-ai/glm-4.5/a77c08d6-a782-440c-b545-c60b6169712d.json deleted file mode 100644 index 013991ae1..000000000 --- a/data/livecodebenchpro/z-ai/glm-4.5/a77c08d6-a782-440c-b545-c60b6169712d.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/z-ai/glm-4.5/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "z-ai/glm-4.5", - "developer": "Z.AI", - "inference_platform": "openrouter", - "id": "z-ai/glm-4.5" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.028169014084507043 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.1267605633802817 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/reward-bench/0-hero/Matter-0.1-7B-DPO-preview/623bae1f-19e9-47f9-bc7b-80a859218d07.json b/data/reward-bench/0-hero/Matter-0.1-7B-DPO-preview/623bae1f-19e9-47f9-bc7b-80a859218d07.json deleted file mode 100644 index cef912137..000000000 --- 
a/data/reward-bench/0-hero/Matter-0.1-7B-DPO-preview/623bae1f-19e9-47f9-bc7b-80a859218d07.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/0-hero_Matter-0.1-7B-DPO-preview/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "0-hero/Matter-0.1-7B-DPO-preview", - "id": "0-hero/Matter-0.1-7B-DPO-preview", - "developer": "0-hero", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7247 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8939 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5768 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6378 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8854 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5348 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/0-hero/Matter-0.1-7B-boost-DPO-preview/fbba98c5-5d56-4837-9044-d4e5ac610c2c.json b/data/reward-bench/0-hero/Matter-0.1-7B-boost-DPO-preview/fbba98c5-5d56-4837-9044-d4e5ac610c2c.json deleted file mode 100644 index 2e9c3f43d..000000000 --- 
a/data/reward-bench/0-hero/Matter-0.1-7B-boost-DPO-preview/fbba98c5-5d56-4837-9044-d4e5ac610c2c.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/0-hero_Matter-0.1-7B-boost-DPO-preview/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "0-hero/Matter-0.1-7B-boost-DPO-preview", - "id": "0-hero/Matter-0.1-7B-boost-DPO-preview", - "developer": "0-hero", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7448 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9106 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6096 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7135 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8395 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5566 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ahjeong/MMPO_Gemma_7b/dc6e1164-c9d7-4dd5-b8dc-fbc4e3f45011.json b/data/reward-bench/Ahjeong/MMPO_Gemma_7b/dc6e1164-c9d7-4dd5-b8dc-fbc4e3f45011.json deleted file mode 100644 index 89456cf7f..000000000 --- a/data/reward-bench/Ahjeong/MMPO_Gemma_7b/dc6e1164-c9d7-4dd5-b8dc-fbc4e3f45011.json +++ 
/dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ahjeong_MMPO_Gemma_7b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ahjeong/MMPO_Gemma_7b", - "id": "Ahjeong/MMPO_Gemma_7b", - "developer": "Ahjeong", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7587 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7135 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7756 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6831 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3/c62a913b-3101-4ce3-a5c5-a1ac844e55f8.json b/data/reward-bench/Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3/c62a913b-3101-4ce3-a5c5-a1ac844e55f8.json deleted file mode 100644 index f147a68de..000000000 --- a/data/reward-bench/Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3/c62a913b-3101-4ce3-a5c5-a1ac844e55f8.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench/Ahjeong_MMPO_Gemma_7b_gamma1.1_epoch3/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3", - "id": "Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3", - "developer": "Ahjeong", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7652 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9721 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6338 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7635 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7284 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6913 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/3101726d-fd51-436d-8adf-cbdf0d534834.json b/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/3101726d-fd51-436d-8adf-cbdf0d534834.json deleted file mode 100644 index 9aaa4ec32..000000000 --- a/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/3101726d-fd51-436d-8adf-cbdf0d534834.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/anthropic_claude-3-5-sonnet-20240620/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "anthropic/claude-3-5-sonnet-20240620", - "id": "anthropic/claude-3-5-sonnet-20240620", - "developer": "anthropic", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6466 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5683 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8519 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8697 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/f878a52a-fa80-4113-ae7d-0cb11e3ef9fd.json b/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/f878a52a-fa80-4113-ae7d-0cb11e3ef9fd.json deleted file mode 100644 index 4e001bb6c..000000000 --- a/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/f878a52a-fa80-4113-ae7d-0cb11e3ef9fd.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Anthropic_claude-3-5-sonnet-20240620/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anthropic/claude-3-5-sonnet-20240620", - "id": "Anthropic/claude-3-5-sonnet-20240620", - "developer": "Anthropic", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8417 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7401 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8162 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8469 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-3-7-sonnet-20250219/904c6359-bd7b-4448-9f16-bc115d0629c4.json b/data/reward-bench/Anthropic/claude-3-7-sonnet-20250219/904c6359-bd7b-4448-9f16-bc115d0629c4.json deleted file mode 100644 index 47b1297ca..000000000 --- a/data/reward-bench/Anthropic/claude-3-7-sonnet-20250219/904c6359-bd7b-4448-9f16-bc115d0629c4.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/anthropic_claude-3-7-sonnet-20250219/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "anthropic/claude-3-7-sonnet-20250219", - "id": "anthropic/claude-3-7-sonnet-20250219", - "developer": "anthropic", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7539 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5437 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9033 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9212 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6723 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/Anthropic/claude-3-haiku-20240307/49511052-6881-4151-9b46-686c75f73c22.json b/data/reward-bench/Anthropic/claude-3-haiku-20240307/49511052-6881-4151-9b46-686c75f73c22.json deleted file mode 100644 index 6e0f3e1b3..000000000 --- a/data/reward-bench/Anthropic/claude-3-haiku-20240307/49511052-6881-4151-9b46-686c75f73c22.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/anthropic_claude-3-haiku-20240307/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "anthropic/claude-3-haiku-20240307", - "id": "anthropic/claude-3-haiku-20240307", - "developer": "anthropic", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4042 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3552 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.501 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - 
"metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-3-haiku-20240307/b289e2e6-d57b-4a2b-aa61-e2974d193909.json b/data/reward-bench/Anthropic/claude-3-haiku-20240307/b289e2e6-d57b-4a2b-aa61-e2974d193909.json deleted file mode 100644 index 16656cf8a..000000000 --- a/data/reward-bench/Anthropic/claude-3-haiku-20240307/b289e2e6-d57b-4a2b-aa61-e2974d193909.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Anthropic_claude-3-haiku-20240307/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anthropic/claude-3-haiku-20240307", - "id": "Anthropic/claude-3-haiku-20240307", - "developer": "Anthropic", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7289 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9274 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5197 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7953 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score 
(weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6635 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-3-opus-20240229/aeeca919-71a1-42a0-a6d0-6779d77750e6.json b/data/reward-bench/Anthropic/claude-3-opus-20240229/aeeca919-71a1-42a0-a6d0-6779d77750e6.json deleted file mode 100644 index dd51285f1..000000000 --- a/data/reward-bench/Anthropic/claude-3-opus-20240229/aeeca919-71a1-42a0-a6d0-6779d77750e6.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Anthropic_claude-3-opus-20240229/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anthropic/claude-3-opus-20240229", - "id": "Anthropic/claude-3-opus-20240229", - "developer": "Anthropic", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8008 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9469 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6031 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8662 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7868 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-3-opus-20240229/db29538d-f40e-42d0-b3c0-e622f92112d2.json 
b/data/reward-bench/Anthropic/claude-3-opus-20240229/db29538d-f40e-42d0-b3c0-e622f92112d2.json deleted file mode 100644 index 1c912c7c4..000000000 --- a/data/reward-bench/Anthropic/claude-3-opus-20240229/db29538d-f40e-42d0-b3c0-e622f92112d2.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/anthropic_claude-3-opus-20240229/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "anthropic/claude-3-opus-20240229", - "id": "anthropic/claude-3-opus-20240229", - "developer": "anthropic", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5744 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5389 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5137 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8378 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6646 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5601 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-3-sonnet-20240229/ab0cdc4f-47dd-4dcc-b506-982ce3924105.json b/data/reward-bench/Anthropic/claude-3-sonnet-20240229/ab0cdc4f-47dd-4dcc-b506-982ce3924105.json deleted file mode 100644 index 3721b0f48..000000000 --- a/data/reward-bench/Anthropic/claude-3-sonnet-20240229/ab0cdc4f-47dd-4dcc-b506-982ce3924105.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Anthropic_claude-3-sonnet-20240229/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anthropic/claude-3-sonnet-20240229", - "id": "Anthropic/claude-3-sonnet-20240229", - "developer": "Anthropic", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7458 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9344 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5658 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8169 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6907 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6963 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-opus-4-20250514/44da63b6-d934-4330-bc20-33464bae61dd.json b/data/reward-bench/Anthropic/claude-opus-4-20250514/44da63b6-d934-4330-bc20-33464bae61dd.json deleted file mode 100644 index 8dffb9139..000000000 --- a/data/reward-bench/Anthropic/claude-opus-4-20250514/44da63b6-d934-4330-bc20-33464bae61dd.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/anthropic_claude-opus-4-20250514/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "anthropic/claude-opus-4-20250514", - "id": "anthropic/claude-opus-4-20250514", - "developer": "anthropic", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7648 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8267 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7491 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8954 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.8616 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-sonnet-4-20250514/c930cbe0-f429-4b61-9abe-86dcb7266cf7.json b/data/reward-bench/Anthropic/claude-sonnet-4-20250514/c930cbe0-f429-4b61-9abe-86dcb7266cf7.json deleted file mode 100644 index 45d756c32..000000000 --- a/data/reward-bench/Anthropic/claude-sonnet-4-20250514/c930cbe0-f429-4b61-9abe-86dcb7266cf7.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/anthropic_claude-sonnet-4-20250514/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "anthropic/claude-sonnet-4-20250514", - "id": "anthropic/claude-sonnet-4-20250514", - "developer": "anthropic", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7117 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3594 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7049 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.8909 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7939 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/AtlaAI/Selene-1-Mini-Llama-3.1-8B/c84b27b2-2dd9-48ee-9a53-ec27ae62ae7a.json b/data/reward-bench/AtlaAI/Selene-1-Mini-Llama-3.1-8B/c84b27b2-2dd9-48ee-9a53-ec27ae62ae7a.json deleted file mode 100644 index e1c13041b..000000000 --- a/data/reward-bench/AtlaAI/Selene-1-Mini-Llama-3.1-8B/c84b27b2-2dd9-48ee-9a53-ec27ae62ae7a.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/AtlaAI_Selene-1-Mini-Llama-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AtlaAI/Selene-1-Mini-Llama-3.1-8B", - "id": "AtlaAI/Selene-1-Mini-Llama-3.1-8B", - "developer": "AtlaAI", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8913 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9358 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7939 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8926 - }, - "source_data": { - "dataset_name": 
"RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9429 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/AtlaAI/Selene-1/73ee9408-e669-4b8a-9419-76bd6051ce8d.json b/data/reward-bench/AtlaAI/Selene-1/73ee9408-e669-4b8a-9419-76bd6051ce8d.json deleted file mode 100644 index e90407d26..000000000 --- a/data/reward-bench/AtlaAI/Selene-1/73ee9408-e669-4b8a-9419-76bd6051ce8d.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/AtlaAI_Selene-1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AtlaAI/Selene-1", - "id": "AtlaAI/Selene-1", - "developer": "AtlaAI", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9241 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9777 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8399 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9216 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9572 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/0deed2f4-770e-4033-a65d-e1da19e00611.json 
b/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/0deed2f4-770e-4033-a65d-e1da19e00611.json deleted file mode 100644 index dd6dc0bf7..000000000 --- a/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/0deed2f4-770e-4033-a65d-e1da19e00611.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/CIR-AMS_BTRM_Qwen2_7b_0613/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CIR-AMS/BTRM_Qwen2_7b_0613", - "id": "CIR-AMS/BTRM_Qwen2_7b_0613", - "developer": "CIR-AMS", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5736 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5347 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7178 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5737 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6527 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/e727cb77-f229-4aaa-909f-99c7aa06676b.json b/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/e727cb77-f229-4aaa-909f-99c7aa06676b.json deleted file mode 100644 index 7179fab0e..000000000 --- a/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/e727cb77-f229-4aaa-909f-99c7aa06676b.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/CIR-AMS_BTRM_Qwen2_7b_0613/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CIR-AMS/BTRM_Qwen2_7b_0613", - "id": "CIR-AMS/BTRM_Qwen2_7b_0613", - "developer": "CIR-AMS", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8172 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9749 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5724 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9014 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8775 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.7029 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/CohereForAI/c4ai-command-r-plus/da9264cd-2fa3-4121-81de-eef994e15993.json b/data/reward-bench/CohereForAI/c4ai-command-r-plus/da9264cd-2fa3-4121-81de-eef994e15993.json deleted file mode 100644 index 10fb8ed6c..000000000 --- a/data/reward-bench/CohereForAI/c4ai-command-r-plus/da9264cd-2fa3-4121-81de-eef994e15993.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/CohereForAI_c4ai-command-r-plus/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CohereForAI/c4ai-command-r-plus", - "id": "CohereForAI/c4ai-command-r-plus", - "developer": "CohereForAI", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7057 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9511 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5986 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6924 - }, - "source_data": { - "dataset_name": "RewardBench", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/LMUnit-llama3.1-70b/79cc5cd4-bfed-466d-9fbe-2f27e8aab175.json b/data/reward-bench/ContextualAI/LMUnit-llama3.1-70b/79cc5cd4-bfed-466d-9fbe-2f27e8aab175.json deleted file mode 100644 index ff7ea07a0..000000000 --- a/data/reward-bench/ContextualAI/LMUnit-llama3.1-70b/79cc5cd4-bfed-466d-9fbe-2f27e8aab175.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/ContextualAI_LMUnit-llama3.1-70b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/LMUnit-llama3.1-70b", - "id": "ContextualAI/LMUnit-llama3.1-70b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8054 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8463 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7158 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9067 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9697 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/LMUnit-qwen2.5-72b/28c35831-679d-489a-b2c4-fd2c7f333fbc.json b/data/reward-bench/ContextualAI/LMUnit-qwen2.5-72b/28c35831-679d-489a-b2c4-fd2c7f333fbc.json deleted file mode 100644 index 8597afb51..000000000 --- a/data/reward-bench/ContextualAI/LMUnit-qwen2.5-72b/28c35831-679d-489a-b2c4-fd2c7f333fbc.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/ContextualAI_LMUnit-qwen2.5-72b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/LMUnit-qwen2.5-72b", - "id": "ContextualAI/LMUnit-qwen2.5-72b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8208 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8716 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5437 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7268 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9133 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9677 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9014 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama13b/9db7907d-7b22-480c-86a5-f88ec2b302e7.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_llama13b/9db7907d-7b22-480c-86a5-f88ec2b302e7.json deleted file mode 100644 index 4aa411ea6..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama13b/9db7907d-7b22-480c-86a5-f88ec2b302e7.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_llama13b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-dpo_llama13b", - "id": "ContextualAI/archangel_sft-dpo_llama13b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7123 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5649 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": 
"Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5656 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama30b/2faddf79-41e6-47e9-9c26-17bc987bc870.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_llama30b/2faddf79-41e6-47e9-9c26-17bc987bc870.json deleted file mode 100644 index fefd98c33..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama30b/2faddf79-41e6-47e9-9c26-17bc987bc870.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_llama30b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-dpo_llama30b", - "id": "ContextualAI/archangel_sft-dpo_llama30b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5618 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6927 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4474 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - 
"evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4745 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5705 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama7b/20989a47-6556-4e3b-8909-d0a419cb159b.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_llama7b/20989a47-6556-4e3b-8909-d0a419cb159b.json deleted file mode 100644 index 69dc72884..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama7b/20989a47-6556-4e3b-8909-d0a419cb159b.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_llama7b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-dpo_llama7b", - "id": "ContextualAI/archangel_sft-dpo_llama7b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5304 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5782 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5203 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - 
includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5658 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5544 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia1-4b/f3d0010f-efed-4f87-9582-b9c87b4de99a.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia1-4b/f3d0010f-efed-4f87-9582-b9c87b4de99a.json deleted file mode 100644 index c640a7456..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia1-4b/f3d0010f-efed-4f87-9582-b9c87b4de99a.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_pythia1-4b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-dpo_pythia1-4b", - "id": "ContextualAI/archangel_sft-dpo_pythia1-4b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5233 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6397 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5041 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math 
subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5672 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5427 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia12-0b/a0ce3ed6-2a2c-46ad-be86-6f6701533e36.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia12-0b/a0ce3ed6-2a2c-46ad-be86-6f6701533e36.json deleted file mode 100644 index ed72e23b9..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia12-0b/a0ce3ed6-2a2c-46ad-be86-6f6701533e36.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_pythia12-0b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-dpo_pythia12-0b", - "id": "ContextualAI/archangel_sft-dpo_pythia12-0b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5009 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6676 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5432 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5303 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia2-8b/d54c4830-23c8-4c12-aea1-4f5b5245464f.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia2-8b/d54c4830-23c8-4c12-aea1-4f5b5245464f.json deleted file mode 100644 index 10908a053..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia2-8b/d54c4830-23c8-4c12-aea1-4f5b5245464f.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_pythia2-8b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-dpo_pythia2-8b", - "id": "ContextualAI/archangel_sft-dpo_pythia2-8b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5286 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8073 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5135 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5501 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia6-9b/b5853278-edd9-4bc8-bbeb-d6dab515b562.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia6-9b/b5853278-edd9-4bc8-bbeb-d6dab515b562.json deleted file mode 100644 index 40f3a091a..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia6-9b/b5853278-edd9-4bc8-bbeb-d6dab515b562.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_pythia6-9b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-dpo_pythia6-9b", - "id": "ContextualAI/archangel_sft-dpo_pythia6-9b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7486 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5176 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.551 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_llama13b/74188e30-1e49-47d8-af01-b80e430dafa0.json b/data/reward-bench/ContextualAI/archangel_sft-kto_llama13b/74188e30-1e49-47d8-af01-b80e430dafa0.json deleted file mode 100644 index 22b4b63bd..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_llama13b/74188e30-1e49-47d8-af01-b80e430dafa0.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_llama13b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-kto_llama13b", - "id": "ContextualAI/archangel_sft-kto_llama13b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5952 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8408 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3772 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.7077 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.576 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_llama30b/93974286-0497-46a2-a2e8-404c1e89dba0.json b/data/reward-bench/ContextualAI/archangel_sft-kto_llama30b/93974286-0497-46a2-a2e8-404c1e89dba0.json deleted file mode 100644 index ca6ff0f55..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_llama30b/93974286-0497-46a2-a2e8-404c1e89dba0.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_llama30b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-kto_llama30b", - "id": "ContextualAI/archangel_sft-kto_llama30b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5901 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8436 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4057 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6054 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - }, - 
"source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5862 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_llama7b/02c0020c-7d69-4701-a606-4bc79ad87afd.json b/data/reward-bench/ContextualAI/archangel_sft-kto_llama7b/02c0020c-7d69-4701-a606-4bc79ad87afd.json deleted file mode 100644 index ac5acffb2..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_llama7b/02c0020c-7d69-4701-a606-4bc79ad87afd.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_llama7b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-kto_llama7b", - "id": "ContextualAI/archangel_sft-kto_llama7b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5388 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5587 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4364 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4568 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6941 - }, - "source_data": { - "dataset_name": 
"RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5575 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia1-4b/5dcb7c54-64e7-4f76-8903-8f57b35cdb0c.json b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia1-4b/5dcb7c54-64e7-4f76-8903-8f57b35cdb0c.json deleted file mode 100644 index 36044c710..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia1-4b/5dcb7c54-64e7-4f76-8903-8f57b35cdb0c.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_pythia1-4b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-kto_pythia1-4b", - "id": "ContextualAI/archangel_sft-kto_pythia1-4b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6844 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5257 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6447 - }, - "source_data": { - "dataset_name": "RewardBench", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5546 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia12-0b/4887256e-0545-40dd-9756-ff850e003a29.json b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia12-0b/4887256e-0545-40dd-9756-ff850e003a29.json deleted file mode 100644 index 16ed21233..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia12-0b/4887256e-0545-40dd-9756-ff850e003a29.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_pythia12-0b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-kto_pythia12-0b", - "id": "ContextualAI/archangel_sft-kto_pythia12-0b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5053 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7486 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4127 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia2-8b/d2b70870-9cbc-4666-bbd4-097fcebe716e.json b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia2-8b/d2b70870-9cbc-4666-bbd4-097fcebe716e.json deleted file mode 100644 index 4c1047aa1..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia2-8b/d2b70870-9cbc-4666-bbd4-097fcebe716e.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_pythia2-8b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-kto_pythia2-8b", - "id": "ContextualAI/archangel_sft-kto_pythia2-8b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5497 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4743 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6216 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.557 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia6-9b/f420f432-2291-40a9-8ebd-b91241970113.json b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia6-9b/f420f432-2291-40a9-8ebd-b91241970113.json deleted file mode 100644 index 521c30c11..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia6-9b/f420f432-2291-40a9-8ebd-b91241970113.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_pythia6-9b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-kto_pythia6-9b", - "id": "ContextualAI/archangel_sft-kto_pythia6-9b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5561 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7765 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, 
- { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5723 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Databricks-Mosaic-Research/PGRM/02e68d1b-86f3-4344-ad8d-45df878b744c.json b/data/reward-bench/Databricks-Mosaic-Research/PGRM/02e68d1b-86f3-4344-ad8d-45df878b744c.json deleted file mode 100644 index 5d22f5c2e..000000000 --- a/data/reward-bench/Databricks-Mosaic-Research/PGRM/02e68d1b-86f3-4344-ad8d-45df878b744c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Databricks-Mosaic-Research_PGRM/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Databricks-Mosaic-Research/PGRM", - "id": "Databricks-Mosaic-Research/PGRM", - "developer": "Databricks-Mosaic-Research", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8002 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7404 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9289 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - 
} - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9424 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8893 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/HFXM/RAMO-Llama3.1-8B/f712ab4a-1127-44ba-b6b9-7a40290f3322.json b/data/reward-bench/HFXM/RAMO-Llama3.1-8B/f712ab4a-1127-44ba-b6b9-7a40290f3322.json deleted file mode 100644 index 7ccfc23f1..000000000 --- a/data/reward-bench/HFXM/RAMO-Llama3.1-8B/f712ab4a-1127-44ba-b6b9-7a40290f3322.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/HFXM_RAMO-Llama3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HFXM/RAMO-Llama3.1-8B", - "id": "HFXM/RAMO-Llama3.1-8B", - "developer": "HFXM", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6917 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6547 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5628 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": 
"Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9756 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9071 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6752 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/HuggingFaceH4/starchat2-15b-v0.1/b4175f0f-f9f4-4418-b4aa-a31e7f1f93f4.json b/data/reward-bench/HuggingFaceH4/starchat2-15b-v0.1/b4175f0f-f9f4-4418-b4aa-a31e7f1f93f4.json deleted file mode 100644 index a6b1abca8..000000000 --- a/data/reward-bench/HuggingFaceH4/starchat2-15b-v0.1/b4175f0f-f9f4-4418-b4aa-a31e7f1f93f4.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/HuggingFaceH4_starchat2-15b-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HuggingFaceH4/starchat2-15b-v0.1", - "id": "HuggingFaceH4/starchat2-15b-v0.1", - "developer": "HuggingFaceH4", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7322 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9385 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5548 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7095 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8159 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/HuggingFaceH4/zephyr-7b-alpha/9879e9a7-ddbc-4338-abc7-e3bc394869e9.json b/data/reward-bench/HuggingFaceH4/zephyr-7b-alpha/9879e9a7-ddbc-4338-abc7-e3bc394869e9.json deleted file mode 100644 index b313fb87e..000000000 --- a/data/reward-bench/HuggingFaceH4/zephyr-7b-alpha/9879e9a7-ddbc-4338-abc7-e3bc394869e9.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/HuggingFaceH4_zephyr-7b-alpha/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HuggingFaceH4/zephyr-7b-alpha", - "id": "HuggingFaceH4/zephyr-7b-alpha", - "developer": "HuggingFaceH4", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7392 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9162 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7662 - }, 
- "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7514 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5353 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/HuggingFaceH4/zephyr-7b-beta/d7d8a5cb-e295-4ced-b528-d99d814ff008.json b/data/reward-bench/HuggingFaceH4/zephyr-7b-beta/d7d8a5cb-e295-4ced-b528-d99d814ff008.json deleted file mode 100644 index 7d0709109..000000000 --- a/data/reward-bench/HuggingFaceH4/zephyr-7b-beta/d7d8a5cb-e295-4ced-b528-d99d814ff008.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/HuggingFaceH4_zephyr-7b-beta/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HuggingFaceH4/zephyr-7b-beta", - "id": "HuggingFaceH4/zephyr-7b-beta", - "developer": "HuggingFaceH4", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7281 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6272 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6568 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7789 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5216 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/HuggingFaceH4/zephyr-7b-gemma-v0.1/bff86a1f-71c3-4f27-aeae-bba6d03635ef.json b/data/reward-bench/HuggingFaceH4/zephyr-7b-gemma-v0.1/bff86a1f-71c3-4f27-aeae-bba6d03635ef.json deleted file mode 100644 index 89a96432c..000000000 --- a/data/reward-bench/HuggingFaceH4/zephyr-7b-gemma-v0.1/bff86a1f-71c3-4f27-aeae-bba6d03635ef.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/HuggingFaceH4_zephyr-7b-gemma-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HuggingFaceH4/zephyr-7b-gemma-v0.1", - "id": "HuggingFaceH4/zephyr-7b-gemma-v0.1", - "developer": "HuggingFaceH4", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6758 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4956 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5824 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - 
"metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7463 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5171 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/IDEA-CCNL/Ziya-LLaMA-7B-Reward/723281f8-54b7-4db6-8253-5a6dcf4f3d4a.json b/data/reward-bench/IDEA-CCNL/Ziya-LLaMA-7B-Reward/723281f8-54b7-4db6-8253-5a6dcf4f3d4a.json deleted file mode 100644 index 73199f618..000000000 --- a/data/reward-bench/IDEA-CCNL/Ziya-LLaMA-7B-Reward/723281f8-54b7-4db6-8253-5a6dcf4f3d4a.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/IDEA-CCNL_Ziya-LLaMA-7B-Reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IDEA-CCNL/Ziya-LLaMA-7B-Reward", - "id": "IDEA-CCNL/Ziya-LLaMA-7B-Reward", - "developer": "IDEA-CCNL", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6378 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8687 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4605 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6405 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and 
math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5775 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6461 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/LxzGordon/URM-LLaMa-3-8B/0ce7dc54-f608-4985-9904-75cee09b6288.json b/data/reward-bench/LxzGordon/URM-LLaMa-3-8B/0ce7dc54-f608-4985-9904-75cee09b6288.json deleted file mode 100644 index 2f1093e30..000000000 --- a/data/reward-bench/LxzGordon/URM-LLaMa-3-8B/0ce7dc54-f608-4985-9904-75cee09b6288.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/LxzGordon_URM-LLaMa-3-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LxzGordon/URM-LLaMa-3-8B", - "id": "LxzGordon/URM-LLaMa-3-8B", - "developer": "LxzGordon", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8991 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7873 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8824 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.9574 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/5bb0aaa4-2cc5-4622-8235-993bc4178f12.json b/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/5bb0aaa4-2cc5-4622-8235-993bc4178f12.json deleted file mode 100644 index d56005468..000000000 --- a/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/5bb0aaa4-2cc5-4622-8235-993bc4178f12.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LxzGordon/URM-LLaMa-3.1-8B", - "id": "LxzGordon/URM-LLaMa-3.1-8B", - "developer": "LxzGordon", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9294 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9553 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8816 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9108 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9698 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/85ab22b8-0587-4e2b-857f-3d6d84d571a4.json b/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/85ab22b8-0587-4e2b-857f-3d6d84d571a4.json deleted file mode 100644 index a57e5caa3..000000000 --- a/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/85ab22b8-0587-4e2b-857f-3d6d84d571a4.json +++ /dev/null @@ -1,148 +0,0 @@ 
-{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LxzGordon/URM-LLaMa-3.1-8B", - "id": "LxzGordon/URM-LLaMa-3.1-8B", - "developer": "LxzGordon", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7394 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6884 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6393 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9178 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9758 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7653 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-8B/37aa6702-b2fa-43bf-b5a9-36740f627217.json b/data/reward-bench/NCSOFT/Llama-3-OffsetBias-8B/37aa6702-b2fa-43bf-b5a9-36740f627217.json deleted file mode 100644 index a9c1eb53c..000000000 --- a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-8B/37aa6702-b2fa-43bf-b5a9-36740f627217.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/NCSOFT_Llama-3-OffsetBias-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NCSOFT/Llama-3-OffsetBias-8B", - "id": "NCSOFT/Llama-3-OffsetBias-8B", - "developer": "NCSOFT", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8397 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9246 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8026 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8676 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7639 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/57f48d0c-e424-410d-b9ee-4707e2add036.json b/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/57f48d0c-e424-410d-b9ee-4707e2add036.json deleted file mode 100644 index f38bf29f7..000000000 --- a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/57f48d0c-e424-410d-b9ee-4707e2add036.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/NCSOFT_Llama-3-OffsetBias-RM-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - 
"source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NCSOFT/Llama-3-OffsetBias-RM-8B", - "id": "NCSOFT/Llama-3-OffsetBias-RM-8B", - "developer": "NCSOFT", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8942 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9721 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.818 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8676 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9192 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/8643b4dd-e18c-442c-adb5-84ef756534f8.json b/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/8643b4dd-e18c-442c-adb5-84ef756534f8.json deleted file mode 100644 index ec0f3756c..000000000 --- a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/8643b4dd-e18c-442c-adb5-84ef756534f8.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/NCSOFT_Llama-3-OffsetBias-RM-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NCSOFT/Llama-3-OffsetBias-RM-8B", - "id": "NCSOFT/Llama-3-OffsetBias-RM-8B", - "developer": "NCSOFT", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.648 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6786 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Nexusflow/Starling-RM-34B/2f3d2e46-1f9e-4b1c-9729-ab0a93cc245c.json b/data/reward-bench/Nexusflow/Starling-RM-34B/2f3d2e46-1f9e-4b1c-9729-ab0a93cc245c.json deleted file mode 100644 index 3b7921590..000000000 --- a/data/reward-bench/Nexusflow/Starling-RM-34B/2f3d2e46-1f9e-4b1c-9729-ab0a93cc245c.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Nexusflow_Starling-RM-34B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": 
"documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nexusflow/Starling-RM-34B", - "id": "Nexusflow/Starling-RM-34B", - "developer": "Nexusflow", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8133 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5724 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8845 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7137 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Nexusflow/Starling-RM-34B/4aec78d3-a38c-48e0-b9e2-b6dc063bd37e.json b/data/reward-bench/Nexusflow/Starling-RM-34B/4aec78d3-a38c-48e0-b9e2-b6dc063bd37e.json deleted file mode 100644 index a2c665080..000000000 --- a/data/reward-bench/Nexusflow/Starling-RM-34B/4aec78d3-a38c-48e0-b9e2-b6dc063bd37e.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Nexusflow_Starling-RM-34B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": 
"https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nexusflow/Starling-RM-34B", - "id": "Nexusflow/Starling-RM-34B", - "developer": "Nexusflow", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4553 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4589 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3187 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7556 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1004 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/NousResearch/Hermes-3-Llama-3.1-70B/f9b60945-8b14-4564-9d44-3eb6db675ab9.json b/data/reward-bench/NousResearch/Hermes-3-Llama-3.1-70B/f9b60945-8b14-4564-9d44-3eb6db675ab9.json deleted file mode 100644 index 734675894..000000000 --- 
a/data/reward-bench/NousResearch/Hermes-3-Llama-3.1-70B/f9b60945-8b14-4564-9d44-3eb6db675ab9.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/NousResearch_Hermes-3-Llama-3.1-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NousResearch/Hermes-3-Llama-3.1-70B", - "id": "NousResearch/Hermes-3-Llama-3.1-70B", - "developer": "NousResearch", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7847 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9623 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5669 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7867 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/56703c11-eccb-4f66-af13-60f972a5068f.json b/data/reward-bench/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/56703c11-eccb-4f66-af13-60f972a5068f.json deleted file mode 100644 index 0e770043c..000000000 --- a/data/reward-bench/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/56703c11-eccb-4f66-af13-60f972a5068f.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/NousResearch_Nous-Hermes-2-Mistral-7B-DPO/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - 
"source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NousResearch/Nous-Hermes-2-Mistral-7B-DPO", - "id": "NousResearch/Nous-Hermes-2-Mistral-7B-DPO", - "developer": "NousResearch", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7481 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9218 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6053 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8243 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7375 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/fbd8be7e-5670-4729-a77d-83472510b734.json b/data/reward-bench/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/fbd8be7e-5670-4729-a77d-83472510b734.json deleted file mode 100644 index 99623d8ff..000000000 --- a/data/reward-bench/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/fbd8be7e-5670-4729-a77d-83472510b734.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - 
"source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO", - "id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO", - "developer": "NousResearch", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7138 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9162 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6053 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8149 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6126 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/2e18ee77-9c46-4cf9-9521-303ad15e5be4.json b/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/2e18ee77-9c46-4cf9-9521-303ad15e5be4.json deleted file mode 100644 index 31e05eb58..000000000 --- a/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/2e18ee77-9c46-4cf9-9521-303ad15e5be4.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - 
"source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1", - "id": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1", - "developer": "OpenAssistant", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9246 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5446 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5855 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6801 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/ec5b296e-03e8-4371-a8c1-eca0b0b9759d.json b/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/ec5b296e-03e8-4371-a8c1-eca0b0b9759d.json deleted file mode 100644 index dfed08cd8..000000000 --- a/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/ec5b296e-03e8-4371-a8c1-eca0b0b9759d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for 
AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1", - "id": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1", - "developer": "OpenAssistant", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2653 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.377 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1535 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.047 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/07b61a55-a8e3-4a6f-9806-a4100f8d5297.json b/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/07b61a55-a8e3-4a6f-9806-a4100f8d5297.json deleted file mode 100644 index 
85e007109..000000000 --- a/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/07b61a55-a8e3-4a6f-9806-a4100f8d5297.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5", - "id": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5", - "developer": "OpenAssistant", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6901 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8855 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4868 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6311 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7752 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6533 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/3d534c25-5016-44de-9c47-24b7d7399b0f.json b/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/3d534c25-5016-44de-9c47-24b7d7399b0f.json 
deleted file mode 100644 index 38eca68e4..000000000 --- a/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/3d534c25-5016-44de-9c47-24b7d7399b0f.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5", - "id": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5", - "developer": "OpenAssistant", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2648 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3179 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3934 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3244 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2707 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0198 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/4de91433-05b3-4f88-9d0f-66691c671f62.json b/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/4de91433-05b3-4f88-9d0f-66691c671f62.json deleted file mode 100644 index 365667594..000000000 --- a/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/4de91433-05b3-4f88-9d0f-66691c671f62.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/OpenAssistant_reward-model-deberta-v3-large-v2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenAssistant/reward-model-deberta-v3-large-v2", - "id": "OpenAssistant/reward-model-deberta-v3-large-v2", - "developer": "OpenAssistant", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.32 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2687 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5027 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus 
score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.12 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/dc71f1ba-f4b8-4231-ac72-0acf9a22d73e.json b/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/dc71f1ba-f4b8-4231-ac72-0acf9a22d73e.json deleted file mode 100644 index 4712b7f92..000000000 --- a/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/dc71f1ba-f4b8-4231-ac72-0acf9a22d73e.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/OpenAssistant_reward-model-deberta-v3-large-v2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenAssistant/reward-model-deberta-v3-large-v2", - "id": "OpenAssistant/reward-model-deberta-v3-large-v2", - "developer": "OpenAssistant", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6126 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8939 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4518 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7338 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3855 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5836 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/36c4adc9-c2fb-4bc3-81ba-88478d30332e.json b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/36c4adc9-c2fb-4bc3-81ba-88478d30332e.json deleted file mode 100644 index ae927c59e..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/36c4adc9-c2fb-4bc3-81ba-88478d30332e.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v1.0-cost/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v1.0-cost", - "id": "PKU-Alignment/beaver-7b-v1.0-cost", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5798 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6173 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7351 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5482 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/f0827b15-20d0-4986-b5a0-bb4bc9be768e.json b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/f0827b15-20d0-4986-b5a0-bb4bc9be768e.json deleted file mode 100644 index 9e4c95ee2..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/f0827b15-20d0-4986-b5a0-bb4bc9be768e.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v1.0-cost/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v1.0-cost", - "id": "PKU-Alignment/beaver-7b-v1.0-cost", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3332 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2313 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3989 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7589 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2939 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -0.01 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/aeaa8b33-e327-4c65-9641-5dfc63feee3b.json b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/aeaa8b33-e327-4c65-9641-5dfc63feee3b.json deleted file mode 100644 index fc41926be..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/aeaa8b33-e327-4c65-9641-5dfc63feee3b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v1.0-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - 
"source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v1.0-reward", - "id": "PKU-Alignment/beaver-7b-v1.0-reward", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1606 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2105 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2938 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0646 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -0.01 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/c97c79f3-fd92-49db-9131-5e45834a7eaf.json 
b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/c97c79f3-fd92-49db-9131-5e45834a7eaf.json deleted file mode 100644 index d06d08a3c..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/c97c79f3-fd92-49db-9131-5e45834a7eaf.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v1.0-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v1.0-reward", - "id": "PKU-Alignment/beaver-7b-v1.0-reward", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8184 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2873 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5993 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/687099cb-c1bf-49ec-a902-329c2b818369.json 
b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/687099cb-c1bf-49ec-a902-329c2b818369.json deleted file mode 100644 index 3868ab64e..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/687099cb-c1bf-49ec-a902-329c2b818369.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v2.0-cost/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v2.0-cost", - "id": "PKU-Alignment/beaver-7b-v2.0-cost", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3326 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3333 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7356 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2828 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -0.01 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/8da4f5eb-6264-4503-b9bc-fcf843b638be.json b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/8da4f5eb-6264-4503-b9bc-fcf843b638be.json deleted file mode 100644 index 4af7bc7a5..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/8da4f5eb-6264-4503-b9bc-fcf843b638be.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v2.0-cost/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v2.0-cost", - "id": "PKU-Alignment/beaver-7b-v2.0-cost", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5957 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5726 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4561 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7608 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6211 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5397 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/28a68b87-5412-4374-9e61-896b0fff7669.json b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/28a68b87-5412-4374-9e61-896b0fff7669.json deleted file mode 100644 index 376b686c0..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/28a68b87-5412-4374-9e61-896b0fff7669.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v2.0-reward", - "id": "PKU-Alignment/beaver-7b-v2.0-reward", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2544 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2168 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2562 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2606 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/3209c869-03c5-4801-8e4b-4c8bcde3d58f.json b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/3209c869-03c5-4801-8e4b-4c8bcde3d58f.json deleted file mode 100644 index 49fb24c89..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/3209c869-03c5-4801-8e4b-4c8bcde3d58f.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v2.0-reward", - "id": "PKU-Alignment/beaver-7b-v2.0-reward", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6366 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8994 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6041 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.6887 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6171 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022.../9d1e124c-e133-41d3-8ac7-5c8c5027aa02.json b/data/reward-bench/PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022.../9d1e124c-e133-41d3-8ac7-5c8c5027aa02.json deleted file mode 100644 index f9da09026..000000000 --- a/data/reward-bench/PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022.../9d1e124c-e133-41d3-8ac7-5c8c5027aa02.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/PoLL_gpt-3.5-turbo-0125_claude-3-sonnet-2024022.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022...", - "id": "PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022...", - "developer": "PoLL", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7578 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8034 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.7346 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/Qwen1.5-0.5B-Chat/633d499b-58bd-4fca-9b56-0f005a5a21b8.json b/data/reward-bench/Qwen/Qwen1.5-0.5B-Chat/633d499b-58bd-4fca-9b56-0f005a5a21b8.json deleted file mode 100644 index 8753b5ea8..000000000 --- a/data/reward-bench/Qwen/Qwen1.5-0.5B-Chat/633d499b-58bd-4fca-9b56-0f005a5a21b8.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Qwen_Qwen1.5-0.5B-Chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/Qwen1.5-0.5B-Chat", - "id": "Qwen/Qwen1.5-0.5B-Chat", - "developer": "Qwen", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5298 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3547 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6294 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5703 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5984 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4629 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/Qwen1.5-1.8B-Chat/5c4f3caf-6af3-48c6-83e2-4710d31e6acf.json b/data/reward-bench/Qwen/Qwen1.5-1.8B-Chat/5c4f3caf-6af3-48c6-83e2-4710d31e6acf.json deleted file mode 100644 index 48dfa65fd..000000000 --- a/data/reward-bench/Qwen/Qwen1.5-1.8B-Chat/5c4f3caf-6af3-48c6-83e2-4710d31e6acf.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Qwen_Qwen1.5-1.8B-Chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/Qwen1.5-1.8B-Chat", - "id": "Qwen/Qwen1.5-1.8B-Chat", - "developer": "Qwen", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5615 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6031 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4838 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7793 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4453 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/Qwen1.5-14B-Chat/77d1edc1-fb54-4371-bf7c-baebbb351163.json 
b/data/reward-bench/Qwen/Qwen1.5-14B-Chat/77d1edc1-fb54-4371-bf7c-baebbb351163.json deleted file mode 100644 index f34eee3d4..000000000 --- a/data/reward-bench/Qwen/Qwen1.5-14B-Chat/77d1edc1-fb54-4371-bf7c-baebbb351163.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Qwen_Qwen1.5-14B-Chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/Qwen1.5-14B-Chat", - "id": "Qwen/Qwen1.5-14B-Chat", - "developer": "Qwen", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6864 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5726 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7018 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7122 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8961 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/Qwen1.5-4B-Chat/e7eecdb0-bc17-4d9f-b3e8-9ee777d2f595.json b/data/reward-bench/Qwen/Qwen1.5-4B-Chat/e7eecdb0-bc17-4d9f-b3e8-9ee777d2f595.json deleted file mode 100644 index 85d507824..000000000 --- 
a/data/reward-bench/Qwen/Qwen1.5-4B-Chat/e7eecdb0-bc17-4d9f-b3e8-9ee777d2f595.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Qwen_Qwen1.5-4B-Chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/Qwen1.5-4B-Chat", - "id": "Qwen/Qwen1.5-4B-Chat", - "developer": "Qwen", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5477 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6272 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5568 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6689 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.447 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/Qwen1.5-72B-Chat/3f3915b3-0d6e-451c-9185-fa4372b93f2b.json b/data/reward-bench/Qwen/Qwen1.5-72B-Chat/3f3915b3-0d6e-451c-9185-fa4372b93f2b.json deleted file mode 100644 index f3cc894c3..000000000 --- a/data/reward-bench/Qwen/Qwen1.5-72B-Chat/3f3915b3-0d6e-451c-9185-fa4372b93f2b.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "reward-bench/Qwen_Qwen1.5-72B-Chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/Qwen1.5-72B-Chat", - "id": "Qwen/Qwen1.5-72B-Chat", - "developer": "Qwen", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6723 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6229 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6601 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8554 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/Qwen1.5-7B-Chat/e534d37b-3009-4a7d-82d8-d7c85b95649e.json b/data/reward-bench/Qwen/Qwen1.5-7B-Chat/e534d37b-3009-4a7d-82d8-d7c85b95649e.json deleted file mode 100644 index 2373972cd..000000000 --- a/data/reward-bench/Qwen/Qwen1.5-7B-Chat/e534d37b-3009-4a7d-82d8-d7c85b95649e.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Qwen_Qwen1.5-7B-Chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": 
"RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/Qwen1.5-7B-Chat", - "id": "Qwen/Qwen1.5-7B-Chat", - "developer": "Qwen", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6908 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6919 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9041 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4288 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/Qwen1.5-MoE-A2.7B-Chat/bd8f0ed1-75fc-48c1-996e-655d205c027c.json b/data/reward-bench/Qwen/Qwen1.5-MoE-A2.7B-Chat/bd8f0ed1-75fc-48c1-996e-655d205c027c.json deleted file mode 100644 index 7daa3735e..000000000 --- a/data/reward-bench/Qwen/Qwen1.5-MoE-A2.7B-Chat/bd8f0ed1-75fc-48c1-996e-655d205c027c.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Qwen_Qwen1.5-MoE-A2.7B-Chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": 
"https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/Qwen1.5-MoE-A2.7B-Chat", - "id": "Qwen/Qwen1.5-MoE-A2.7B-Chat", - "developer": "Qwen", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6644 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7291 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6316 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.774 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/WorldPM-72B/e9effaf6-e48b-4b35-b035-430be81b316b.json b/data/reward-bench/Qwen/WorldPM-72B/e9effaf6-e48b-4b35-b035-430be81b316b.json deleted file mode 100644 index 6ee54b6e7..000000000 --- a/data/reward-bench/Qwen/WorldPM-72B/e9effaf6-e48b-4b35-b035-430be81b316b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Qwen_WorldPM-72B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/WorldPM-72B", - "id": "Qwen/WorldPM-72B", - "developer": 
"Qwen", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6333 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7074 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8533 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9172 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-32B/d2132eea-eb88-41e5-b8e6-2e8e8a623ed1.json b/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-32B/d2132eea-eb88-41e5-b8e6-2e8e8a623ed1.json deleted file mode 100644 index d48e6bfec..000000000 --- a/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-32B/d2132eea-eb88-41e5-b8e6-2e8e8a623ed1.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/R-I-S-E_RISE-Judge-Qwen2.5-32B/1766412838.146816", - "retrieved_timestamp": 
"1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "R-I-S-E/RISE-Judge-Qwen2.5-32B", - "id": "R-I-S-E/RISE-Judge-Qwen2.5-32B", - "developer": "R-I-S-E", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9266 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8333 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9189 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9877 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-7B/ffd05bc7-3724-40ba-85b9-c25ebe71fba2.json b/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-7B/ffd05bc7-3724-40ba-85b9-c25ebe71fba2.json deleted file mode 100644 index 2418db79b..000000000 --- a/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-7B/ffd05bc7-3724-40ba-85b9-c25ebe71fba2.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/R-I-S-E_RISE-Judge-Qwen2.5-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "R-I-S-E/RISE-Judge-Qwen2.5-7B", - "id": "R-I-S-E/RISE-Judge-Qwen2.5-7B", - "developer": "R-I-S-E", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - 
"metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8819 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9218 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7654 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8797 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9608 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/43f0e93d-f0b8-46af-a549-e1ac315d96ea.json b/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/43f0e93d-f0b8-46af-a549-e1ac315d96ea.json deleted file mode 100644 index 7370b5a14..000000000 --- a/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/43f0e93d-f0b8-46af-a549-e1ac315d96ea.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/RLHFlow_ArmoRM-Llama3-8B-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RLHFlow/ArmoRM-Llama3-8B-v0.1", - "id": "RLHFlow/ArmoRM-Llama3-8B-v0.1", - "developer": "RLHFlow", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6646 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual 
accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6568 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7657 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6629 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/9ccab7bd-d2ed-4ab3-ad81-656650c29a3b.json b/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/9ccab7bd-d2ed-4ab3-ad81-656650c29a3b.json deleted file mode 100644 index 41532e2cf..000000000 --- a/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/9ccab7bd-d2ed-4ab3-ad81-656650c29a3b.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/RLHFlow_ArmoRM-Llama3-8B-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RLHFlow/ArmoRM-Llama3-8B-v0.1", - "id": "RLHFlow/ArmoRM-Llama3-8B-v0.1", - "developer": "RLHFlow", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7675 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9054 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9735 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7429 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/RLHFlow/LLaMA3-iterative-DPO-final/c10d4213-f1fa-41e6-92d9-0d5337c1362b.json b/data/reward-bench/RLHFlow/LLaMA3-iterative-DPO-final/c10d4213-f1fa-41e6-92d9-0d5337c1362b.json deleted file mode 100644 index e8c6cfc5b..000000000 --- a/data/reward-bench/RLHFlow/LLaMA3-iterative-DPO-final/c10d4213-f1fa-41e6-92d9-0d5337c1362b.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/RLHFlow_LLaMA3-iterative-DPO-final/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RLHFlow/LLaMA3-iterative-DPO-final", - "id": "RLHFlow/LLaMA3-iterative-DPO-final", - "developer": "RLHFlow", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.6783 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5921 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7865 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6161 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/RLHFlow/RewardModel-Mistral-7B-for-DPA-v1/63b08ba0-eeb9-48ae-a5d1-d7d3792aa1c0.json b/data/reward-bench/RLHFlow/RewardModel-Mistral-7B-for-DPA-v1/63b08ba0-eeb9-48ae-a5d1-d7d3792aa1c0.json deleted file mode 100644 index 35d7a9ebf..000000000 --- a/data/reward-bench/RLHFlow/RewardModel-Mistral-7B-for-DPA-v1/63b08ba0-eeb9-48ae-a5d1-d7d3792aa1c0.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/RLHFlow_RewardModel-Mistral-7B-for-DPA-v1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RLHFlow/RewardModel-Mistral-7B-for-DPA-v1", - "id": "RLHFlow/RewardModel-Mistral-7B-for-DPA-v1", - "developer": "RLHFlow", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6633 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8799 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7068 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5971 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6068 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/RLHFlow/pair-preference-model-LLaMA3-8B/d724076d-509f-4ad4-894c-976b0472de85.json b/data/reward-bench/RLHFlow/pair-preference-model-LLaMA3-8B/d724076d-509f-4ad4-894c-976b0472de85.json deleted file mode 100644 index ee0769a51..000000000 --- a/data/reward-bench/RLHFlow/pair-preference-model-LLaMA3-8B/d724076d-509f-4ad4-894c-976b0472de85.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/RLHFlow_pair-preference-model-LLaMA3-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RLHFlow/pair-preference-model-LLaMA3-8B", - "id": "RLHFlow/pair-preference-model-LLaMA3-8B", - "developer": "RLHFlow", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8575 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9832 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6579 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8973 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9473 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7458 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-Gemma-2B-rewardmodel-ft/54d34f25-1cd9-4995-8e56-c36981842fc8.json b/data/reward-bench/Ray2333/GRM-Gemma-2B-rewardmodel-ft/54d34f25-1cd9-4995-8e56-c36981842fc8.json deleted file mode 100644 index 05c7b0ff2..000000000 --- a/data/reward-bench/Ray2333/GRM-Gemma-2B-rewardmodel-ft/54d34f25-1cd9-4995-8e56-c36981842fc8.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_GRM-Gemma-2B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-Gemma-2B-rewardmodel-ft", - "id": "Ray2333/GRM-Gemma-2B-rewardmodel-ft", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8447 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8939 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7522 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8446 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8881 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-Gemma-2B-sftreg/63ae1c75-fd4d-4f40-afd0-b9f91d700014.json b/data/reward-bench/Ray2333/GRM-Gemma-2B-sftreg/63ae1c75-fd4d-4f40-afd0-b9f91d700014.json deleted file mode 100644 index 8a5814b51..000000000 --- a/data/reward-bench/Ray2333/GRM-Gemma-2B-sftreg/63ae1c75-fd4d-4f40-afd0-b9f91d700014.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_GRM-Gemma-2B-sftreg/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-Gemma-2B-sftreg", - "id": "Ray2333/GRM-Gemma-2B-sftreg", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7451 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9553 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4868 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7932 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7684 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6983 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/1d5ebbce-8cfe-446b-82c0-a227d4e9247f.json b/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/1d5ebbce-8cfe-446b-82c0-a227d4e9247f.json deleted file mode 100644 index ae24803ba..000000000 --- a/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/1d5ebbce-8cfe-446b-82c0-a227d4e9247f.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_GRM-Llama3-8B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-Llama3-8B-rewardmodel-ft", - "id": "Ray2333/GRM-Llama3-8B-rewardmodel-ft", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9154 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9553 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8618 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9081 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9362 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/3f9c81ac-5c76-43b4-a27d-7eaa055139c4.json b/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/3f9c81ac-5c76-43b4-a27d-7eaa055139c4.json deleted file mode 100644 index 2f035232b..000000000 --- a/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/3f9c81ac-5c76-43b4-a27d-7eaa055139c4.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Ray2333_GRM-Llama3-8B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-Llama3-8B-rewardmodel-ft", - "id": "Ray2333/GRM-Llama3-8B-rewardmodel-ft", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6766 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6274 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5847 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8929 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6824 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/680098fb-76cf-47b6-a0ea-a1a06ca46dca.json b/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/680098fb-76cf-47b6-a0ea-a1a06ca46dca.json deleted file mode 100644 index fa7e8dccb..000000000 --- a/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/680098fb-76cf-47b6-a0ea-a1a06ca46dca.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - 
"source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-gemma2-2B-rewardmodel-ft", - "id": "Ray2333/GRM-gemma2-2B-rewardmodel-ft", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5966 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5305 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5902 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7455 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4788 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/6ec21338-9908-4ce4-a1f2-dac14c5e27ab.json 
b/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/6ec21338-9908-4ce4-a1f2-dac14c5e27ab.json deleted file mode 100644 index 79c0a560c..000000000 --- a/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/6ec21338-9908-4ce4-a1f2-dac14c5e27ab.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-gemma2-2B-rewardmodel-ft", - "id": "Ray2333/GRM-gemma2-2B-rewardmodel-ft", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8839 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9302 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7719 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9216 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.912 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-llama3-8B-distill/592ad1e3-8a48-4c39-8013-81d7c731780f.json b/data/reward-bench/Ray2333/GRM-llama3-8B-distill/592ad1e3-8a48-4c39-8013-81d7c731780f.json deleted file mode 100644 index 3c94abda8..000000000 --- a/data/reward-bench/Ray2333/GRM-llama3-8B-distill/592ad1e3-8a48-4c39-8013-81d7c731780f.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_GRM-llama3-8B-distill/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - 
"source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-llama3-8B-distill", - "id": "Ray2333/GRM-llama3-8B-distill", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8464 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9832 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6842 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8676 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9133 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7209 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-llama3-8B-distill/5b36f0af-7ff6-4564-9714-08fbf41d261f.json b/data/reward-bench/Ray2333/GRM-llama3-8B-distill/5b36f0af-7ff6-4564-9714-08fbf41d261f.json deleted file mode 100644 index 7a518a591..000000000 --- a/data/reward-bench/Ray2333/GRM-llama3-8B-distill/5b36f0af-7ff6-4564-9714-08fbf41d261f.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Ray2333_GRM-llama3-8B-distill/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - 
"source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-llama3-8B-distill", - "id": "Ray2333/GRM-llama3-8B-distill", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5874 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5902 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6727 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5743 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/04f120c6-b648-4c83-81d8-05118efb0904.json b/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/04f120c6-b648-4c83-81d8-05118efb0904.json deleted file mode 100644 index fd63fed4b..000000000 --- 
a/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/04f120c6-b648-4c83-81d8-05118efb0904.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-llama3-8B-sftreg", - "id": "Ray2333/GRM-llama3-8B-sftreg", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8542 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.986 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6776 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8919 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9229 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7309 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/c907e494-ab2e-4a28-a28d-aeb68eb818ed.json b/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/c907e494-ab2e-4a28-a28d-aeb68eb818ed.json deleted file mode 100644 index c42486675..000000000 --- a/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/c907e494-ab2e-4a28-a28d-aeb68eb818ed.json +++ 
/dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-llama3-8B-sftreg", - "id": "Ray2333/GRM-llama3-8B-sftreg", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6089 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6189 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7867 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6828 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5981 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-llama3.2-3B-rewardmodel-ft/d9eed240-ebbe-482f-8dae-c5251ed6d067.json b/data/reward-bench/Ray2333/GRM-llama3.2-3B-rewardmodel-ft/d9eed240-ebbe-482f-8dae-c5251ed6d067.json deleted file mode 100644 index ef4f104a0..000000000 --- a/data/reward-bench/Ray2333/GRM-llama3.2-3B-rewardmodel-ft/d9eed240-ebbe-482f-8dae-c5251ed6d067.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_GRM-llama3.2-3B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-llama3.2-3B-rewardmodel-ft", - "id": "Ray2333/GRM-llama3.2-3B-rewardmodel-ft", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9092 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9162 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8487 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-baseline/670865e1-f219-465b-9fbe-6da6f73ac9e6.json b/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-baseline/670865e1-f219-465b-9fbe-6da6f73ac9e6.json deleted file mode 100644 index 429660d5b..000000000 --- a/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-baseline/670865e1-f219-465b-9fbe-6da6f73ac9e6.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "reward-bench/Ray2333_Gemma-2B-rewardmodel-baseline/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/Gemma-2B-rewardmodel-baseline", - "id": "Ray2333/Gemma-2B-rewardmodel-baseline", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.729 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9413 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7865 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7384 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6897 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-ft/88953298-b63e-499f-a31e-f0f586c4772d.json b/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-ft/88953298-b63e-499f-a31e-f0f586c4772d.json deleted file mode 100644 index f7eece540..000000000 --- a/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-ft/88953298-b63e-499f-a31e-f0f586c4772d.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench/Ray2333_Gemma-2B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/Gemma-2B-rewardmodel-ft", - "id": "Ray2333/Gemma-2B-rewardmodel-ft", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8048 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7793 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7478 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8527 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8393 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/reward-model-Mistral-7B-instruct-Unifie.../3acb690c-ffc0-4e67-8ae1-e79bcee4f824.json b/data/reward-bench/Ray2333/reward-model-Mistral-7B-instruct-Unifie.../3acb690c-ffc0-4e67-8ae1-e79bcee4f824.json deleted file mode 100644 index 9a4e578d4..000000000 --- a/data/reward-bench/Ray2333/reward-model-Mistral-7B-instruct-Unifie.../3acb690c-ffc0-4e67-8ae1-e79bcee4f824.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_reward-model-Mistral-7B-instruct-Unifie.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/reward-model-Mistral-7B-instruct-Unifie...", 
- "id": "Ray2333/reward-model-Mistral-7B-instruct-Unifie...", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7661 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9777 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8527 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7389 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7434 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/SF-Foundation/TextEval-Llama3.1-70B/6ad2cb6a-f9a3-424e-aed2-9493899872e3.json b/data/reward-bench/SF-Foundation/TextEval-Llama3.1-70B/6ad2cb6a-f9a3-424e-aed2-9493899872e3.json deleted file mode 100644 index c50e15fdc..000000000 --- a/data/reward-bench/SF-Foundation/TextEval-Llama3.1-70B/6ad2cb6a-f9a3-424e-aed2-9493899872e3.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/SF-Foundation_TextEval-Llama3.1-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SF-Foundation/TextEval-Llama3.1-70B", - "id": 
"SF-Foundation/TextEval-Llama3.1-70B", - "developer": "SF-Foundation", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9348 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9413 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9013 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9324 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9641 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/SF-Foundation/TextEval-OffsetBias-12B/1892bf75-916b-4d4f-96ab-fda36872ae5d.json b/data/reward-bench/SF-Foundation/TextEval-OffsetBias-12B/1892bf75-916b-4d4f-96ab-fda36872ae5d.json deleted file mode 100644 index b71080064..000000000 --- a/data/reward-bench/SF-Foundation/TextEval-OffsetBias-12B/1892bf75-916b-4d4f-96ab-fda36872ae5d.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/SF-Foundation_TextEval-OffsetBias-12B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SF-Foundation/TextEval-OffsetBias-12B", - "id": "SF-Foundation/TextEval-OffsetBias-12B", - "developer": "SF-Foundation", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9105 - }, - "source_data": { - "dataset_name": "RewardBench", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.919 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8662 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9203 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9365 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Salesforce/SFR-LLaMa-3.1-70B-Judge-r/e06e1863-c28f-4c96-a672-b1073c80aa71.json b/data/reward-bench/Salesforce/SFR-LLaMa-3.1-70B-Judge-r/e06e1863-c28f-4c96-a672-b1073c80aa71.json deleted file mode 100644 index 49c043587..000000000 --- a/data/reward-bench/Salesforce/SFR-LLaMa-3.1-70B-Judge-r/e06e1863-c28f-4c96-a672-b1073c80aa71.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Salesforce_SFR-LLaMa-3.1-70B-Judge-r/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Salesforce/SFR-LLaMa-3.1-70B-Judge-r", - "id": "Salesforce/SFR-LLaMa-3.1-70B-Judge-r", - "developer": "Salesforce", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9272 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - 
"evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8476 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9162 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Salesforce/SFR-LLaMa-3.1-8B-Judge-r/d923f7aa-a9d4-406a-b5d7-bdab508f04f7.json b/data/reward-bench/Salesforce/SFR-LLaMa-3.1-8B-Judge-r/d923f7aa-a9d4-406a-b5d7-bdab508f04f7.json deleted file mode 100644 index deced96e1..000000000 --- a/data/reward-bench/Salesforce/SFR-LLaMa-3.1-8B-Judge-r/d923f7aa-a9d4-406a-b5d7-bdab508f04f7.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Salesforce_SFR-LLaMa-3.1-8B-Judge-r/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Salesforce/SFR-LLaMa-3.1-8B-Judge-r", - "id": "Salesforce/SFR-LLaMa-3.1-8B-Judge-r", - "developer": "Salesforce", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8865 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9553 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7774 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": 
"Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8622 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9513 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Salesforce/SFR-nemo-12B-Judge-r/5c5e40b1-e86a-4d30-b93c-f8f9e73cdca8.json b/data/reward-bench/Salesforce/SFR-nemo-12B-Judge-r/5c5e40b1-e86a-4d30-b93c-f8f9e73cdca8.json deleted file mode 100644 index 616e9bc30..000000000 --- a/data/reward-bench/Salesforce/SFR-nemo-12B-Judge-r/5c5e40b1-e86a-4d30-b93c-f8f9e73cdca8.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Salesforce_SFR-nemo-12B-Judge-r/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Salesforce/SFR-nemo-12B-Judge-r", - "id": "Salesforce/SFR-nemo-12B-Judge-r", - "developer": "Salesforce", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9027 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9721 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8224 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8649 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9513 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Schrieffer/Llama-SARM-4B/59299d8c-e468-490f-8a52-eef49b0aaeea.json b/data/reward-bench/Schrieffer/Llama-SARM-4B/59299d8c-e468-490f-8a52-eef49b0aaeea.json deleted file mode 100644 index 4492a4262..000000000 --- a/data/reward-bench/Schrieffer/Llama-SARM-4B/59299d8c-e468-490f-8a52-eef49b0aaeea.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Schrieffer_Llama-SARM-4B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Schrieffer/Llama-SARM-4B", - "id": "Schrieffer/Llama-SARM-4B", - "developer": "Schrieffer", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7379 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6874 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4281 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9178 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.9556 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7939 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/3ce9612f-9b57-476e-9fa4-6e63f14568a7.json b/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/3ce9612f-9b57-476e-9fa4-6e63f14568a7.json deleted file mode 100644 index 6723992e6..000000000 --- a/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/3ce9612f-9b57-476e-9fa4-6e63f14568a7.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ShikaiChen_LDL-Reward-Gemma-2-27B-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1", - "id": "ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1", - "developer": "ShikaiChen", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9499 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9079 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9378 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9903 - }, - "source_data": { - 
"dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/9c605bf1-2533-43db-a610-e71c0aaecdb5.json b/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/9c605bf1-2533-43db-a610-e71c0aaecdb5.json deleted file mode 100644 index e51beb588..000000000 --- a/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/9c605bf1-2533-43db-a610-e71c0aaecdb5.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/ShikaiChen_LDL-Reward-Gemma-2-27B-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1", - "id": "ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1", - "developer": "ShikaiChen", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7249 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7558 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9131 - }, - "source_data": { - 
"dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7633 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-70B/c289f778-92b8-44df-a079-3bced33c8ab5.json b/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-70B/c289f778-92b8-44df-a079-3bced33c8ab5.json deleted file mode 100644 index 7f469a316..000000000 --- a/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-70B/c289f778-92b8-44df-a079-3bced33c8ab5.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Skywork_Skywork-Critic-Llama-3.1-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Critic-Llama-3.1-70B", - "id": "Skywork/Skywork-Critic-Llama-3.1-70B", - "developer": "Skywork", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9331 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8794 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9311 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9554 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-8B/329d4101-e740-490c-9fbc-1708f76a2f61.json b/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-8B/329d4101-e740-490c-9fbc-1708f76a2f61.json deleted file mode 100644 index cf3327493..000000000 --- a/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-8B/329d4101-e740-490c-9fbc-1708f76a2f61.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Skywork_Skywork-Critic-Llama-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Critic-Llama-3.1-8B", - "id": "Skywork/Skywork-Critic-Llama-3.1-8B", - "developer": "Skywork", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8896 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9358 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8136 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9108 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/3e87f52e-b136-4cb3-8cbb-d8d8a8571051.json b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/3e87f52e-b136-4cb3-8cbb-d8d8a8571051.json deleted file mode 100644 index 4ac0f4414..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/3e87f52e-b136-4cb3-8cbb-d8d8a8571051.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2", - "id": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7531 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6721 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9689 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9172 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8182 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/62b9adca-db38-46c0-a68a-ed7a8e735035.json b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/62b9adca-db38-46c0-a68a-ed7a8e735035.json deleted file mode 100644 index 5c04a1152..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/62b9adca-db38-46c0-a68a-ed7a8e735035.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2", - "id": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9426 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9609 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8991 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9297 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9807 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/4d2f43eb-e6f3-4686-a9d9-6b6c6b68b86c.json b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/4d2f43eb-e6f3-4686-a9d9-6b6c6b68b86c.json deleted file mode 100644 index 08b4c8323..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/4d2f43eb-e6f3-4686-a9d9-6b6c6b68b86c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Gemma-2-27B", - "id": "Skywork/Skywork-Reward-Gemma-2-27B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7576 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7368 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7049 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9323 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8261 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/830df3fd-d479-4af8-a92b-93d82e804fec.json b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/830df3fd-d479-4af8-a92b-93d82e804fec.json deleted file mode 100644 index 22de7e431..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/830df3fd-d479-4af8-a92b-93d82e804fec.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Gemma-2-27B", - "id": "Skywork/Skywork-Reward-Gemma-2-27B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9145 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9189 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9606 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/0e6d85b8-aa37-448c-adb2-0da2bd13e322.json b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/0e6d85b8-aa37-448c-adb2-0da2bd13e322.json deleted file mode 100644 index e5a811527..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/0e6d85b8-aa37-448c-adb2-0da2bd13e322.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", - "id": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9313 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9469 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8838 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9675 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/45f0bd9c-e939-4b83-a623-1db61f431500.json b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/45f0bd9c-e939-4b83-a623-1db61f431500.json deleted file mode 100644 index 1941ebc04..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/45f0bd9c-e939-4b83-a623-1db61f431500.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", - "id": 
"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6968 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6011 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9414 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7169 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/0f710903-7dd8-44ea-914d-d43bbfe894f1.json b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/0f710903-7dd8-44ea-914d-d43bbfe894f1.json deleted file mode 100644 index ba30f4f9f..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/0f710903-7dd8-44ea-914d-d43bbfe894f1.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/Skywork_Skywork-Reward-Llama-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Llama-3.1-8B", - "id": "Skywork/Skywork-Reward-Llama-3.1-8B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7314 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6989 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9333 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9616 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/b9ddd960-f6f7-4962-8297-88ec7fbbbd1f.json b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/b9ddd960-f6f7-4962-8297-88ec7fbbbd1f.json deleted file mode 100644 index 03903b4ec..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/b9ddd960-f6f7-4962-8297-88ec7fbbbd1f.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Llama-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Llama-3.1-8B", - "id": "Skywork/Skywork-Reward-Llama-3.1-8B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9252 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8728 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9081 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.1-8B/25a4520b-c780-45fc-a00f-36db1776c6a8.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.1-8B/25a4520b-c780-45fc-a00f-36db1776c6a8.json deleted file mode 100644 index b19a61534..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.1-8B/25a4520b-c780-45fc-a00f-36db1776c6a8.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/Skywork_Skywork-Reward-V2-Llama-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-V2-Llama-3.1-8B", - "id": "Skywork/Skywork-Reward-V2-Llama-3.1-8B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8413 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8463 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.776 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9838 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8124 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-1B/96d7e5c1-2f43-4f09-9702-0af090afa141.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-1B/96d7e5c1-2f43-4f09-9702-0af090afa141.json deleted file mode 100644 index 2ff90cff2..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-1B/96d7e5c1-2f43-4f09-9702-0af090afa141.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Llama-3.2-1B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-V2-Llama-3.2-1B", - "id": "Skywork/Skywork-Reward-V2-Llama-3.2-1B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6438 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6011 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8733 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8929 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - 
"evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4306 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-3B/5a47f8bd-401a-4b6b-91b0-9593b36e5996.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-3B/5a47f8bd-401a-4b6b-91b0-9593b36e5996.json deleted file mode 100644 index 9f8069f50..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-3B/5a47f8bd-401a-4b6b-91b0-9593b36e5996.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Llama-3.2-3B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-V2-Llama-3.2-3B", - "id": "Skywork/Skywork-Reward-V2-Llama-3.2-3B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7466 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7621 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9311 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - 
} - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6768 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-0.6B/c27e98d4-f5ea-48f9-babc-3ccda2d21d2a.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-0.6B/c27e98d4-f5ea-48f9-babc-3ccda2d21d2a.json deleted file mode 100644 index 44ea9887d..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-0.6B/c27e98d4-f5ea-48f9-babc-3ccda2d21d2a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Qwen3-0.6B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-V2-Qwen3-0.6B", - "id": "Skywork/Skywork-Reward-V2-Qwen3-0.6B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7158 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8444 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7949 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3397 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-1.7B/060bf847-e7b5-4e30-934f-5306d01c499a.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-1.7B/060bf847-e7b5-4e30-934f-5306d01c499a.json deleted file mode 100644 index f670ad051..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-1.7B/060bf847-e7b5-4e30-934f-5306d01c499a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Qwen3-1.7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - 
"source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-V2-Qwen3-1.7B", - "id": "Skywork/Skywork-Reward-V2-Qwen3-1.7B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6818 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6568 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7268 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8911 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8848 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4872 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-4B/e648e6c2-18bb-49d7-b08f-47ce41a67d4f.json 
b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-4B/e648e6c2-18bb-49d7-b08f-47ce41a67d4f.json deleted file mode 100644 index 6f6900c68..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-4B/e648e6c2-18bb-49d7-b08f-47ce41a67d4f.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Qwen3-4B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-V2-Qwen3-4B", - "id": "Skywork/Skywork-Reward-V2-Qwen3-4B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7551 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7737 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7322 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9657 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6743 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-8B/537e92cb-25db-47f5-916a-6f666e14639a.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-8B/537e92cb-25db-47f5-916a-6f666e14639a.json deleted file mode 100644 index 1c01babb8..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-8B/537e92cb-25db-47f5-916a-6f666e14639a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Qwen3-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-V2-Qwen3-8B", - "id": "Skywork/Skywork-Reward-V2-Qwen3-8B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7837 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7989 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7705 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9636 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7294 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-VL-Reward-7B/e59ca33f-c6ce-44d4-9cb4-2fd65608313b.json b/data/reward-bench/Skywork/Skywork-VL-Reward-7B/e59ca33f-c6ce-44d4-9cb4-2fd65608313b.json deleted file mode 100644 index 47757e3b6..000000000 --- a/data/reward-bench/Skywork/Skywork-VL-Reward-7B/e59ca33f-c6ce-44d4-9cb4-2fd65608313b.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Skywork_Skywork-VL-Reward-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-VL-Reward-7B", - "id": "Skywork/Skywork-VL-Reward-7B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9007 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8994 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9108 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.9176 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-VL-Reward-7B/fc99848b-82c7-459e-8327-1867a332ff28.json b/data/reward-bench/Skywork/Skywork-VL-Reward-7B/fc99848b-82c7-459e-8327-1867a332ff28.json deleted file mode 100644 index adb50e622..000000000 --- a/data/reward-bench/Skywork/Skywork-VL-Reward-7B/fc99848b-82c7-459e-8327-1867a332ff28.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-VL-Reward-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-VL-Reward-7B", - "id": "Skywork/Skywork-VL-Reward-7B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6885 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8911 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8909 - }, - "source_data": { - 
"dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7586 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/SultanR/SmolTulu-1.7b-RM/357f4f03-9542-495f-b575-4274111bbe1f.json b/data/reward-bench/SultanR/SmolTulu-1.7b-RM/357f4f03-9542-495f-b575-4274111bbe1f.json deleted file mode 100644 index 16bfc7b82..000000000 --- a/data/reward-bench/SultanR/SmolTulu-1.7b-RM/357f4f03-9542-495f-b575-4274111bbe1f.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/SultanR_SmolTulu-1.7b-RM/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SultanR/SmolTulu-1.7b-RM", - "id": "SultanR/SmolTulu-1.7b-RM", - "developer": "SultanR", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5716 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2821 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git 
a/data/reward-bench/ZiyiYe/Con-J-Qwen2-7B/d78c42d6-fc0d-4719-bbb6-7a53dbb0d017.json b/data/reward-bench/ZiyiYe/Con-J-Qwen2-7B/d78c42d6-fc0d-4719-bbb6-7a53dbb0d017.json deleted file mode 100644 index 5ae0638d6..000000000 --- a/data/reward-bench/ZiyiYe/Con-J-Qwen2-7B/d78c42d6-fc0d-4719-bbb6-7a53dbb0d017.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ZiyiYe_Con-J-Qwen2-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZiyiYe/Con-J-Qwen2-7B", - "id": "ZiyiYe/Con-J-Qwen2-7B", - "developer": "ZiyiYe", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8712 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.919 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8026 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8824 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8808 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/llama-2-chat-7b-nectar-3.8m.json/c94ddbe5-2bc0-4a33-b06b-10671fb22b70.json b/data/reward-bench/ai2/llama-2-chat-7b-nectar-3.8m.json/c94ddbe5-2bc0-4a33-b06b-10671fb22b70.json deleted file mode 100644 index 400642b9b..000000000 --- a/data/reward-bench/ai2/llama-2-chat-7b-nectar-3.8m.json/c94ddbe5-2bc0-4a33-b06b-10671fb22b70.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_llama-2-chat-7b-nectar-3.8m.json/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - 
"source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/llama-2-chat-7b-nectar-3.8m.json", - "id": "ai2/llama-2-chat-7b-nectar-3.8m.json", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5843 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8631 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2654 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6243 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/llama-2-chat-nectar-180k.json/cc2ac405-1710-46fa-aeba-dd86797c666c.json b/data/reward-bench/ai2/llama-2-chat-nectar-180k.json/cc2ac405-1710-46fa-aeba-dd86797c666c.json deleted file mode 100644 index 26ba58fae..000000000 --- a/data/reward-bench/ai2/llama-2-chat-nectar-180k.json/cc2ac405-1710-46fa-aeba-dd86797c666c.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_llama-2-chat-nectar-180k.json/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/llama-2-chat-nectar-180k.json", - "id": "ai2/llama-2-chat-nectar-180k.json", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8827 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2851 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/llama-2-chat-ultrafeedback-60k.jsonl/49fcb3e2-2883-4c3d-b519-d511c6b10162.json b/data/reward-bench/ai2/llama-2-chat-ultrafeedback-60k.jsonl/49fcb3e2-2883-4c3d-b519-d511c6b10162.json deleted file mode 100644 index 4b539edf5..000000000 --- a/data/reward-bench/ai2/llama-2-chat-ultrafeedback-60k.jsonl/49fcb3e2-2883-4c3d-b519-d511c6b10162.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_llama-2-chat-ultrafeedback-60k.jsonl/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/llama-2-chat-ultrafeedback-60k.jsonl", - "id": "ai2/llama-2-chat-ultrafeedback-60k.jsonl", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9441 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5338 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../0ba5ce6c-f311-4b02-a67a-d49539119a8e.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../0ba5ce6c-f311-4b02-a67a-d49539119a8e.json deleted file mode 100644 index 277116c3d..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../0ba5ce6c-f311-4b02-a67a-d49539119a8e.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7058 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7703 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../49029c9e-a831-4219-8e26-df20862ad3e1.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../49029c9e-a831-4219-8e26-df20862ad3e1.json deleted file mode 100644 index dfc9008b5..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../49029c9e-a831-4219-8e26-df20862ad3e1.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7004 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9413 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7716 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../6dedd117-eab0-4c31-b50b-4890099d9904.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../6dedd117-eab0-4c31-b50b-4890099d9904.json deleted file mode 100644 index 974cd9980..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../6dedd117-eab0-4c31-b50b-4890099d9904.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6905 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9441 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7676 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../71c20c06-efb8-428e-9e9d-e4fedf11041a.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../71c20c06-efb8-428e-9e9d-e4fedf11041a.json deleted file mode 100644 index fb2652a50..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../71c20c06-efb8-428e-9e9d-e4fedf11041a.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6945 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9385 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7743 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../862f3d57-8f5f-4372-b6fb-876fb35efba4.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../862f3d57-8f5f-4372-b6fb-876fb35efba4.json deleted file mode 100644 index a8d6993af..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../862f3d57-8f5f-4372-b6fb-876fb35efba4.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6808 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9302 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7527 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../93ea2bfa-e058-42d5-afac-0d3fc50fce91.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../93ea2bfa-e058-42d5-afac-0d3fc50fce91.json deleted file mode 100644 index 4d645ea3b..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../93ea2bfa-e058-42d5-afac-0d3fc50fce91.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6895 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9385 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7595 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c1331fa1-7793-4526-b24b-02261bb4437f.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c1331fa1-7793-4526-b24b-02261bb4437f.json deleted file mode 100644 index f0096c309..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c1331fa1-7793-4526-b24b-02261bb4437f.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7019 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9497 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7811 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c3cab72a-47b3-47ec-bb2d-986903ab8c26.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c3cab72a-47b3-47ec-bb2d-986903ab8c26.json deleted file mode 100644 index 7ccfca2e6..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c3cab72a-47b3-47ec-bb2d-986903ab8c26.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7008 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9385 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../cd0452a7-0370-4024-a51f-b3deff290db9.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../cd0452a7-0370-4024-a51f-b3deff290db9.json deleted file mode 100644 index dddd173cb..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../cd0452a7-0370-4024-a51f-b3deff290db9.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6924 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9441 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json/6fd85045-d600-451f-8d27-da637add4081.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json/6fd85045-d600-451f-8d27-da637add4081.json deleted file mode 100644 index fbf11359b..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json/6fd85045-d600-451f-8d27-da637add4081.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-700k.json/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7127 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9358 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7946 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized.json/a15ca8c3-fd90-4ef9-80c5-40eeac60d785.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized.json/a15ca8c3-fd90-4ef9-80c5-40eeac60d785.json deleted file mode 100644 index 3770ce48e..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized.json/a15ca8c3-fd90-4ef9-80c5-40eeac60d785.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized.json/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized.json", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized.json", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6756 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9134 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3904 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0.json/5f43832f-14fa-49e1-a851-949163aec826.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0.json/5f43832f-14fa-49e1-a851-949163aec826.json deleted file mode 100644 index ffe63e3e4..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0.json/5f43832f-14fa-49e1-a851-949163aec826.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0.json/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0.json", - "id": "ai2/tulu-2-7b-rm-v0.json", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6655 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.933 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6095 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/1f8869e7-e434-469e-906d-d34621582cba.json b/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/1f8869e7-e434-469e-906d-d34621582cba.json deleted file mode 100644 index fef53f2e4..000000000 --- a/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/1f8869e7-e434-469e-906d-d34621582cba.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-70B-Instruct-RM-RB2", - "id": "allenai/Llama-3.1-70B-Instruct-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7606 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8126 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6995 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8844 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8646 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8835 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/8f9d05db-9bb0-4998-bc75-96dbfa695548.json b/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/8f9d05db-9bb0-4998-bc75-96dbfa695548.json deleted file mode 100644 index 419aa0a24..000000000 --- a/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/8f9d05db-9bb0-4998-bc75-96dbfa695548.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { 
- "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-70B-Instruct-RM-RB2", - "id": "allenai/Llama-3.1-70B-Instruct-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9021 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8355 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9095 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8969 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/2681e475-da0a-48a9-ab68-e0bf59240f90.json b/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/2681e475-da0a-48a9-ab68-e0bf59240f90.json deleted file mode 100644 index 77a854ced..000000000 --- a/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/2681e475-da0a-48a9-ab68-e0bf59240f90.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-8B-Base-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", 
- "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-8B-Base-RM-RB2", - "id": "allenai/Llama-3.1-8B-Base-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8267 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8323 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/e2986d78-100d-417a-9f38-9a570a335d95.json b/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/e2986d78-100d-417a-9f38-9a570a335d95.json deleted file mode 100644 index a01839a2f..000000000 --- 
a/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/e2986d78-100d-417a-9f38-9a570a335d95.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_Llama-3.1-8B-Base-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-8B-Base-RM-RB2", - "id": "allenai/Llama-3.1-8B-Base-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8463 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.933 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7785 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8851 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7886 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1bc5cd51-5a3a-46ea-bc78-56f9b3081f69.json b/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1bc5cd51-5a3a-46ea-bc78-56f9b3081f69.json deleted file mode 100644 index 7175068fd..000000000 --- 
a/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1bc5cd51-5a3a-46ea-bc78-56f9b3081f69.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_Llama-3.1-8B-Instruct-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-8B-Instruct-RM-RB2", - "id": "allenai/Llama-3.1-8B-Instruct-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8885 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8158 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8932 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.887 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1d1127ee-7a0e-4915-b8bf-0b22f8ba338b.json b/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1d1127ee-7a0e-4915-b8bf-0b22f8ba338b.json deleted file mode 100644 index 095adf95a..000000000 --- 
a/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1d1127ee-7a0e-4915-b8bf-0b22f8ba338b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-8B-Instruct-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-8B-Instruct-RM-RB2", - "id": "allenai/Llama-3.1-8B-Instruct-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7285 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7432 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9071 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7638 - }, 
- "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/4bb55ff5-5adf-407f-a9d6-910c6c9d2770.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/4bb55ff5-5adf-407f-a9d6-910c6c9d2770.json deleted file mode 100644 index f3bf51149..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/4bb55ff5-5adf-407f-a9d6-910c6c9d2770.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8084 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6776 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8689 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.7778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8308 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/daebee0b-3856-4270-94c6-c14bd84f5cf5.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/daebee0b-3856-4270-94c6-c14bd84f5cf5.json deleted file mode 100644 index d0492cb5e..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/daebee0b-3856-4270-94c6-c14bd84f5cf5.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8892 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8268 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9027 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8583 - }, - "source_data": { 
- "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1be99417-352e-4a94-8108-b43123553667.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1be99417-352e-4a94-8108-b43123553667.json deleted file mode 100644 index 042aac2cb..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1be99417-352e-4a94-8108-b43123553667.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7516 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86 - }, - 
"source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8545 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6397 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/8d3fbc68-2ee7-4989-a40c-f4a45e579b5c.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/8d3fbc68-2ee7-4989-a40c-f4a45e579b5c.json deleted file mode 100644 index 8adafbc18..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/8d3fbc68-2ee7-4989-a40c-f4a45e579b5c.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8431 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9553 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8662 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7898 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/9533891f-c2f7-4e82-9f39-131768dbc28a.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/9533891f-c2f7-4e82-9f39-131768dbc28a.json deleted file mode 100644 index 98a6ce817..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/9533891f-c2f7-4e82-9f39-131768dbc28a.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8369 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9469 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7588 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8703 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7715 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/b8a47660-f0a5-4136-a743-979863c53e3a.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/b8a47660-f0a5-4136-a743-979863c53e3a.json deleted file mode 100644 index d3b513f5d..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/b8a47660-f0a5-4136-a743-979863c53e3a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6871 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7642 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8644 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8485 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6281 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RM/2673bea2-42eb-42a5-9dc2-13d43341c9b2.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RM/2673bea2-42eb-42a5-9dc2-13d43341c9b2.json deleted file mode 100644 index e3e043728..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RM/2673bea2-42eb-42a5-9dc2-13d43341c9b2.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-RM/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": 
"RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-8B-RM", - "id": "allenai/Llama-3.1-Tulu-3-8B-RM", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7453 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3469 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5364 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/6f5555c2-588a-48d1-811c-be53634bbdef.json 
b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/6f5555c2-588a-48d1-811c-be53634bbdef.json deleted file mode 100644 index 44e1a6e59..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/6f5555c2-588a-48d1-811c-be53634bbdef.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8551 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9497 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7917 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8784 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8005 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/9c96fa7b-52e8-4aed-9fdd-f389091d5e6f.json 
b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/9c96fa7b-52e8-4aed-9fdd-f389091d5e6f.json deleted file mode 100644 index 674d59e88..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/9c96fa7b-52e8-4aed-9fdd-f389091d5e6f.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8978 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8889 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to 
identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/OLMo-7B-Instruct/0519d9fb-f220-40ab-8257-f20ed98a8b47.json b/data/reward-bench/allenai/OLMo-7B-Instruct/0519d9fb-f220-40ab-8257-f20ed98a8b47.json deleted file mode 100644 index f8ff8a104..000000000 --- a/data/reward-bench/allenai/OLMo-7B-Instruct/0519d9fb-f220-40ab-8257-f20ed98a8b47.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_OLMo-7B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/OLMo-7B-Instruct", - "id": "allenai/OLMo-7B-Instruct", - "developer": "allenai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6727 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8966 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6486 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7168 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5173 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/llama-3-tulu-2-70b-uf-mean-rm/ece70375-447f-41e8-aa03-8f4b26abea73.json b/data/reward-bench/allenai/llama-3-tulu-2-70b-uf-mean-rm/ece70375-447f-41e8-aa03-8f4b26abea73.json deleted file mode 100644 index 5110dfc2e..000000000 --- a/data/reward-bench/allenai/llama-3-tulu-2-70b-uf-mean-rm/ece70375-447f-41e8-aa03-8f4b26abea73.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_llama-3-tulu-2-70b-uf-mean-rm/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/llama-3-tulu-2-70b-uf-mean-rm", - "id": "allenai/llama-3-tulu-2-70b-uf-mean-rm", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7019 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8631 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5614 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6095 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8268 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5957 - }, - 
"source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/llama-3-tulu-2-8b-uf-mean-rm/7bbaffdd-f822-48cf-a0f2-e66b16db678d.json b/data/reward-bench/allenai/llama-3-tulu-2-8b-uf-mean-rm/7bbaffdd-f822-48cf-a0f2-e66b16db678d.json deleted file mode 100644 index e1917bdfd..000000000 --- a/data/reward-bench/allenai/llama-3-tulu-2-8b-uf-mean-rm/7bbaffdd-f822-48cf-a0f2-e66b16db678d.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_llama-3-tulu-2-8b-uf-mean-rm/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/llama-3-tulu-2-8b-uf-mean-rm", - "id": "allenai/llama-3-tulu-2-8b-uf-mean-rm", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7342 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5921 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6162 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8212 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6434 - }, - "source_data": { - "dataset_name": "RewardBench", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/llama-3-tulu-2-dpo-70b/27c5c441-64ce-41dd-8384-f84c8f6ccc14.json b/data/reward-bench/allenai/llama-3-tulu-2-dpo-70b/27c5c441-64ce-41dd-8384-f84c8f6ccc14.json deleted file mode 100644 index bef93ec9e..000000000 --- a/data/reward-bench/allenai/llama-3-tulu-2-dpo-70b/27c5c441-64ce-41dd-8384-f84c8f6ccc14.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_llama-3-tulu-2-dpo-70b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/llama-3-tulu-2-dpo-70b", - "id": "allenai/llama-3-tulu-2-dpo-70b", - "developer": "allenai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7496 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5746 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7486 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5687 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git 
a/data/reward-bench/allenai/llama-3-tulu-2-dpo-8b/38a14e6a-2094-4e0b-be22-45181ede2a63.json b/data/reward-bench/allenai/llama-3-tulu-2-dpo-8b/38a14e6a-2094-4e0b-be22-45181ede2a63.json deleted file mode 100644 index a54ed9cc6..000000000 --- a/data/reward-bench/allenai/llama-3-tulu-2-dpo-8b/38a14e6a-2094-4e0b-be22-45181ede2a63.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_llama-3-tulu-2-dpo-8b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/llama-3-tulu-2-dpo-8b", - "id": "allenai/llama-3-tulu-2-dpo-8b", - "developer": "allenai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7275 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5351 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6649 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8663 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git 
a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739590997/cee37c2c-2766-47b7-9192-a141e5d22f2d.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739590997/cee37c2c-2766-47b7-9192-a141e5d22f2d.json deleted file mode 100644 index 264f422e1..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739590997/cee37c2c-2766-47b7-9192-a141e5d22f2d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739590997/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739590997", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739590997", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6004 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7032 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7867 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598 - }, - "source_data": { - "dataset_name": 
"RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5165 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739871066/d1d69392-8717-462d-9ce0-c7ddf5faf97d.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739871066/d1d69392-8717-462d-9ce0-c7ddf5faf97d.json deleted file mode 100644 index 1d7e43d9e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739871066/d1d69392-8717-462d-9ce0-c7ddf5faf97d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739871066/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739871066", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739871066", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6012 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6989 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7978 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.604 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4527 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739925892/72071bb1-57c0-4727-8100-ba24d8da10f5.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739925892/72071bb1-57c0-4727-8100-ba24d8da10f5.json deleted file mode 100644 index ccb6f9252..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739925892/72071bb1-57c0-4727-8100-ba24d8da10f5.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739925892/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739925892", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739925892", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6345 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7432 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8111 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7131 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5606 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943850/7626c158-edaf-48f3-9ac3-1188be0c6032.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943850/7626c158-edaf-48f3-9ac3-1188be0c6032.json deleted file mode 100644 index 8e0cbdf9e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943850/7626c158-edaf-48f3-9ac3-1188be0c6032.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739943850/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739943850", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739943850", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5726 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6489 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3114 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943881/c37be7a8-dc10-4fea-962b-202986a4581e.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943881/c37be7a8-dc10-4fea-962b-202986a4581e.json deleted file mode 100644 index b0fd4a9be..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943881/c37be7a8-dc10-4fea-962b-202986a4581e.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739943881/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739943881", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739943881", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5998 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7032 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3187 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.6727 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5025 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943972/223dc616-b20f-4065-91a7-3c35bfd11c94.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943972/223dc616-b20f-4065-91a7-3c35bfd11c94.json deleted file mode 100644 index c0ad13c6c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943972/223dc616-b20f-4065-91a7-3c35bfd11c94.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739943972/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739943972", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739943972", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5289 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6168 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5738 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6844 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5657 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3577 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739957701/4236b0a9-9d1e-41f6-8364-a7e8ebf51635.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739957701/4236b0a9-9d1e-41f6-8364-a7e8ebf51635.json deleted file mode 100644 index 6eb775d20..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739957701/4236b0a9-9d1e-41f6-8364-a7e8ebf51635.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739957701/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739957701", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739957701", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6194 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6779 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6011 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8022 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.697 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5822 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971507/c8030a87-0cdf-4918-b0d5-d1fb0e284656.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971507/c8030a87-0cdf-4918-b0d5-d1fb0e284656.json deleted file mode 100644 index c409d89eb..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971507/c8030a87-0cdf-4918-b0d5-d1fb0e284656.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739971507/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739971507", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739971507", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5717 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5475 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971529/e6ecc1eb-7ff1-46aa-bf03-37bad1b391b7.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971529/e6ecc1eb-7ff1-46aa-bf03-37bad1b391b7.json deleted file mode 100644 index c8518a2e2..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971529/e6ecc1eb-7ff1-46aa-bf03-37bad1b391b7.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739971529/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739971529", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739971529", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5564 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6568 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7533 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.5737 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739998765/64872b1a-1eae-4171-95ec-a80c782b69f0.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739998765/64872b1a-1eae-4171-95ec-a80c782b69f0.json deleted file mode 100644 index b41f9844e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739998765/64872b1a-1eae-4171-95ec-a80c782b69f0.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739998765/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739998765", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739998765", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6008 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7095 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8022 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5859 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4883 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740005072/37484401-c7fe-469d-889a-e70f7cadbf82.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740005072/37484401-c7fe-469d-889a-e70f7cadbf82.json deleted file mode 100644 index 04e0eceb6..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740005072/37484401-c7fe-469d-889a-e70f7cadbf82.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1740005072/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1740005072", - "id": "allenai/open_instruct_dev-reward_modeling__1__1740005072", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6097 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7137 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6343 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740129284/8cf36288-3add-4fcd-a012-0df9eae2a059.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740129284/8cf36288-3add-4fcd-a012-0df9eae2a059.json deleted file mode 100644 index a39194f60..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740129284/8cf36288-3add-4fcd-a012-0df9eae2a059.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1740129284/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1740129284", - "id": "allenai/open_instruct_dev-reward_modeling__1__1740129284", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6129 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7116 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8022 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4652 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741286813/f2c8f979-c331-4b9b-b0a7-5efa82c17d3b.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741286813/f2c8f979-c331-4b9b-b0a7-5efa82c17d3b.json deleted file mode 100644 index 6657f9ea6..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741286813/f2c8f979-c331-4b9b-b0a7-5efa82c17d3b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1741286813/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1741286813", - "id": "allenai/open_instruct_dev-reward_modeling__1__1741286813", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6295 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9111 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.8263 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741287363/de409ce8-fb68-4113-8879-23712769cbde.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741287363/de409ce8-fb68-4113-8879-23712769cbde.json deleted file mode 100644 index 56ed0daae..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741287363/de409ce8-fb68-4113-8879-23712769cbde.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1741287363/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1741287363", - "id": "allenai/open_instruct_dev-reward_modeling__1__1741287363", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6672 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6295 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9374 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5748 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741292911/264f20d7-1574-448c-8917-eb3f20810819.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741292911/264f20d7-1574-448c-8917-eb3f20810819.json deleted file mode 100644 index d217f0d3d..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741292911/264f20d7-1574-448c-8917-eb3f20810819.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1741292911/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1741292911", - "id": "allenai/open_instruct_dev-reward_modeling__1__1741292911", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6607 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6589 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9089 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8869 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5028 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742338142/0ebaec42-9190-4326-95dd-5ecb48bf1a72.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742338142/0ebaec42-9190-4326-95dd-5ecb48bf1a72.json deleted file mode 100644 index 4897a8825..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742338142/0ebaec42-9190-4326-95dd-5ecb48bf1a72.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1742338142/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1742338142", - "id": "allenai/open_instruct_dev-reward_modeling__1__1742338142", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6344 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7049 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6323 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519610/29515933-c60b-4686-b475-70ef53d75457.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519610/29515933-c60b-4686-b475-70ef53d75457.json deleted file mode 100644 index e2f564fd7..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519610/29515933-c60b-4686-b475-70ef53d75457.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1742519610/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1742519610", - "id": "allenai/open_instruct_dev-reward_modeling__1__1742519610", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6361 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7074 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6721 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.6444 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5915 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519628/414174a9-7e44-4f7b-94ce-0757639f5af7.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519628/414174a9-7e44-4f7b-94ce-0757639f5af7.json deleted file mode 100644 index e92fdfc16..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519628/414174a9-7e44-4f7b-94ce-0757639f5af7.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1742519628/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1742519628", - "id": "allenai/open_instruct_dev-reward_modeling__1__1742519628", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5609 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5179 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8356 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5254 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455/48513083-f854-455e-8455-ddbd2698ec03.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455/48513083-f854-455e-8455-ddbd2698ec03.json deleted file mode 100644 index c75b5a4dd..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455/48513083-f854-455e-8455-ddbd2698ec03.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455", - "id": "allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0576 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1313 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0546 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0489 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -0.01 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511/0b373560-854f-4482-81d0-6c984e130144.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511/0b373560-854f-4482-81d0-6c984e130144.json deleted file mode 100644 index d30d94d18..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511/0b373560-854f-4482-81d0-6c984e130144.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511", - "id": "allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5499 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7356 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5212 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406/1a021cab-d569-4077-af5e-1643f45de03d.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406/1a021cab-d569-4077-af5e-1643f45de03d.json deleted file mode 100644 index d5744d0a2..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406/1a021cab-d569-4077-af5e-1643f45de03d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406", - "id": "allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5054 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6358 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6867 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2922 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136/e26e230d-59b3-4243-a6c4-3845ab74b89b.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136/e26e230d-59b3-4243-a6c4-3845ab74b89b.json deleted file mode 100644 index 1263be4bc..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136/e26e230d-59b3-4243-a6c4-3845ab74b89b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136", - "id": "allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6442 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6356 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2707 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398/aa0991d0-9c5e-4f94-bc12-3342ca389e99.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398/aa0991d0-9c5e-4f94-bc12-3342ca389e99.json deleted file mode 100644 index 8bd48e77c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398/aa0991d0-9c5e-4f94-bc12-3342ca389e99.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398", - "id": "allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2484 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1717 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.008 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535/397abe47-d5e9-487d-b883-ec49db16c584.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535/397abe47-d5e9-487d-b883-ec49db16c584.json deleted file mode 100644 index a81128f70..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535/397abe47-d5e9-487d-b883-ec49db16c584.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535", - "id": "allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6011 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7511 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5313 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054/82f52a35-41b5-4b9c-bb3e-4bf18eed0b92.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054/82f52a35-41b5-4b9c-bb3e-4bf18eed0b92.json deleted file mode 100644 index 53333e181..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054/82f52a35-41b5-4b9c-bb3e-4bf18eed0b92.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_dpo__1__1743550054/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054", - "id": "allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5759 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7074 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - 
measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7578 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5333 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271/670382ab-a8a1-43f3-a572-b9a5aeae23ef.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271/670382ab-a8a1-43f3-a572-b9a5aeae23ef.json deleted file mode 100644 index ba5b508f6..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271/670382ab-a8a1-43f3-a572-b9a5aeae23ef.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271", - "id": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6057 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5053 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5902 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7798 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181/a4b3c031-7c01-4f7a-8cfe-52b3260d6ecc.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181/a4b3c031-7c01-4f7a-8cfe-52b3260d6ecc.json deleted file mode 100644 index 55a7568d2..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181/a4b3c031-7c01-4f7a-8cfe-52b3260d6ecc.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181", - "id": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6535 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7137 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8244 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7737 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221/7fcd3fce-2296-4b5c-8362-24b1c70ccb8f.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221/7fcd3fce-2296-4b5c-8362-24b1c70ccb8f.json deleted file mode 100644 index b781deca8..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221/7fcd3fce-2296-4b5c-8362-24b1c70ccb8f.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_rl__1__1743551221/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221", - "id": "allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5799 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7116 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5374 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262/4f164e8b-55a1-498f-b586-cf78da7d0b57.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262/4f164e8b-55a1-498f-b586-cf78da7d0b57.json deleted file mode 100644 index 9f032475e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262/4f164e8b-55a1-498f-b586-cf78da7d0b57.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262", - "id": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5903 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4863 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5738 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - 
"evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8489 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523/a84d3d61-6e05-4d4d-bc89-7f663e9667fb.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523/a84d3d61-6e05-4d4d-bc89-7f663e9667fb.json deleted file mode 100644 index 9d860b297..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523/a84d3d61-6e05-4d4d-bc89-7f663e9667fb.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523", - "id": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6483 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7074 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7758 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6044 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750/7aa98f71-8262-4c1f-a71c-1ef36f2ef04c.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750/7aa98f71-8262-4c1f-a71c-1ef36f2ef04c.json deleted file mode 100644 index 7961d2bec..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750/7aa98f71-8262-4c1f-a71c-1ef36f2ef04c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750", - "id": "allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5157 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7089 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427/93398c1f-3129-4be4-83b5-62a4a45c6b84.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427/93398c1f-3129-4be4-83b5-62a4a45c6b84.json deleted file mode 100644 index 9a7aa8751..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427/93398c1f-3129-4be4-83b5-62a4a45c6b84.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427", - "id": "allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6009 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7263 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5902 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7933 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7273 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3931 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446/62493784-f899-4736-bdce-2107ec99a752.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446/62493784-f899-4736-bdce-2107ec99a752.json deleted file mode 100644 index 0629539f9..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446/62493784-f899-4736-bdce-2107ec99a752.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446", - "id": "allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5716 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6779 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5464 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7533 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7051 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3534 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094/9b68ecaa-cf9d-414e-9cf1-c662c765bb5c.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094/9b68ecaa-cf9d-414e-9cf1-c662c765bb5c.json deleted file mode 100644 index bca56ed2b..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094/9b68ecaa-cf9d-414e-9cf1-c662c765bb5c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094", - "id": "allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5151 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6484 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5574 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7289 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4889 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3357 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636/76f3d0bd-2b71-4406-a0d4-b01b6c91c4ff.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636/76f3d0bd-2b71-4406-a0d4-b01b6c91c4ff.json deleted file mode 100644 index 54fc2bfbe..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636/76f3d0bd-2b71-4406-a0d4-b01b6c91c4ff.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636", - "id": "allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6119 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8067 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.6889 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325/2dc5ab6f-2427-42ae-9582-a0e6139f451a.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325/2dc5ab6f-2427-42ae-9582-a0e6139f451a.json deleted file mode 100644 index 55ddbdfe8..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325/2dc5ab6f-2427-42ae-9582-a0e6139f451a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_dpo__1__1743549325/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325", - "id": "allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6008 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7179 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures 
safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6707 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4707 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238/0db97be6-6562-47d8-bd1a-5b469250e54b.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238/0db97be6-6562-47d8-bd1a-5b469250e54b.json deleted file mode 100644 index 4b4ef4368..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238/0db97be6-6562-47d8-bd1a-5b469250e54b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_rl__1__1743551238/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238", - "id": "allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5965 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7095 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8044 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6566 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.453 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906/228e4dc4-e517-4023-b690-7f0c321286b2.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906/228e4dc4-e517-4023-b690-7f0c321286b2.json deleted file mode 100644 index 2cf126b25..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906/228e4dc4-e517-4023-b690-7f0c321286b2.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906", - "id": "allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5574 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6526 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6011 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7711 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5051 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529/9442b27c-c94d-41c0-a752-3bd82385272d.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529/9442b27c-c94d-41c0-a752-3bd82385272d.json deleted file mode 100644 index 8dc1a9073..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529/9442b27c-c94d-41c0-a752-3bd82385272d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529", - "id": "allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0719 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0421 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0601 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0949 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -0.01 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305/561039ac-b156-40eb-bf53-21a275b858ca.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305/561039ac-b156-40eb-bf53-21a275b858ca.json deleted file mode 100644 index 5f62f67ab..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305/561039ac-b156-40eb-bf53-21a275b858ca.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305", - "id": "allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6733 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5697 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4227 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778/d801d700-7b4d-4a62-883b-3d85b05385ea.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778/d801d700-7b4d-4a62-883b-3d85b05385ea.json deleted file mode 100644 index 431df7a47..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778/d801d700-7b4d-4a62-883b-3d85b05385ea.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778", - "id": "allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6189 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6378 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5657 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459/b8f24058-4441-4d19-898e-80470cc7b685.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459/b8f24058-4441-4d19-898e-80470cc7b685.json deleted file mode 100644 index 6552eeea3..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459/b8f24058-4441-4d19-898e-80470cc7b685.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459", - "id": "allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5747 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5464 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4933 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2073 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747/1f372e00-e7a8-43ef-8e14-ef1b08e5e957.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747/1f372e00-e7a8-43ef-8e14-ef1b08e5e957.json deleted file mode 100644 index 6c4f2b7cc..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747/1f372e00-e7a8-43ef-8e14-ef1b08e5e957.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747", - "id": "allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7333 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5051 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935/0200a1b3-71f1-4633-96a5-4ca9883a67a7.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935/0200a1b3-71f1-4633-96a5-4ca9883a67a7.json deleted file mode 100644 index aabf71c3b..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935/0200a1b3-71f1-4633-96a5-4ca9883a67a7.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935", - "id": "allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5197 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6126 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5847 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7333 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4646 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3855 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360/55479901-aec7-4875-b792-ba73b54aa37a.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360/55479901-aec7-4875-b792-ba73b54aa37a.json deleted file mode 100644 index a1a7dd25e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360/55479901-aec7-4875-b792-ba73b54aa37a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360", - "id": "allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5495 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5711 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2696 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366/872597b2-4392-4f23-b5b2-41d418b6cf89.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366/872597b2-4392-4f23-b5b2-41d418b6cf89.json deleted file mode 100644 index f5aa5f436..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366/872597b2-4392-4f23-b5b2-41d418b6cf89.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366", - "id": "allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5053 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4044 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6646 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1991 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352/5cb437b5-5993-418d-bd9f-81dea71d9edf.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352/5cb437b5-5993-418d-bd9f-81dea71d9edf.json deleted file mode 100644 index a0af647f5..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352/5cb437b5-5993-418d-bd9f-81dea71d9edf.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352", - "id": "allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.341 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3333 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3919 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.195 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634/c471cdf7-73f9-48c9-a970-baa66b609093.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634/c471cdf7-73f9-48c9-a970-baa66b609093.json deleted file mode 100644 index c1a864b6a..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634/c471cdf7-73f9-48c9-a970-baa66b609093.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634", - "id": "allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4698 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5853 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2562 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5027 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6489 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5697 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2562 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988/794a71b4-8a43-4c69-a663-369eea6a84a3.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988/794a71b4-8a43-4c69-a663-369eea6a84a3.json deleted file mode 100644 index 4f624253c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988/794a71b4-8a43-4c69-a663-369eea6a84a3.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988", - "id": "allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4791 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6421 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.541 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6911 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4182 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103/2ad22375-4ed8-4be6-a012-a6f6799581e2.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103/2ad22375-4ed8-4be6-a012-a6f6799581e2.json deleted file mode 100644 index da7381bc1..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103/2ad22375-4ed8-4be6-a012-a6f6799581e2.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103", - "id": "allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0607 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0274 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0656 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0788 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -0.01 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835/a8df0dc2-d16c-4e1a-b0b5-abe2a4a1d803.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835/a8df0dc2-d16c-4e1a-b0b5-abe2a4a1d803.json deleted file mode 100644 index a04867707..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835/a8df0dc2-d16c-4e1a-b0b5-abe2a4a1d803.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835", - "id": "allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6089 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7622 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.6444 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221/ca0a010a-fe3a-4b87-8c80-4a8d3e2597fb.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221/ca0a010a-fe3a-4b87-8c80-4a8d3e2597fb.json deleted file mode 100644 index de5939755..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221/ca0a010a-fe3a-4b87-8c80-4a8d3e2597fb.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221", - "id": "allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6032 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7158 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5859 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5051 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826/5d1c166c-6a22-4afb-b1b1-f7db9ec38bd8.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826/5d1c166c-6a22-4afb-b1b1-f7db9ec38bd8.json deleted file mode 100644 index 7f0d231a3..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826/5d1c166c-6a22-4afb-b1b1-f7db9ec38bd8.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826", - "id": "allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5831 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6947 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5758 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4465 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363/10a432fa-dfef-4c9c-bdf7-ce0f81fd1895.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363/10a432fa-dfef-4c9c-bdf7-ce0f81fd1895.json deleted file mode 100644 index 25ae9470b..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363/10a432fa-dfef-4c9c-bdf7-ce0f81fd1895.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363", - "id": "allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5268 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7178 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498/a550663c-2a04-4dfb-8663-b177a7181f3d.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498/a550663c-2a04-4dfb-8663-b177a7181f3d.json deleted file mode 100644 index 0676aafc3..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498/a550663c-2a04-4dfb-8663-b177a7181f3d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498", - "id": "allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6093 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7578 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5859 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5143 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__2__1743897475/72b6196e-0a2b-4ec9-80a3-a7eb14f7be09.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__2__1743897475/72b6196e-0a2b-4ec9-80a3-a7eb14f7be09.json deleted file mode 100644 index 42e9ce0e8..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__2__1743897475/72b6196e-0a2b-4ec9-80a3-a7eb14f7be09.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1__2__1743897475/1766412838.146816", - 
"retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1__2__1743897475", - "id": "allenai/open_instruct_dev-rm_3e-6_1__2__1743897475", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6122 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7368 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8044 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__3__1744311421/5e41f068-f009-4e32-bac1-9de5220a2ce2.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__3__1744311421/5e41f068-f009-4e32-bac1-9de5220a2ce2.json deleted file mode 100644 index c9eb27faa..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__3__1744311421/5e41f068-f009-4e32-bac1-9de5220a2ce2.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1__3__1744311421/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1__3__1744311421", - "id": "allenai/open_instruct_dev-rm_3e-6_1__3__1744311421", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5995 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7179 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6323 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903/eca1331f-6503-481a-b77b-3d96791f54e8.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903/eca1331f-6503-481a-b77b-3d96791f54e8.json deleted file mode 100644 index 86624d603..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903/eca1331f-6503-481a-b77b-3d96791f54e8.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_dpo__1__1743549903/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903", - "id": "allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6154 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.7778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6061 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5043 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368/69def7de-a916-4d23-984b-e676e91e1d8c.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368/69def7de-a916-4d23-984b-e676e91e1d8c.json deleted file mode 100644 index ade9d6695..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368/69def7de-a916-4d23-984b-e676e91e1d8c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368", - "id": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6604 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6316 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9044 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8929 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5604 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182/679c6e0b-9e0b-4224-b1e3-59df149739a0.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182/679c6e0b-9e0b-4224-b1e3-59df149739a0.json deleted file mode 100644 index c5d2892d2..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182/679c6e0b-9e0b-4224-b1e3-59df149739a0.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182", - "id": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6783 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7705 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8101 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6427 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012/2335433d-37c6-47f0-ad3b-5e0a42e9488f.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012/2335433d-37c6-47f0-ad3b-5e0a42e9488f.json deleted file mode 100644 index e2e75967a..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012/2335433d-37c6-47f0-ad3b-5e0a42e9488f.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_no_if__2__1744316012/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012", - "id": "allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5911 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7347 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.604 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765/fe84f8a3-5fe9-4385-b6d4-0436fb7e5197.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765/fe84f8a3-5fe9-4385-b6d4-0436fb7e5197.json deleted file mode 100644 index 2d3cbe13a..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765/fe84f8a3-5fe9-4385-b6d4-0436fb7e5197.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_no_if__3__1744315765/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765", - "id": "allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5926 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7263 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures 
safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5879 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4733 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527/70d2697e-0df5-40ae-9268-b906c9cabd9d.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527/70d2697e-0df5-40ae-9268-b906c9cabd9d.json deleted file mode 100644 index d305acb78..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527/70d2697e-0df5-40ae-9268-b906c9cabd9d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_rl__1__1743551527/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527", - "id": "allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6126 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7411 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7822 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5939 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5104 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236/0a30fd70-2381-4a4b-89aa-dbd169c856f0.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236/0a30fd70-2381-4a4b-89aa-dbd169c856f0.json deleted file mode 100644 index bf0b750c2..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236/0a30fd70-2381-4a4b-89aa-dbd169c856f0.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236", - "id": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6525 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6021 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8933 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8626 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530/b9c787f9-3bcd-4215-a157-7fcfa2df82cc.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530/b9c787f9-3bcd-4215-a157-7fcfa2df82cc.json deleted file mode 100644 index bf056a3a5..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530/b9c787f9-3bcd-4215-a157-7fcfa2df82cc.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530", - "id": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6849 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7453 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8404 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6885 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417/bdd98f27-fbfd-4de7-bd4e-3b8c3e4e7cc0.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417/bdd98f27-fbfd-4de7-bd4e-3b8c3e4e7cc0.json deleted file mode 100644 index b8d8f17e6..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417/bdd98f27-fbfd-4de7-bd4e-3b8c3e4e7cc0.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417", - "id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6632 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5172 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.477 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486/44b20109-d534-4aa9-867d-fa59935ef6d0.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486/44b20109-d534-4aa9-867d-fa59935ef6d0.json deleted file mode 100644 index 5bc8800b1..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486/44b20109-d534-4aa9-867d-fa59935ef6d0.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486", - "id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6773 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7432 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6626 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745/d1196312-4153-4a38-aa46-2940d63d7924.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745/d1196312-4153-4a38-aa46-2940d63d7924.json deleted file mode 100644 index 620c1403f..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745/d1196312-4153-4a38-aa46-2940d63d7924.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745", - "id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6793 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7558 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8311 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8061 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6485 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661/4b1e3070-04ef-47e7-b720-739320194e7b.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661/4b1e3070-04ef-47e7-b720-739320194e7b.json deleted file mode 100644 index 93ad54ca5..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661/4b1e3070-04ef-47e7-b720-739320194e7b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661", - "id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6611 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6393 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8444 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7636 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6428 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472/247f400e-dca8-4dab-bebf-092f778f02c9.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472/247f400e-dca8-4dab-bebf-092f778f02c9.json deleted file mode 100644 index 12de82f8f..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472/247f400e-dca8-4dab-bebf-092f778f02c9.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472", - "id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6011 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7933 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5172 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5003 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267/d043ad21-102b-49f0-9e8e-6daef7cc3a2e.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267/d043ad21-102b-49f0-9e8e-6daef7cc3a2e.json deleted file mode 100644 index 7f64660cc..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267/d043ad21-102b-49f0-9e8e-6daef7cc3a2e.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267", - "id": "allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5746 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6505 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7844 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7414 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4128 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759/d45ec8b8-1ee6-49bb-9237-a7271ba9d13c.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759/d45ec8b8-1ee6-49bb-9237-a7271ba9d13c.json deleted file mode 100644 index cd842e3b9..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759/d45ec8b8-1ee6-49bb-9237-a7271ba9d13c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759", - "id": "allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6065 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7116 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8178 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7152 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.465 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905/05a4c6aa-9af2-44f0-8c55-8aeed2e75eaf.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905/05a4c6aa-9af2-44f0-8c55-8aeed2e75eaf.json deleted file mode 100644 index eb5fdd21c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905/05a4c6aa-9af2-44f0-8c55-8aeed2e75eaf.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905", - "id": "allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5305 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5832 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7178 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7071 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3849 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363/a6ef712e-014e-470e-8d5b-f3b51f677aee.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363/a6ef712e-014e-470e-8d5b-f3b51f677aee.json deleted file mode 100644 index f42f0f831..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363/a6ef712e-014e-470e-8d5b-f3b51f677aee.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363", - "id": "allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3115 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6267 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5414 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505/35a039ba-06be-4ec2-9bde-a6a6db2eefec.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505/35a039ba-06be-4ec2-9bde-a6a6db2eefec.json deleted file mode 100644 index f6b4dff52..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505/35a039ba-06be-4ec2-9bde-a6a6db2eefec.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505", - "id": "allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5925 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5519 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7434 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180/97cb96f8-ce4c-403f-bfbc-386d3c611c81.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180/97cb96f8-ce4c-403f-bfbc-386d3c611c81.json deleted file mode 100644 index d291b202f..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180/97cb96f8-ce4c-403f-bfbc-386d3c611c81.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_dpo__1__1743550180/1766412838.146816", - 
"retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180", - "id": "allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6198 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7263 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8133 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7232 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4908 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187/3a1621e9-75ee-4b34-9c0d-ae15399b1dab.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187/3a1621e9-75ee-4b34-9c0d-ae15399b1dab.json deleted file mode 100644 index 0d4d4902f..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187/3a1621e9-75ee-4b34-9c0d-ae15399b1dab.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187", - "id": "allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6763 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7411 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8844 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.8545 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5908 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509/237218ac-4c74-4647-82b1-700360ddfdbd.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509/237218ac-4c74-4647-82b1-700360ddfdbd.json deleted file mode 100644 index 6717d04eb..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509/237218ac-4c74-4647-82b1-700360ddfdbd.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_rl__1__1743551509/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509", - "id": "allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6245 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7242 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety 
awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8178 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7253 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5124 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498/2858d126-d2ef-4512-8fc8-c39faf24b908.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498/2858d126-d2ef-4512-8fc8-c39faf24b908.json deleted file mode 100644 index 7e1595151..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498/2858d126-d2ef-4512-8fc8-c39faf24b908.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498", - "id": "allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6673 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8622 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8566 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5911 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926/d118ddb1-aafc-4ddf-b5c7-f3ff921bbe0c.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926/d118ddb1-aafc-4ddf-b5c7-f3ff921bbe0c.json deleted file mode 100644 index 599698f0e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926/d118ddb1-aafc-4ddf-b5c7-f3ff921bbe0c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926", - "id": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5863 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5515 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661/379ec82f-a6a7-4976-a4a6-ab80cb9da293.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661/379ec82f-a6a7-4976-a4a6-ab80cb9da293.json deleted file mode 100644 index aa704ecb8..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661/379ec82f-a6a7-4976-a4a6-ab80cb9da293.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661", - "id": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6842 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6393 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7867 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6081 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.447 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598/c4df42d1-a838-4717-a814-40559fcd7342.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598/c4df42d1-a838-4717-a814-40559fcd7342.json deleted file mode 100644 index 1c2c98ef1..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598/c4df42d1-a838-4717-a814-40559fcd7342.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598", - "id": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7306 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7474 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8622 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8061 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8992 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923/f022d826-3252-4def-b37b-3ce44d78f4ce.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923/f022d826-3252-4def-b37b-3ce44d78f4ce.json deleted file mode 100644 index a60528170..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923/f022d826-3252-4def-b37b-3ce44d78f4ce.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923", - "id": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7573 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8168 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7049 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8733 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8545 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8814 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628/cecc321b-efbd-434e-8a31-a97bbb8bbb3b.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628/cecc321b-efbd-434e-8a31-a97bbb8bbb3b.json deleted file mode 100644 index 7523cf126..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628/cecc321b-efbd-434e-8a31-a97bbb8bbb3b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_1e-6_1__1__1743896628/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628", - "id": "allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6637 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6947 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.7273 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6834 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999/278c2132-3415-48f4-a839-ed09d71e9240.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999/278c2132-3415-48f4-a839-ed09d71e9240.json deleted file mode 100644 index 884f2c1d7..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999/278c2132-3415-48f4-a839-ed09d71e9240.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999", - "id": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6665 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5979 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8606 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777/92bbda1a-ecb1-493d-aa39-a29522c1a11e.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777/92bbda1a-ecb1-493d-aa39-a29522c1a11e.json deleted file mode 100644 index 58d203adf..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777/92bbda1a-ecb1-493d-aa39-a29522c1a11e.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777", - "id": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7038 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6947 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8867 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8586 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7331 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638/f43b2dff-9e73-4779-86e0-b2cc30ae8b40.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638/f43b2dff-9e73-4779-86e0-b2cc30ae8b40.json deleted file mode 100644 index eb2f9451c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638/f43b2dff-9e73-4779-86e0-b2cc30ae8b40.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_1e-6_2__1__1743896638/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638", - "id": "allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6754 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6716 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8756 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.7737 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6976 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938/59a98f5d-d017-4b1a-a563-5abd113337e9.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938/59a98f5d-d017-4b1a-a563-5abd113337e9.json deleted file mode 100644 index b741da56e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938/59a98f5d-d017-4b1a-a563-5abd113337e9.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938", - "id": "allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7241 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9414 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6635 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885/a41597ed-fbab-41af-9625-c277ca988546.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885/a41597ed-fbab-41af-9625-c277ca988546.json deleted file mode 100644 index 4c6ec4ab3..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885/a41597ed-fbab-41af-9625-c277ca988546.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885", - "id": "allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6716 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6632 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8303 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773/e311eb59-f217-4bc2-b69b-dcea434797a8.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773/e311eb59-f217-4bc2-b69b-dcea434797a8.json deleted file mode 100644 index 92fb6052e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773/e311eb59-f217-4bc2-b69b-dcea434797a8.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773", - "id": "allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6207 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6358 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5902 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8267 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response 
focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4948 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867/69b037c3-bae2-4889-b10d-e732c45851e9.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867/69b037c3-bae2-4889-b10d-e732c45851e9.json deleted file mode 100644 index 5c7a0911d..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867/69b037c3-bae2-4889-b10d-e732c45851e9.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867", - "id": "allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7263 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6393 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9273 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.738 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424/adeee000-0b62-4a0c-afaa-5e8c5f29ff6d.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424/adeee000-0b62-4a0c-afaa-5e8c5f29ff6d.json deleted file mode 100644 index 8b895898b..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424/adeee000-0b62-4a0c-afaa-5e8c5f29ff6d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1__1__1743929424/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6572 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8289 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6837 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395/4464d588-62b2-440b-8188-2450bd7a94c5.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395/4464d588-62b2-440b-8188-2450bd7a94c5.json deleted file mode 100644 index cba9daba8..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395/4464d588-62b2-440b-8188-2450bd7a94c5.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1__2__1744311395/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6938 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7537 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6393 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.7616 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6913 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491/bf358648-a41d-43ee-8c14-f8b8eef41871.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491/bf358648-a41d-43ee-8c14-f8b8eef41871.json deleted file mode 100644 index acca1c710..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491/bf358648-a41d-43ee-8c14-f8b8eef41871.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1__3__1744311491/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6754 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7242 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7535 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6976 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787/afd99f12-f739-40d3-aa11-ef3a45316931.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787/afd99f12-f739-40d3-aa11-ef3a45316931.json deleted file mode 100644 index d8abd1886..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787/afd99f12-f739-40d3-aa11-ef3a45316931.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7045 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6253 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9232 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7109 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461/49b4a24b-ddf1-47f0-ba39-9366892a1213.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461/49b4a24b-ddf1-47f0-ba39-9366892a1213.json deleted file mode 100644 index 68019510c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461/49b4a24b-ddf1-47f0-ba39-9366892a1213.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7189 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8978 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9374 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7475 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780/ea14a487-39c3-488b-b52b-998e57135487.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780/ea14a487-39c3-488b-b52b-998e57135487.json deleted file mode 100644 index cb7ea678a..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780/ea14a487-39c3-488b-b52b-998e57135487.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7172 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7242 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response 
focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.897 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7555 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489/02f74b6a-7f63-484e-a7c1-0c53bd801b87.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489/02f74b6a-7f63-484e-a7c1-0c53bd801b87.json deleted file mode 100644 index 462dccb75..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489/02f74b6a-7f63-484e-a7c1-0c53bd801b87.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_2__1__1743896489/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6813 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7137 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8644 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6781 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713/e492c59d-4b03-4dce-983e-a8724de35a60.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713/e492c59d-4b03-4dce-983e-a8724de35a60.json deleted file mode 100644 index 1f6674a88..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713/e492c59d-4b03-4dce-983e-a8724de35a60.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7209 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7116 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9067 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9172 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7414 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911/53de0394-8516-4882-b2bc-c7e62e3d8ef0.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911/53de0394-8516-4882-b2bc-c7e62e3d8ef0.json deleted file mode 100644 index f774c416d..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911/53de0394-8516-4882-b2bc-c7e62e3d8ef0.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911", - "id": "allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7266 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7347 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8933 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response 
focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.897 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7697 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412/56d4c1c5-5238-45dc-8331-64a14b830779.json b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412/56d4c1c5-5238-45dc-8331-64a14b830779.json deleted file mode 100644 index 41b8b3c61..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412/56d4c1c5-5238-45dc-8331-64a14b830779.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412", - "id": "allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5342 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6042 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5818 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922/7003c9d4-c758-4373-a7a3-04822978bf35.json b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922/7003c9d4-c758-4373-a7a3-04822978bf35.json deleted file mode 100644 index 19e1f2b82..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922/7003c9d4-c758-4373-a7a3-04822978bf35.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922", - "id": "allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6111 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6884 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8289 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7576 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4628 - }, - "source_data": { - "dataset_name": "RewardBench 2", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495/75a7dcb6-789c-49de-b209-4cf7d27465e4.json b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495/75a7dcb6-789c-49de-b209-4cf7d27465e4.json deleted file mode 100644 index 504f0108c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495/75a7dcb6-789c-49de-b209-4cf7d27465e4.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495", - "id": "allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5825 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6379 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5355 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": 
"Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7051 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4691 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507/e91d3910-4f20-4e82-b1fb-8605f5d2b8ac.json b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507/e91d3910-4f20-4e82-b1fb-8605f5d2b8ac.json deleted file mode 100644 index b50170dc8..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507/e91d3910-4f20-4e82-b1fb-8605f5d2b8ac.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507", - "id": "allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5598 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5495 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5902 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7273 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507/f18bfd44-3097-4eb8-a09c-2372c3ecd738.json b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507/f18bfd44-3097-4eb8-a09c-2372c3ecd738.json deleted file mode 100644 index 2ce4a6752..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507/f18bfd44-3097-4eb8-a09c-2372c3ecd738.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507", - "id": "allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6632 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7111 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - }, - "source_data": { - "dataset_name": "RewardBench 2", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917/9ca974b9-c5fb-4fc4-ab3e-1246e31ecdb2.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917/9ca974b9-c5fb-4fc4-ab3e-1246e31ecdb2.json deleted file mode 100644 index aef503803..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917/9ca974b9-c5fb-4fc4-ab3e-1246e31ecdb2.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917", - "id": "allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7185 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7158 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7933 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures 
response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8545 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961/fb1ab5e0-18db-4e5f-add3-2352d9a1f260.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961/fb1ab5e0-18db-4e5f-add3-2352d9a1f260.json deleted file mode 100644 index 3c5aab68c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961/fb1ab5e0-18db-4e5f-add3-2352d9a1f260.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961", - "id": "allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7325 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7474 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7158 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7978 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8141 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8763 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830/60ba1f0d-7e85-49e4-8c73-330d74de6707.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830/60ba1f0d-7e85-49e4-8c73-330d74de6707.json deleted file mode 100644 index 4f30313c2..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830/60ba1f0d-7e85-49e4-8c73-330d74de6707.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830", - "id": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6022 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7556 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7616 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5486 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024/29d1c194-8b87-466c-8701-e0fcf267665c.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024/29d1c194-8b87-466c-8701-e0fcf267665c.json deleted file mode 100644 index e82661177..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024/29d1c194-8b87-466c-8701-e0fcf267665c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024", - "id": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5948 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5579 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6776 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.7394 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5863 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914/31e8f616-7b64-4d1a-b395-20bf8bb4629c.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914/31e8f616-7b64-4d1a-b395-20bf8bb4629c.json deleted file mode 100644 index 0ba320e56..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914/31e8f616-7b64-4d1a-b395-20bf8bb4629c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914", - "id": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6492 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6776 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.699 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091/cc3f315d-3cea-47e4-83b4-b5045e778c5e.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091/cc3f315d-3cea-47e4-83b4-b5045e778c5e.json deleted file mode 100644 index 4d27acb60..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091/cc3f315d-3cea-47e4-83b4-b5045e778c5e.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091", - "id": "allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6764 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7074 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6885 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8622 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6984 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829/5d20dbf8-bb14-46af-adcd-b7ba05f8352c.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829/5d20dbf8-bb14-46af-adcd-b7ba05f8352c.json deleted file mode 100644 index 68ba40d15..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829/5d20dbf8-bb14-46af-adcd-b7ba05f8352c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829", - "id": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6408 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6337 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6831 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8467 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5529 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050/06f2cb33-3937-4fde-84e2-6b5467f051c6.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050/06f2cb33-3937-4fde-84e2-6b5467f051c6.json deleted file mode 100644 index 69c6a7c36..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050/06f2cb33-3937-4fde-84e2-6b5467f051c6.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050", - "id": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6452 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3187 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7158 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8356 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8343 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5603 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916/f35c4efa-3767-4a0e-8769-06230cda2512.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916/f35c4efa-3767-4a0e-8769-06230cda2512.json deleted file mode 100644 index 3817a0a83..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916/f35c4efa-3767-4a0e-8769-06230cda2512.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916", - "id": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7013 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7263 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6995 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8444 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7714 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576/6cb65d6a-6c46-4991-8154-f28b101954f6.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576/6cb65d6a-6c46-4991-8154-f28b101954f6.json deleted file mode 100644 index 05ff3f527..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576/6cb65d6a-6c46-4991-8154-f28b101954f6.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_3e-6_2__1__1743023576/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576", - "id": "allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6369 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6905 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3187 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7844 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.7596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6236 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619/6e15a49b-7dc4-4d69-965e-cb962c084e4a.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619/6e15a49b-7dc4-4d69-965e-cb962c084e4a.json deleted file mode 100644 index 2295c0011..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619/6e15a49b-7dc4-4d69-965e-cb962c084e4a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_3e-6_3__1__1743023619/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619", - "id": "allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6221 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety 
awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7978 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7455 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5852 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583/9f5591f4-751d-48d3-a348-4bb59f6bb1a3.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583/9f5591f4-751d-48d3-a348-4bb59f6bb1a3.json deleted file mode 100644 index afd3a2b0e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583/9f5591f4-751d-48d3-a348-4bb59f6bb1a3.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583", - "id": "allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5735 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5895 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6889 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6727 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5823 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604/b609c002-fa0a-46a8-b5a1-9213ee89606c.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604/b609c002-fa0a-46a8-b5a1-9213ee89606c.json deleted file mode 100644 index ce70a17a6..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604/b609c002-fa0a-46a8-b5a1-9213ee89606c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604", - "id": "allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6336 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6337 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6885 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7244 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6465 - }, - "source_data": { - "dataset_name": "RewardBench 2", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738/b147fc7f-0e31-49ca-abfd-ba990a925097.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738/b147fc7f-0e31-49ca-abfd-ba990a925097.json deleted file mode 100644 index 23d89ab11..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738/b147fc7f-0e31-49ca-abfd-ba990a925097.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738", - "id": "allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6824 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6989 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6831 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8311 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": 
"Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8081 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7107 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191/e4fbfe23-2b70-459e-821b-db0116d43d8c.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191/e4fbfe23-2b70-459e-821b-db0116d43d8c.json deleted file mode 100644 index b6f1f989c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191/e4fbfe23-2b70-459e-821b-db0116d43d8c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191", - "id": "allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6392 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6589 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6995 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7933 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7717 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5804 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737/2ab7dc14-af3e-4fb2-8c0c-fe0e14100321.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737/2ab7dc14-af3e-4fb2-8c0c-fe0e14100321.json deleted file mode 100644 index 9f16f2dae..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737/2ab7dc14-af3e-4fb2-8c0c-fe0e14100321.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737", - "id": "allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8133 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8061 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138/aca2c665-79f2-4226-b806-307be277ed08.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138/aca2c665-79f2-4226-b806-307be277ed08.json deleted file mode 100644 index d9c996fb0..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138/aca2c665-79f2-4226-b806-307be277ed08.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138", - "id": "allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6678 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6505 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6831 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7978 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": 
"Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8808 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6632 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455/d37a63df-6d38-4083-bf87-11064162efde.json b/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455/d37a63df-6d38-4083-bf87-11064162efde.json deleted file mode 100644 index 61f3caac6..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455/d37a63df-6d38-4083-bf87-11064162efde.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_tulu3_70b_1__8__1742924455/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455", - "id": "allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6618 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7958 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8311 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6323 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7311 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964/16e550cc-e59d-4aaa-b221-8cf71e1b26d2.json b/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964/16e550cc-e59d-4aaa-b221-8cf71e1b26d2.json deleted file mode 100644 index 2b5278d3d..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964/16e550cc-e59d-4aaa-b221-8cf71e1b26d2.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_tulu3_70b_2__8__1742982964/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964", - "id": "allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6605 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7789 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8844 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6195 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/tulu-2-dpo-13b/47058e2a-dc41-45f8-8c32-bc496a8d3bc5.json b/data/reward-bench/allenai/tulu-2-dpo-13b/47058e2a-dc41-45f8-8c32-bc496a8d3bc5.json deleted file mode 100644 index 16dc6eef6..000000000 --- a/data/reward-bench/allenai/tulu-2-dpo-13b/47058e2a-dc41-45f8-8c32-bc496a8d3bc5.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_tulu-2-dpo-13b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/tulu-2-dpo-13b", - "id": "allenai/tulu-2-dpo-13b", - "developer": "allenai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7368 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5833 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7946 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7323 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4947 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/tulu-2-dpo-70b/7199c8b3-8346-4200-b07e-4362ad13a7db.json 
b/data/reward-bench/allenai/tulu-2-dpo-70b/7199c8b3-8346-4200-b07e-4362ad13a7db.json deleted file mode 100644 index d68ff3ff8..000000000 --- a/data/reward-bench/allenai/tulu-2-dpo-70b/7199c8b3-8346-4200-b07e-4362ad13a7db.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_tulu-2-dpo-70b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/tulu-2-dpo-70b", - "id": "allenai/tulu-2-dpo-70b", - "developer": "allenai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7621 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9749 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6053 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8446 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7407 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/tulu-2-dpo-7b/de7e59d5-e2ce-4479-bbd9-ab9deb3beed3.json b/data/reward-bench/allenai/tulu-2-dpo-7b/de7e59d5-e2ce-4479-bbd9-ab9deb3beed3.json deleted file mode 100644 index bef43cd19..000000000 --- 
a/data/reward-bench/allenai/tulu-2-dpo-7b/de7e59d5-e2ce-4479-bbd9-ab9deb3beed3.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_tulu-2-dpo-7b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/tulu-2-dpo-7b", - "id": "allenai/tulu-2-dpo-7b", - "developer": "allenai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7212 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9749 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5614 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7527 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7176 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4774 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/tulu-v2.5-13b-preference-mix-rm/17e011c3-1a53-40ae-b7b4-cb24c23df3de.json b/data/reward-bench/allenai/tulu-v2.5-13b-preference-mix-rm/17e011c3-1a53-40ae-b7b4-cb24c23df3de.json deleted file mode 100644 index 15c29fb58..000000000 --- a/data/reward-bench/allenai/tulu-v2.5-13b-preference-mix-rm/17e011c3-1a53-40ae-b7b4-cb24c23df3de.json +++ /dev/null @@ 
-1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_tulu-v2.5-13b-preference-mix-rm/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/tulu-v2.5-13b-preference-mix-rm", - "id": "allenai/tulu-v2.5-13b-preference-mix-rm", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8027 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9358 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6724 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/tulu-v2.5-13b-uf-rm/1125dd05-2f0d-48ca-825c-f5efa18564aa.json b/data/reward-bench/allenai/tulu-v2.5-13b-uf-rm/1125dd05-2f0d-48ca-825c-f5efa18564aa.json deleted file mode 100644 index 817d26686..000000000 --- a/data/reward-bench/allenai/tulu-v2.5-13b-uf-rm/1125dd05-2f0d-48ca-825c-f5efa18564aa.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench/allenai_tulu-v2.5-13b-uf-rm/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/tulu-v2.5-13b-uf-rm", - "id": "allenai/tulu-v2.5-13b-uf-rm", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4806 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5554 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4737 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6326 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/tulu-v2.5-70b-preference-mix-rm/88014e0d-e89b-4fed-9eb6-5276bd7658df.json b/data/reward-bench/allenai/tulu-v2.5-70b-preference-mix-rm/88014e0d-e89b-4fed-9eb6-5276bd7658df.json deleted file mode 100644 index 9fc720998..000000000 --- a/data/reward-bench/allenai/tulu-v2.5-70b-preference-mix-rm/88014e0d-e89b-4fed-9eb6-5276bd7658df.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench/allenai_tulu-v2.5-70b-preference-mix-rm/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/tulu-v2.5-70b-preference-mix-rm", - "id": "allenai/tulu-v2.5-70b-preference-mix-rm", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6516 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7737 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5921 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8486 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6079 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/tulu-v2.5-70b-uf-rm/7cc9bfc2-570d-456c-918f-68fd4b711f05.json b/data/reward-bench/allenai/tulu-v2.5-70b-uf-rm/7cc9bfc2-570d-456c-918f-68fd4b711f05.json deleted file mode 100644 index b30d36361..000000000 --- a/data/reward-bench/allenai/tulu-v2.5-70b-uf-rm/7cc9bfc2-570d-456c-918f-68fd4b711f05.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_tulu-v2.5-70b-uf-rm/1766412838.146816", - 
"retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/tulu-v2.5-70b-uf-rm", - "id": "allenai/tulu-v2.5-70b-uf-rm", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7398 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8659 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7171 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7014 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/berkeley-nest/Starling-RM-7B-alpha/77b0957f-8779-4dbe-a6ea-cff50c4ee73b.json b/data/reward-bench/berkeley-nest/Starling-RM-7B-alpha/77b0957f-8779-4dbe-a6ea-cff50c4ee73b.json deleted file mode 100644 index 66d40e95d..000000000 --- a/data/reward-bench/berkeley-nest/Starling-RM-7B-alpha/77b0957f-8779-4dbe-a6ea-cff50c4ee73b.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/berkeley-nest_Starling-RM-7B-alpha/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - 
"source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "berkeley-nest/Starling-RM-7B-alpha", - "id": "berkeley-nest/Starling-RM-7B-alpha", - "developer": "berkeley-nest", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7113 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9804 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4561 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8446 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6794 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/facebook/Self-taught-Llama-3-70B/ba0ce7ce-a755-4337-bfec-0391680d3625.json b/data/reward-bench/facebook/Self-taught-Llama-3-70B/ba0ce7ce-a755-4337-bfec-0391680d3625.json deleted file mode 100644 index 96835a8ce..000000000 --- a/data/reward-bench/facebook/Self-taught-Llama-3-70B/ba0ce7ce-a755-4337-bfec-0391680d3625.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/facebook_Self-taught-Llama-3-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - 
"source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "facebook/Self-taught-Llama-3-70B", - "id": "facebook/Self-taught-Llama-3-70B", - "developer": "facebook", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8863 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8399 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9108 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8251 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/facebook/Self-taught-evaluator-llama3.1-70B/4eb460eb-b3ad-4e0d-b131-5b59ef54015c.json b/data/reward-bench/facebook/Self-taught-evaluator-llama3.1-70B/4eb460eb-b3ad-4e0d-b131-5b59ef54015c.json deleted file mode 100644 index a0b337292..000000000 --- a/data/reward-bench/facebook/Self-taught-evaluator-llama3.1-70B/4eb460eb-b3ad-4e0d-b131-5b59ef54015c.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/facebook_Self-taught-evaluator-llama3.1-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "facebook/Self-taught-evaluator-llama3.1-70B", - "id": "facebook/Self-taught-evaluator-llama3.1-70B", - "developer": "facebook", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - 
"evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9001 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8509 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8959 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8844 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/general-preference/GPM-Gemma-2B/6868a1e5-ee86-4f89-8452-5e939ac19169.json b/data/reward-bench/general-preference/GPM-Gemma-2B/6868a1e5-ee86-4f89-8452-5e939ac19169.json deleted file mode 100644 index 30c26da89..000000000 --- a/data/reward-bench/general-preference/GPM-Gemma-2B/6868a1e5-ee86-4f89-8452-5e939ac19169.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/general-preference_GPM-Gemma-2B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "general-preference/GPM-Gemma-2B", - "id": "general-preference/GPM-Gemma-2B", - "developer": "general-preference", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7449 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7151 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6974 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8122 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/general-preference/GPM-Llama-3.1-8B/4a151d43-5fac-4afe-9c23-ba0e86a60849.json b/data/reward-bench/general-preference/GPM-Llama-3.1-8B/4a151d43-5fac-4afe-9c23-ba0e86a60849.json deleted file mode 100644 index d66a7ae70..000000000 --- a/data/reward-bench/general-preference/GPM-Llama-3.1-8B/4a151d43-5fac-4afe-9c23-ba0e86a60849.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/general-preference_GPM-Llama-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "general-preference/GPM-Llama-3.1-8B", - "id": "general-preference/GPM-Llama-3.1-8B", - "developer": "general-preference", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9224 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.933 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.886 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9108 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9597 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/google/flame-1.0-24B-july-2024/5f16d574-adef-4016-abcf-9e7936771ba7.json b/data/reward-bench/google/flame-1.0-24B-july-2024/5f16d574-adef-4016-abcf-9e7936771ba7.json deleted file mode 100644 index 4f1439052..000000000 --- a/data/reward-bench/google/flame-1.0-24B-july-2024/5f16d574-adef-4016-abcf-9e7936771ba7.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/google_flame-1.0-24B-july-2024/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/flame-1.0-24B-july-2024", - "id": "google/flame-1.0-24B-july-2024", - "developer": "google", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8781 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9218 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7566 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8959 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", 
- "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-1.5-flash-001/f3e0300f-39ed-4cfd-bd03-218904836037.json b/data/reward-bench/google/gemini-1.5-flash-001/f3e0300f-39ed-4cfd-bd03-218904836037.json deleted file mode 100644 index 0c59d6494..000000000 --- a/data/reward-bench/google/gemini-1.5-flash-001/f3e0300f-39ed-4cfd-bd03-218904836037.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/google_gemini-1.5-flash-001/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-1.5-flash-001", - "id": "google/gemini-1.5-flash-001", - "developer": "google", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8054 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9218 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6349 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8696 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8512 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - 
"evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6937 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-1.5-flash-8b/42c82c00-b74e-4152-a222-15d481a13e0c.json b/data/reward-bench/google/gemini-1.5-flash-8b/42c82c00-b74e-4152-a222-15d481a13e0c.json deleted file mode 100644 index 11fe9a9c7..000000000 --- a/data/reward-bench/google/gemini-1.5-flash-8b/42c82c00-b74e-4152-a222-15d481a13e0c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/google_gemini-1.5-flash-8b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-1.5-flash-8b", - "id": "google/gemini-1.5-flash-8b", - "developer": "google", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4851 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6622 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response 
focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6747 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2421 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-1.5-pro-0514/68096be8-c49f-4a23-824e-1275248369f7.json b/data/reward-bench/google/gemini-1.5-pro-0514/68096be8-c49f-4a23-824e-1275248369f7.json deleted file mode 100644 index 1faa0442f..000000000 --- a/data/reward-bench/google/gemini-1.5-pro-0514/68096be8-c49f-4a23-824e-1275248369f7.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/google_gemini-1.5-pro-0514/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-1.5-pro-0514", - "id": "google/gemini-1.5-pro-0514", - "developer": "google", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9232 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8059 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8791 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.9199 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-1.5-pro-0924/c91270bd-3731-452a-b429-6cd4943d1194.json b/data/reward-bench/google/gemini-1.5-pro-0924/c91270bd-3731-452a-b429-6cd4943d1194.json deleted file mode 100644 index 2eb44d882..000000000 --- a/data/reward-bench/google/gemini-1.5-pro-0924/c91270bd-3731-452a-b429-6cd4943d1194.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/google_gemini-1.5-pro-0924/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-1.5-pro-0924", - "id": "google/gemini-1.5-pro-0924", - "developer": "google", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8678 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9413 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7697 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9022 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-2.5-flash-preview-04-17/337c7a43-46a7-4acb-b7f1-936e1f2cf46f.json b/data/reward-bench/google/gemini-2.5-flash-preview-04-17/337c7a43-46a7-4acb-b7f1-936e1f2cf46f.json deleted file mode 100644 index 8abf7e861..000000000 --- a/data/reward-bench/google/gemini-2.5-flash-preview-04-17/337c7a43-46a7-4acb-b7f1-936e1f2cf46f.json +++ 
/dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/google_gemini-2.5-flash-preview-04-17/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-2.5-flash-preview-04-17", - "id": "google/gemini-2.5-flash-preview-04-17", - "developer": "google", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7721 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6574 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5531 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8115 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9094 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8672 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8341 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-2.5-flash/3b00f881-8f73-4608-8cbb-846fe7d1cfea.json b/data/reward-bench/google/gemini-2.5-flash/3b00f881-8f73-4608-8cbb-846fe7d1cfea.json deleted file mode 100644 index 5a4a0577c..000000000 --- a/data/reward-bench/google/gemini-2.5-flash/3b00f881-8f73-4608-8cbb-846fe7d1cfea.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/google_gemini-2.5-flash/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-2.5-flash", - "id": "google/gemini-2.5-flash", - "developer": "google", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7767 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.575 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.909 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - 
"metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-2.5-pro-preview-05-06/2821dfdc-291b-405e-bd81-cf536c802885.json b/data/reward-bench/google/gemini-2.5-pro-preview-05-06/2821dfdc-291b-405e-bd81-cf536c802885.json deleted file mode 100644 index a4d4ee1dd..000000000 --- a/data/reward-bench/google/gemini-2.5-pro-preview-05-06/2821dfdc-291b-405e-bd81-cf536c802885.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/google_gemini-2.5-pro-preview-05-06/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-2.5-pro-preview-05-06", - "id": "google/gemini-2.5-pro-preview-05-06", - "developer": "google", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6775 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6532 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5342 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8806 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - 
"metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8308 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6973 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-2.5-pro/7d441240-7e85-4776-b51c-3c1bc84456ba.json b/data/reward-bench/google/gemini-2.5-pro/7d441240-7e85-4776-b51c-3c1bc84456ba.json deleted file mode 100644 index f67d63bbb..000000000 --- a/data/reward-bench/google/gemini-2.5-pro/7d441240-7e85-4776-b51c-3c1bc84456ba.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/google_gemini-2.5-pro/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-2.5-pro", - "id": "google/gemini-2.5-pro", - "developer": "google", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7948 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.619 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.881 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.805 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/google/gemma-2-27b-it/840d35d9-441e-4ba3-bbc3-1f4ff2627517.json b/data/reward-bench/google/gemma-2-27b-it/840d35d9-441e-4ba3-bbc3-1f4ff2627517.json deleted file mode 100644 index 5bc50f14c..000000000 --- a/data/reward-bench/google/gemma-2-27b-it/840d35d9-441e-4ba3-bbc3-1f4ff2627517.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/google_gemma-2-27b-it/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemma-2-27b-it", - "id": "google/gemma-2-27b-it", - "developer": "google", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9483 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8635 - }, - "source_data": { - 
"dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/0127f3c5-9657-4eb6-a77a-5a6476a8fc79.json b/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/0127f3c5-9657-4eb6-a77a-5a6476a8fc79.json deleted file mode 100644 index b2d697ddd..000000000 --- a/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/0127f3c5-9657-4eb6-a77a-5a6476a8fc79.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/hendrydong_Mistral-RM-for-RAFT-GSHF-v0/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "hendrydong/Mistral-RM-for-RAFT-GSHF-v0", - "id": "hendrydong/Mistral-RM-for-RAFT-GSHF-v0", - "developer": "hendrydong", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7847 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9832 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5789 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7434 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7508 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/b72e2988-75e4-4d26-9a47-daae4786b02f.json b/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/b72e2988-75e4-4d26-9a47-daae4786b02f.json deleted file mode 100644 index 53579af56..000000000 --- a/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/b72e2988-75e4-4d26-9a47-daae4786b02f.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/hendrydong_Mistral-RM-for-RAFT-GSHF-v0/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "hendrydong/Mistral-RM-for-RAFT-GSHF-v0", - "id": "hendrydong/Mistral-RM-for-RAFT-GSHF-v0", - "developer": "hendrydong", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5851 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5779 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6011 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6956 - }, - "source_data": { - "dataset_name": 
"RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6747 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5988 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/infly/INF-ORM-Llama3.1-70B/643cf5a3-8992-4126-87c9-814887314266.json b/data/reward-bench/infly/INF-ORM-Llama3.1-70B/643cf5a3-8992-4126-87c9-814887314266.json deleted file mode 100644 index 4bce79497..000000000 --- a/data/reward-bench/infly/INF-ORM-Llama3.1-70B/643cf5a3-8992-4126-87c9-814887314266.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/infly_INF-ORM-Llama3.1-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "infly/INF-ORM-Llama3.1-70B", - "id": "infly/INF-ORM-Llama3.1-70B", - "developer": "infly", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7648 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7411 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6995 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9644 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8622 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/infly/INF-ORM-Llama3.1-70B/f81f1f67-6506-481f-87ce-a17a6a7578f3.json b/data/reward-bench/infly/INF-ORM-Llama3.1-70B/f81f1f67-6506-481f-87ce-a17a6a7578f3.json deleted file mode 100644 index d0e17fefb..000000000 --- a/data/reward-bench/infly/INF-ORM-Llama3.1-70B/f81f1f67-6506-481f-87ce-a17a6a7578f3.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/infly_INF-ORM-Llama3.1-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - 
"source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "infly/INF-ORM-Llama3.1-70B", - "id": "infly/INF-ORM-Llama3.1-70B", - "developer": "infly", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9511 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9101 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9365 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9912 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/internlm/internlm2-1_8b-reward/32b35218-a099-410e-8a65-a0d6e2f380a6.json b/data/reward-bench/internlm/internlm2-1_8b-reward/32b35218-a099-410e-8a65-a0d6e2f380a6.json deleted file mode 100644 index 009a9841f..000000000 --- a/data/reward-bench/internlm/internlm2-1_8b-reward/32b35218-a099-410e-8a65-a0d6e2f380a6.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/internlm_internlm2-1_8b-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm/internlm2-1_8b-reward", - "id": "internlm/internlm2-1_8b-reward", - "developer": "internlm", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3902 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2758 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4426 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4711 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/internlm/internlm2-1_8b-reward/deec1e7c-0cb8-4e6f-b3ac-d37790b709f3.json b/data/reward-bench/internlm/internlm2-1_8b-reward/deec1e7c-0cb8-4e6f-b3ac-d37790b709f3.json deleted file mode 100644 index c3fbd28d8..000000000 --- a/data/reward-bench/internlm/internlm2-1_8b-reward/deec1e7c-0cb8-4e6f-b3ac-d37790b709f3.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/internlm_internlm2-1_8b-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": 
"RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm/internlm2-1_8b-reward", - "id": "internlm/internlm2-1_8b-reward", - "developer": "internlm", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8217 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9358 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6623 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8162 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8724 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/internlm/internlm2-20b-reward/e42a9986-4dcc-4017-be97-8135646c7424.json b/data/reward-bench/internlm/internlm2-20b-reward/e42a9986-4dcc-4017-be97-8135646c7424.json deleted file mode 100644 index 332851441..000000000 --- a/data/reward-bench/internlm/internlm2-20b-reward/e42a9986-4dcc-4017-be97-8135646c7424.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/internlm_internlm2-20b-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm/internlm2-20b-reward", - "id": "internlm/internlm2-20b-reward", - "developer": "internlm", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9016 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9888 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7654 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8946 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9576 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/internlm/internlm2-20b-reward/ffc92063-606a-4f31-bfdd-5683aa748ccc.json b/data/reward-bench/internlm/internlm2-20b-reward/ffc92063-606a-4f31-bfdd-5683aa748ccc.json deleted file mode 100644 index ceaeec27a..000000000 --- a/data/reward-bench/internlm/internlm2-20b-reward/ffc92063-606a-4f31-bfdd-5683aa748ccc.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/internlm_internlm2-20b-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm/internlm2-20b-reward", - "id": "internlm/internlm2-20b-reward", - "developer": "internlm", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5628 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5558 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5738 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6111 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7253 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5483 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/internlm/internlm2-7b-reward/23a5398c-0911-4a66-930d-abada12bf985.json b/data/reward-bench/internlm/internlm2-7b-reward/23a5398c-0911-4a66-930d-abada12bf985.json deleted file mode 100644 index 3c136cd53..000000000 --- a/data/reward-bench/internlm/internlm2-7b-reward/23a5398c-0911-4a66-930d-abada12bf985.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/internlm_internlm2-7b-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", 
- "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm/internlm2-7b-reward", - "id": "internlm/internlm2-7b-reward", - "developer": "internlm", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5335 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5628 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7051 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5164 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/internlm/internlm2-7b-reward/80b0bbcb-a57a-453c-8fff-502646520b1d.json b/data/reward-bench/internlm/internlm2-7b-reward/80b0bbcb-a57a-453c-8fff-502646520b1d.json deleted file mode 100644 index 
2273cb29a..000000000 --- a/data/reward-bench/internlm/internlm2-7b-reward/80b0bbcb-a57a-453c-8fff-502646520b1d.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/internlm_internlm2-7b-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm/internlm2-7b-reward", - "id": "internlm/internlm2-7b-reward", - "developer": "internlm", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8759 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9916 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6952 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8716 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9453 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/jondurbin/bagel-dpo-34b-v0.5/e383c939-b952-4fdd-94e3-eb3716691860.json b/data/reward-bench/jondurbin/bagel-dpo-34b-v0.5/e383c939-b952-4fdd-94e3-eb3716691860.json deleted file mode 100644 index 090fb16d8..000000000 --- a/data/reward-bench/jondurbin/bagel-dpo-34b-v0.5/e383c939-b952-4fdd-94e3-eb3716691860.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/jondurbin_bagel-dpo-34b-v0.5/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jondurbin/bagel-dpo-34b-v0.5", - "id": "jondurbin/bagel-dpo-34b-v0.5", - "developer": "jondurbin", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7215 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9385 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5504 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6446 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8889 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/llm-blender/PairRM-hf/daf873f9-ab03-49df-96cb-a0f5a8613048.json b/data/reward-bench/llm-blender/PairRM-hf/daf873f9-ab03-49df-96cb-a0f5a8613048.json deleted file mode 100644 index ed400926f..000000000 --- a/data/reward-bench/llm-blender/PairRM-hf/daf873f9-ab03-49df-96cb-a0f5a8613048.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/llm-blender_PairRM-hf/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llm-blender/PairRM-hf", - "id": "llm-blender/PairRM-hf", - 
"developer": "llm-blender", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6087 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9022 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.477 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4898 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6961 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/mattshumer/Reflection-70B/f4cff132-3b2f-4e03-bb49-098b16d87cef.json b/data/reward-bench/mattshumer/Reflection-70B/f4cff132-3b2f-4e03-bb49-098b16d87cef.json deleted file mode 100644 index 48058a174..000000000 --- a/data/reward-bench/mattshumer/Reflection-70B/f4cff132-3b2f-4e03-bb49-098b16d87cef.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/mattshumer_Reflection-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mattshumer/Reflection-70B", - "id": "mattshumer/Reflection-70B", - "developer": "mattshumer", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": 
[ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8422 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9749 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7061 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8318 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8562 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3-70B-Instruct/f80685de-058c-4ab8-aa35-dc7321d1cea6.json b/data/reward-bench/meta-llama/Meta-Llama-3-70B-Instruct/f80685de-058c-4ab8-aa35-dc7321d1cea6.json deleted file mode 100644 index d29586253..000000000 --- a/data/reward-bench/meta-llama/Meta-Llama-3-70B-Instruct/f80685de-058c-4ab8-aa35-dc7321d1cea6.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3-70B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-llama/Meta-Llama-3-70B-Instruct", - "id": "meta-llama/Meta-Llama-3-70B-Instruct", - "developer": "meta-llama", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7627 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - 
includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9763 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5888 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7297 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7854 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7035 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3-8B-Instruct/c8e4349d-a084-4eb5-990f-403ba930a9ad.json b/data/reward-bench/meta-llama/Meta-Llama-3-8B-Instruct/c8e4349d-a084-4eb5-990f-403ba930a9ad.json deleted file mode 100644 index 21b28918d..000000000 --- a/data/reward-bench/meta-llama/Meta-Llama-3-8B-Instruct/c8e4349d-a084-4eb5-990f-403ba930a9ad.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3-8B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-llama/Meta-Llama-3-8B-Instruct", - "id": "meta-llama/Meta-Llama-3-8B-Instruct", - "developer": "meta-llama", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8547 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4156 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6797 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6482 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6082 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo/729ca9c0-0680-49f1-97b9-5581be17a352.json b/data/reward-bench/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo/729ca9c0-0680-49f1-97b9-5581be17a352.json deleted file mode 100644 index 2f6cdcbbc..000000000 --- a/data/reward-bench/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo/729ca9c0-0680-49f1-97b9-5581be17a352.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3.1-405B-Instruct-Turbo/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", - "id": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", - "developer": "meta-llama", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8412 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9721 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7456 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8715 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/fdd4add5-b44d-46f9-8c98-da3120df4161.json b/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/fdd4add5-b44d-46f9-8c98-da3120df4161.json deleted file mode 100644 index 1181fd3d1..000000000 --- a/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/fdd4add5-b44d-46f9-8c98-da3120df4161.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3.1-70B-Instruct-Turbo/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", - "id": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", - "developer": "meta-llama", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7808 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6689 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7507 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct/6b5ef643-30dd-4381-b66f-e9ecd6b0d06e.json b/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct/6b5ef643-30dd-4381-b66f-e9ecd6b0d06e.json deleted file mode 100644 index 2f41e3adc..000000000 --- a/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct/6b5ef643-30dd-4381-b66f-e9ecd6b0d06e.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3.1-70B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "id": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "developer": "meta-llama", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8405 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9721 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7018 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.8284 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8599 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo/95271b8c-4135-48bf-bbad-ae94baa37640.json b/data/reward-bench/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo/95271b8c-4135-48bf-bbad-ae94baa37640.json deleted file mode 100644 index 47e395707..000000000 --- a/data/reward-bench/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo/95271b8c-4135-48bf-bbad-ae94baa37640.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3.1-8B-Instruct-Turbo/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "id": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "developer": "meta-llama", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6565 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8073 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6399 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.6811 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/meta-metrics/MetaMetrics-RM-v1.0/f437e790-efe1-4dc5-8ccc-5b0bfd800069.json b/data/reward-bench/meta-metrics/MetaMetrics-RM-v1.0/f437e790-efe1-4dc5-8ccc-5b0bfd800069.json deleted file mode 100644 index 5a185daab..000000000 --- a/data/reward-bench/meta-metrics/MetaMetrics-RM-v1.0/f437e790-efe1-4dc5-8ccc-5b0bfd800069.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/meta-metrics_MetaMetrics-RM-v1.0/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-metrics/MetaMetrics-RM-v1.0", - "id": "meta-metrics/MetaMetrics-RM-v1.0", - "developer": "meta-metrics", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9342 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9832 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9081 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9816 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/mightbe/Better-PairRM/7d0f761a-2650-4029-b1e9-13af2f0cc69d.json b/data/reward-bench/mightbe/Better-PairRM/7d0f761a-2650-4029-b1e9-13af2f0cc69d.json deleted file mode 100644 index 46baee169..000000000 --- a/data/reward-bench/mightbe/Better-PairRM/7d0f761a-2650-4029-b1e9-13af2f0cc69d.json +++ /dev/null @@ -1,130 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/mightbe_Better-PairRM/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mightbe/Better-PairRM", - "id": "mightbe/Better-PairRM", - "developer": "mightbe", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9553 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3925 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8203 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/mistralai/Mixtral-8x7B-Instruct-v0.1/49fc601e-4ac6-4672-a53d-0e89f19959c1.json b/data/reward-bench/mistralai/Mixtral-8x7B-Instruct-v0.1/49fc601e-4ac6-4672-a53d-0e89f19959c1.json deleted file mode 100644 index 83daa1359..000000000 --- a/data/reward-bench/mistralai/Mixtral-8x7B-Instruct-v0.1/49fc601e-4ac6-4672-a53d-0e89f19959c1.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench/mistralai_Mixtral-8x7B-Instruct-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "id": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "developer": "mistralai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7455 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9497 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6404 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7257 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7872 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5033 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/my_model/6195e81a-d5a5-40af-96f6-259252009ad7.json b/data/reward-bench/my_model/6195e81a-d5a5-40af-96f6-259252009ad7.json deleted file mode 100644 index 366e80763..000000000 --- a/data/reward-bench/my_model/6195e81a-d5a5-40af-96f6-259252009ad7.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/my_model_/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - 
"source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "my_model/", - "id": "my_model/", - "developer": "my_model", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5267 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4553 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5592 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6532 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/nicolinho/QRM-Gemma-2-27B/2dec0f50-d374-4af3-9d27-80fcf50dac2c.json b/data/reward-bench/nicolinho/QRM-Gemma-2-27B/2dec0f50-d374-4af3-9d27-80fcf50dac2c.json deleted file mode 100644 index e2fe3a9e8..000000000 --- a/data/reward-bench/nicolinho/QRM-Gemma-2-27B/2dec0f50-d374-4af3-9d27-80fcf50dac2c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/nicolinho_QRM-Gemma-2-27B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nicolinho/QRM-Gemma-2-27B", - "id": "nicolinho/QRM-Gemma-2-27B", - "developer": "nicolinho", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7853 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3719 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6995 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9578 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9535 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8321 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/nicolinho/QRM-Gemma-2-27B/96722888-0cc9-4dfd-b38d-91f4118c0be2.json b/data/reward-bench/nicolinho/QRM-Gemma-2-27B/96722888-0cc9-4dfd-b38d-91f4118c0be2.json deleted file mode 100644 index d61d3e0af..000000000 --- a/data/reward-bench/nicolinho/QRM-Gemma-2-27B/96722888-0cc9-4dfd-b38d-91f4118c0be2.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/nicolinho_QRM-Gemma-2-27B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - 
"source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nicolinho/QRM-Gemma-2-27B", - "id": "nicolinho/QRM-Gemma-2-27B", - "developer": "nicolinho", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9444 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9013 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9826 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/nicolinho/QRM-Llama3-8B/683abc2a-fce0-4d3d-bdcc-5cac2c76a46a.json b/data/reward-bench/nicolinho/QRM-Llama3-8B/683abc2a-fce0-4d3d-bdcc-5cac2c76a46a.json deleted file mode 100644 index 52e654851..000000000 --- a/data/reward-bench/nicolinho/QRM-Llama3-8B/683abc2a-fce0-4d3d-bdcc-5cac2c76a46a.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/nicolinho_QRM-Llama3-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nicolinho/QRM-Llama3-8B", - "id": "nicolinho/QRM-Llama3-8B", - "developer": "nicolinho", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8114 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8986 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9758 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/121344ec-61ef-49c5-a74b-b86f605d513e.json b/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/121344ec-61ef-49c5-a74b-b86f605d513e.json deleted file mode 100644 index aef21f0ec..000000000 --- a/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/121344ec-61ef-49c5-a74b-b86f605d513e.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/nicolinho_QRM-Llama3.1-8B-v2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nicolinho/QRM-Llama3.1-8B-v2", - "id": "nicolinho/QRM-Llama3.1-8B-v2", - "developer": "nicolinho", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7074 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6653 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9467 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8909 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7234 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/8594f86b-a7f2-4046-a3a7-830d7ac20690.json b/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/8594f86b-a7f2-4046-a3a7-830d7ac20690.json deleted file mode 100644 index 45f32ccdc..000000000 --- a/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/8594f86b-a7f2-4046-a3a7-830d7ac20690.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/nicolinho_QRM-Llama3.1-8B-v2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - 
"source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nicolinho/QRM-Llama3.1-8B-v2", - "id": "nicolinho/QRM-Llama3.1-8B-v2", - "developer": "nicolinho", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9314 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8684 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9257 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9677 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/nicolinho/QRM-Llama3.1-8B/c0c5e5e1-801c-48fd-a994-a4a69c0b1213.json b/data/reward-bench/nicolinho/QRM-Llama3.1-8B/c0c5e5e1-801c-48fd-a994-a4a69c0b1213.json deleted file mode 100644 index d0517b2db..000000000 --- a/data/reward-bench/nicolinho/QRM-Llama3.1-8B/c0c5e5e1-801c-48fd-a994-a4a69c0b1213.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/nicolinho_QRM-Llama3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nicolinho/QRM-Llama3.1-8B", - "id": "nicolinho/QRM-Llama3.1-8B", - "developer": "nicolinho", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9306 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9441 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8969 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9583 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/nvidia/Llama-3.1-Nemotron-70B-Reward/0411ac30-1536-4639-8350-fc11d53298e3.json b/data/reward-bench/nvidia/Llama-3.1-Nemotron-70B-Reward/0411ac30-1536-4639-8350-fc11d53298e3.json deleted file mode 100644 index 639ea033b..000000000 --- a/data/reward-bench/nvidia/Llama-3.1-Nemotron-70B-Reward/0411ac30-1536-4639-8350-fc11d53298e3.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/nvidia_Llama-3.1-Nemotron-70B-Reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nvidia/Llama-3.1-Nemotron-70B-Reward", - "id": "nvidia/Llama-3.1-Nemotron-70B-Reward", - "developer": "nvidia", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9411 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": 
{ - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9749 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8575 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9514 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9807 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/nvidia/Llama3-70B-SteerLM-RM/92281e58-4160-4d76-9119-b38fb47ffd8f.json b/data/reward-bench/nvidia/Llama3-70B-SteerLM-RM/92281e58-4160-4d76-9119-b38fb47ffd8f.json deleted file mode 100644 index 34cb0b116..000000000 --- a/data/reward-bench/nvidia/Llama3-70B-SteerLM-RM/92281e58-4160-4d76-9119-b38fb47ffd8f.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/nvidia_Llama3-70B-SteerLM-RM/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nvidia/Llama3-70B-SteerLM-RM", - "id": "nvidia/Llama3-70B-SteerLM-RM", - "developer": "nvidia", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8877 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9134 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8026 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9284 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9064 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/nvidia/Nemotron-4-340B-Reward/43687871-2e19-4d2b-9754-1cb6527496c1.json b/data/reward-bench/nvidia/Nemotron-4-340B-Reward/43687871-2e19-4d2b-9754-1cb6527496c1.json deleted file mode 100644 index 81a8ec028..000000000 --- a/data/reward-bench/nvidia/Nemotron-4-340B-Reward/43687871-2e19-4d2b-9754-1cb6527496c1.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/nvidia_Nemotron-4-340B-Reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nvidia/Nemotron-4-340B-Reward", - "id": "nvidia/Nemotron-4-340B-Reward", - "developer": "nvidia", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8706 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9149 - }, - "source_data": { - 
"dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9363 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-3.5-turbo-0125/1debe1de-b394-4856-a946-9d14bd867bf6.json b/data/reward-bench/openai/gpt-3.5-turbo-0125/1debe1de-b394-4856-a946-9d14bd867bf6.json deleted file mode 100644 index 53333618b..000000000 --- a/data/reward-bench/openai/gpt-3.5-turbo-0125/1debe1de-b394-4856-a946-9d14bd867bf6.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openai_gpt-3.5-turbo-0125/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-3.5-turbo-0125", - "id": "openai/gpt-3.5-turbo-0125", - "developer": "openai", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6534 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9218 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6547 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5912 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior 
Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6548 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4-0125-preview/80c589d2-c1eb-4dcf-8be8-042f4f66b7eb.json b/data/reward-bench/openai/gpt-4-0125-preview/80c589d2-c1eb-4dcf-8be8-042f4f66b7eb.json deleted file mode 100644 index 24b1269d5..000000000 --- a/data/reward-bench/openai/gpt-4-0125-preview/80c589d2-c1eb-4dcf-8be8-042f4f66b7eb.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openai_gpt-4-0125-preview/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4-0125-preview", - "id": "openai/gpt-4-0125-preview", - "developer": "openai", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8434 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7434 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8692 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7085 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4-turbo-2024-04-09/62478772-bb85-4d3f-a916-c3d17db3ee61.json b/data/reward-bench/openai/gpt-4-turbo-2024-04-09/62478772-bb85-4d3f-a916-c3d17db3ee61.json deleted file mode 100644 index 8fbd65118..000000000 --- a/data/reward-bench/openai/gpt-4-turbo-2024-04-09/62478772-bb85-4d3f-a916-c3d17db3ee61.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openai_gpt-4-turbo-2024-04-09/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4-turbo-2024-04-09", - "id": "openai/gpt-4-turbo-2024-04-09", - "developer": "openai", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8395 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7544 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.7363 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4.1-2025-04-14/a070bae2-c927-418b-91cc-161781c4f5b7.json b/data/reward-bench/openai/gpt-4.1-2025-04-14/a070bae2-c927-418b-91cc-161781c4f5b7.json deleted file mode 100644 index 7ad659dc4..000000000 --- a/data/reward-bench/openai/gpt-4.1-2025-04-14/a070bae2-c927-418b-91cc-161781c4f5b7.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/openai_gpt-4.1-2025-04-14/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4.1-2025-04-14", - "id": "openai/gpt-4.1-2025-04-14", - "developer": "openai", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7232 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8289 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3974 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6521 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8726 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7338 - }, - "source_data": { - "dataset_name": 
"RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8542 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4.1-mini-2025-04-14/b884c919-a272-4f67-9a09-3d232f56d083.json b/data/reward-bench/openai/gpt-4.1-mini-2025-04-14/b884c919-a272-4f67-9a09-3d232f56d083.json deleted file mode 100644 index fe081bf47..000000000 --- a/data/reward-bench/openai/gpt-4.1-mini-2025-04-14/b884c919-a272-4f67-9a09-3d232f56d083.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/openai_gpt-4.1-mini-2025-04-14/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4.1-mini-2025-04-14", - "id": "openai/gpt-4.1-mini-2025-04-14", - "developer": "openai", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6573 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7213 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7265 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7354 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4.1-nano-2025-04-14/deac33dd-187b-4406-a76a-b33caf417380.json b/data/reward-bench/openai/gpt-4.1-nano-2025-04-14/deac33dd-187b-4406-a76a-b33caf417380.json deleted file mode 100644 index 9236ca4a4..000000000 --- a/data/reward-bench/openai/gpt-4.1-nano-2025-04-14/deac33dd-187b-4406-a76a-b33caf417380.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/openai_gpt-4.1-nano-2025-04-14/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4.1-nano-2025-04-14", - "id": "openai/gpt-4.1-nano-2025-04-14", - "developer": "openai", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4849 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4646 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2578 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5041 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7156 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.466 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5015 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4o-2024-05-13/185bd742-d7d4-4600-86bd-bcda75ed2ebc.json b/data/reward-bench/openai/gpt-4o-2024-05-13/185bd742-d7d4-4600-86bd-bcda75ed2ebc.json deleted file mode 100644 index e598746ee..000000000 --- a/data/reward-bench/openai/gpt-4o-2024-05-13/185bd742-d7d4-4600-86bd-bcda75ed2ebc.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openai_gpt-4o-2024-05-13/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4o-2024-05-13", - "id": "openai/gpt-4o-2024-05-13", - "developer": "openai", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8327 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7039 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - 
includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8649 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8487 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7262 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4o-2024-08-06/901e4de6-3ef6-4c2a-873c-cdcc47201974.json b/data/reward-bench/openai/gpt-4o-2024-08-06/901e4de6-3ef6-4c2a-873c-cdcc47201974.json deleted file mode 100644 index 92a4b0914..000000000 --- a/data/reward-bench/openai/gpt-4o-2024-08-06/901e4de6-3ef6-4c2a-873c-cdcc47201974.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openai_gpt-4o-2024-08-06/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4o-2024-08-06", - "id": "openai/gpt-4o-2024-08-06", - "developer": "openai", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8673 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9609 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.8811 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8661 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4o-2024-08-06/a051d5d6-18e6-483d-a000-4a52a06de676.json b/data/reward-bench/openai/gpt-4o-2024-08-06/a051d5d6-18e6-483d-a000-4a52a06de676.json deleted file mode 100644 index 44c5bcc27..000000000 --- a/data/reward-bench/openai/gpt-4o-2024-08-06/a051d5d6-18e6-483d-a000-4a52a06de676.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/openai_gpt-4o-2024-08-06/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4o-2024-08-06", - "id": "openai/gpt-4o-2024-08-06", - "developer": "openai", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6493 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5684 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8619 - }, - "source_data": { - "dataset_name": 
"RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7293 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7819 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4o-mini-2024-07-18/94d77182-8952-4a63-b02b-3d8bd8a8dead.json b/data/reward-bench/openai/gpt-4o-mini-2024-07-18/94d77182-8952-4a63-b02b-3d8bd8a8dead.json deleted file mode 100644 index 653bf1c3b..000000000 --- a/data/reward-bench/openai/gpt-4o-mini-2024-07-18/94d77182-8952-4a63-b02b-3d8bd8a8dead.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openai_gpt-4o-mini-2024-07-18/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4o-mini-2024-07-18", - "id": "openai/gpt-4o-mini-2024-07-18", - "developer": "openai", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8007 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9497 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6075 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8081 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - 
"evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8374 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4o-mini-2024-07-18/9a48d808-0280-4175-a28a-7e9ba8ac6deb.json b/data/reward-bench/openai/gpt-4o-mini-2024-07-18/9a48d808-0280-4175-a28a-7e9ba8ac6deb.json deleted file mode 100644 index 4e0668e59..000000000 --- a/data/reward-bench/openai/gpt-4o-mini-2024-07-18/9a48d808-0280-4175-a28a-7e9ba8ac6deb.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/openai_gpt-4o-mini-2024-07-18/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4o-mini-2024-07-18", - "id": "openai/gpt-4o-mini-2024-07-18", - "developer": "openai", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5796 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - 
"metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7414 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6962 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/openbmb/Eurus-7b-kto/f0d9f57d-d552-44ea-a91c-751854133316.json b/data/reward-bench/openbmb/Eurus-7b-kto/f0d9f57d-d552-44ea-a91c-751854133316.json deleted file mode 100644 index f4c92b674..000000000 --- a/data/reward-bench/openbmb/Eurus-7b-kto/f0d9f57d-d552-44ea-a91c-751854133316.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openbmb_Eurus-7b-kto/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbmb/Eurus-7b-kto", - "id": "openbmb/Eurus-7b-kto", - "developer": "openbmb", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5373 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6054 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.7467 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5261 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openbmb/Eurus-RM-7b/561cfba1-856d-4809-b5c7-41481735e1d6.json b/data/reward-bench/openbmb/Eurus-RM-7b/561cfba1-856d-4809-b5c7-41481735e1d6.json deleted file mode 100644 index 2ceca85c7..000000000 --- a/data/reward-bench/openbmb/Eurus-RM-7b/561cfba1-856d-4809-b5c7-41481735e1d6.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/openbmb_Eurus-RM-7b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbmb/Eurus-RM-7b", - "id": "openbmb/Eurus-RM-7b", - "developer": "openbmb", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5806 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5683 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6267 - }, - "source_data": { - 
"dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7475 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5972 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/openbmb/Eurus-RM-7b/995d1caf-b735-44dd-adff-875e3203aa46.json b/data/reward-bench/openbmb/Eurus-RM-7b/995d1caf-b735-44dd-adff-875e3203aa46.json deleted file mode 100644 index 83d393244..000000000 --- a/data/reward-bench/openbmb/Eurus-RM-7b/995d1caf-b735-44dd-adff-875e3203aa46.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openbmb_Eurus-RM-7b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbmb/Eurus-RM-7b", - "id": "openbmb/Eurus-RM-7b", - "developer": "openbmb", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8159 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9804 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8135 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8633 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7172 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openbmb/MiniCPM-2B-dpo-fp32/81767043-23c2-4229-b3b5-1c24e470d52a.json b/data/reward-bench/openbmb/MiniCPM-2B-dpo-fp32/81767043-23c2-4229-b3b5-1c24e470d52a.json deleted file mode 100644 index 94f6c97ac..000000000 --- a/data/reward-bench/openbmb/MiniCPM-2B-dpo-fp32/81767043-23c2-4229-b3b5-1c24e470d52a.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openbmb_MiniCPM-2B-dpo-fp32/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbmb/MiniCPM-2B-dpo-fp32", - "id": "openbmb/MiniCPM-2B-dpo-fp32", - "developer": "openbmb", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - 
"evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8911 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4934 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.573 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8233 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4958 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openbmb/UltraRM-13b/4f6344bc-af30-46f9-b6f8-41ff925d064e.json b/data/reward-bench/openbmb/UltraRM-13b/4f6344bc-af30-46f9-b6f8-41ff925d064e.json deleted file mode 100644 index 2c68b9ca5..000000000 --- a/data/reward-bench/openbmb/UltraRM-13b/4f6344bc-af30-46f9-b6f8-41ff925d064e.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openbmb_UltraRM-13b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbmb/UltraRM-13b", - "id": "openbmb/UltraRM-13b", - "developer": "openbmb", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6903 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5548 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5986 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6244 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7294 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openbmb/UltraRM-13b/abac8640-40be-4eb5-9035-2bf6fd436a7a.json b/data/reward-bench/openbmb/UltraRM-13b/abac8640-40be-4eb5-9035-2bf6fd436a7a.json deleted file mode 100644 index 0b4c10b89..000000000 --- a/data/reward-bench/openbmb/UltraRM-13b/abac8640-40be-4eb5-9035-2bf6fd436a7a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/openbmb_UltraRM-13b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbmb/UltraRM-13b", - "id": "openbmb/UltraRM-13b", - "developer": "openbmb", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4683 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5519 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5089 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6081 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3036 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/opencompass/CompassJudger-1-1.5B-Instruct/6fd972ab-c45f-4ccd-a5cf-4aac5e703342.json b/data/reward-bench/opencompass/CompassJudger-1-1.5B-Instruct/6fd972ab-c45f-4ccd-a5cf-4aac5e703342.json deleted file mode 100644 index 4c9d57828..000000000 --- a/data/reward-bench/opencompass/CompassJudger-1-1.5B-Instruct/6fd972ab-c45f-4ccd-a5cf-4aac5e703342.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/opencompass_CompassJudger-1-1.5B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - 
"source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "opencompass/CompassJudger-1-1.5B-Instruct", - "id": "opencompass/CompassJudger-1-1.5B-Instruct", - "developer": "opencompass", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7344 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4923 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7818 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6999 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/opencompass/CompassJudger-1-14B-Instruct/8eb1bcf2-a6bd-467c-bc37-090fdb7a9460.json b/data/reward-bench/opencompass/CompassJudger-1-14B-Instruct/8eb1bcf2-a6bd-467c-bc37-090fdb7a9460.json deleted file mode 100644 index 4299d154c..000000000 --- a/data/reward-bench/opencompass/CompassJudger-1-14B-Instruct/8eb1bcf2-a6bd-467c-bc37-090fdb7a9460.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/opencompass_CompassJudger-1-14B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "opencompass/CompassJudger-1-14B-Instruct", - "id": "opencompass/CompassJudger-1-14B-Instruct", - "developer": "opencompass", - "additional_details": { - "model_type": "Generative" - } - }, 
- "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8409 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9749 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6228 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8392 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9268 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/opencompass/CompassJudger-1-32B-Instruct/5ad53725-ed5a-41f3-8ff6-7404f3f981db.json b/data/reward-bench/opencompass/CompassJudger-1-32B-Instruct/5ad53725-ed5a-41f3-8ff6-7404f3f981db.json deleted file mode 100644 index 49134a927..000000000 --- a/data/reward-bench/opencompass/CompassJudger-1-32B-Instruct/5ad53725-ed5a-41f3-8ff6-7404f3f981db.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/opencompass_CompassJudger-1-32B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "opencompass/CompassJudger-1-32B-Instruct", - "id": "opencompass/CompassJudger-1-32B-Instruct", - "developer": "opencompass", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8522 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - 
"metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9804 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6513 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8527 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9244 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/opencompass/CompassJudger-1-7B-Instruct/ae2d05b4-5e80-4b00-af67-b94609b073eb.json b/data/reward-bench/opencompass/CompassJudger-1-7B-Instruct/ae2d05b4-5e80-4b00-af67-b94609b073eb.json deleted file mode 100644 index 145d5b3e9..000000000 --- a/data/reward-bench/opencompass/CompassJudger-1-7B-Instruct/ae2d05b4-5e80-4b00-af67-b94609b073eb.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/opencompass_CompassJudger-1-7B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "opencompass/CompassJudger-1-7B-Instruct", - "id": "opencompass/CompassJudger-1-7B-Instruct", - "developer": "opencompass", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8317 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9777 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard 
accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6096 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8446 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8948 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/prometheus-eval/prometheus-7b-v2.0/592f2811-c197-423e-89d4-e25ee5a324fb.json b/data/reward-bench/prometheus-eval/prometheus-7b-v2.0/592f2811-c197-423e-89d4-e25ee5a324fb.json deleted file mode 100644 index e934ef88d..000000000 --- a/data/reward-bench/prometheus-eval/prometheus-7b-v2.0/592f2811-c197-423e-89d4-e25ee5a324fb.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/prometheus-eval_prometheus-7b-v2.0/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "prometheus-eval/prometheus-7b-v2.0", - "id": "prometheus-eval/prometheus-7b-v2.0", - "developer": "prometheus-eval", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7204 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8547 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4912 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7709 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7648 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/prometheus-eval/prometheus-8x7b-v2.0/17795e7b-e912-440f-a80e-63233d3b6d8c.json b/data/reward-bench/prometheus-eval/prometheus-8x7b-v2.0/17795e7b-e912-440f-a80e-63233d3b6d8c.json deleted file mode 100644 index d2deb1a71..000000000 --- a/data/reward-bench/prometheus-eval/prometheus-8x7b-v2.0/17795e7b-e912-440f-a80e-63233d3b6d8c.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/prometheus-eval_prometheus-8x7b-v2.0/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "prometheus-eval/prometheus-8x7b-v2.0", - "id": "prometheus-eval/prometheus-8x7b-v2.0", - "developer": "prometheus-eval", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7451 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9302 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4715 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8047 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.774 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/375cf55f-64f6-42f6-a947-1487feffb196.json b/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/375cf55f-64f6-42f6-a947-1487feffb196.json deleted file mode 100644 index 494a96669..000000000 --- a/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/375cf55f-64f6-42f6-a947-1487feffb196.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sfairXC/FsfairX-LLaMA3-RM-v0.1", - "id": "sfairXC/FsfairX-LLaMA3-RM-v0.1", - "developer": "sfairXC", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8338 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9944 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6513 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8676 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8644 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7492 - }, - "source_data": { - "dataset_name": 
"RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/94d2eddd-f7db-4360-ac58-0af39ce66935.json b/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/94d2eddd-f7db-4360-ac58-0af39ce66935.json deleted file mode 100644 index 8dad45261..000000000 --- a/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/94d2eddd-f7db-4360-ac58-0af39ce66935.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sfairXC/FsfairX-LLaMA3-RM-v0.1", - "id": "sfairXC/FsfairX-LLaMA3-RM-v0.1", - "developer": "sfairXC", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6292 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5916 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7051 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6647 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/stabilityai/stable-code-instruct-3b/996ca604-e01c-4a95-9286-60b6dc04f67d.json b/data/reward-bench/stabilityai/stable-code-instruct-3b/996ca604-e01c-4a95-9286-60b6dc04f67d.json deleted file mode 100644 index 74acfeed3..000000000 --- a/data/reward-bench/stabilityai/stable-code-instruct-3b/996ca604-e01c-4a95-9286-60b6dc04f67d.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/stabilityai_stable-code-instruct-3b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stabilityai/stable-code-instruct-3b", - "id": "stabilityai/stable-code-instruct-3b", - "developer": "stabilityai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6216 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5782 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5855 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6554 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7528 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior 
Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4506 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/stabilityai/stablelm-2-12b-chat/b6f0089f-d04b-4bcd-be84-ce3bc0d6c2b9.json b/data/reward-bench/stabilityai/stablelm-2-12b-chat/b6f0089f-d04b-4bcd-be84-ce3bc0d6c2b9.json deleted file mode 100644 index 491861cbd..000000000 --- a/data/reward-bench/stabilityai/stablelm-2-12b-chat/b6f0089f-d04b-4bcd-be84-ce3bc0d6c2b9.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/stabilityai_stablelm-2-12b-chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stabilityai/stablelm-2-12b-chat", - "id": "stabilityai/stablelm-2-12b-chat", - "developer": "stabilityai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7642 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5548 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7811 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8945 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score 
(weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4839 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/stabilityai/stablelm-2-zephyr-1_6b/83e15cba-4fec-48f2-9be4-78decbd96f66.json b/data/reward-bench/stabilityai/stablelm-2-zephyr-1_6b/83e15cba-4fec-48f2-9be4-78decbd96f66.json deleted file mode 100644 index 3406eee45..000000000 --- a/data/reward-bench/stabilityai/stablelm-2-zephyr-1_6b/83e15cba-4fec-48f2-9be4-78decbd96f66.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/stabilityai_stablelm-2-zephyr-1_6b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stabilityai/stablelm-2-zephyr-1_6b", - "id": "stabilityai/stablelm-2-zephyr-1_6b", - "developer": "stabilityai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6574 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4671 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6027 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6784 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4868 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/stabilityai/stablelm-zephyr-3b/493617c0-37eb-4c83-b175-2507a3647b5e.json b/data/reward-bench/stabilityai/stablelm-zephyr-3b/493617c0-37eb-4c83-b175-2507a3647b5e.json deleted file mode 100644 index 47f135b1d..000000000 --- a/data/reward-bench/stabilityai/stablelm-zephyr-3b/493617c0-37eb-4c83-b175-2507a3647b5e.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/stabilityai_stablelm-zephyr-3b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stabilityai/stablelm-zephyr-3b", - "id": "stabilityai/stablelm-zephyr-3b", - "developer": "stabilityai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7146 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8631 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6009 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7405 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7573 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.5075 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-large/97f494ce-3c9c-4a19-a237-d458be611a0a.json b/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-large/97f494ce-3c9c-4a19-a237-d458be611a0a.json deleted file mode 100644 index 47e44c557..000000000 --- a/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-large/97f494ce-3c9c-4a19-a237-d458be611a0a.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/stanfordnlp_SteamSHP-flan-t5-large/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stanfordnlp/SteamSHP-flan-t5-large", - "id": "stanfordnlp/SteamSHP-flan-t5-large", - "developer": "stanfordnlp", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8575 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3311 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3743 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6273 - }, - "source_data": { - "dataset_name": 
"RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-xl/f8bf1e92-3cc3-4c7e-9770-485a3074e85f.json b/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-xl/f8bf1e92-3cc3-4c7e-9770-485a3074e85f.json deleted file mode 100644 index 6a0de9161..000000000 --- a/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-xl/f8bf1e92-3cc3-4c7e-9770-485a3074e85f.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/stanfordnlp_SteamSHP-flan-t5-xl/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stanfordnlp/SteamSHP-flan-t5-xl", - "id": "stanfordnlp/SteamSHP-flan-t5-xl", - "developer": "stanfordnlp", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5135 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8547 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3841 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6498 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/unknown/Cohere March 2024/5bf73fba-520f-4a2f-9296-8240847eb8ec.json b/data/reward-bench/unknown/Cohere March 2024/5bf73fba-520f-4a2f-9296-8240847eb8ec.json deleted file mode 100644 index b2e8c1248..000000000 --- a/data/reward-bench/unknown/Cohere March 2024/5bf73fba-520f-4a2f-9296-8240847eb8ec.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Cohere March 2024/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere March 2024", - "id": "Cohere March 2024", - "developer": "unknown", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8511 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9469 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6513 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9817 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7458 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/unknown/Cohere May 2024/3dd2c89f-64f5-4bbc-a621-791a9f0538b2.json 
b/data/reward-bench/unknown/Cohere May 2024/3dd2c89f-64f5-4bbc-a621-791a9f0538b2.json deleted file mode 100644 index a50e32313..000000000 --- a/data/reward-bench/unknown/Cohere May 2024/3dd2c89f-64f5-4bbc-a621-791a9f0538b2.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Cohere May 2024/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere May 2024", - "id": "Cohere May 2024", - "developer": "unknown", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8816 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7127 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9768 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/unknown/gemini-1.5-flash-8b/ef987556-7277-48d8-ac07-532586773a3a.json b/data/reward-bench/unknown/gemini-1.5-flash-8b/ef987556-7277-48d8-ac07-532586773a3a.json deleted file mode 100644 index dd83d6018..000000000 --- 
a/data/reward-bench/unknown/gemini-1.5-flash-8b/ef987556-7277-48d8-ac07-532586773a3a.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/gemini-1.5-flash-8b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemini-1.5-flash-8b", - "id": "gemini-1.5-flash-8b", - "developer": "unknown", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7601 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9441 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5987 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7399 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7575 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/upstage/SOLAR-10.7B-Instruct-v1.0/add7eddb-7a8b-4c78-9864-c4316a97ce5e.json b/data/reward-bench/upstage/SOLAR-10.7B-Instruct-v1.0/add7eddb-7a8b-4c78-9864-c4316a97ce5e.json deleted file mode 100644 index 5df4ce7b8..000000000 --- a/data/reward-bench/upstage/SOLAR-10.7B-Instruct-v1.0/add7eddb-7a8b-4c78-9864-c4316a97ce5e.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/upstage_SOLAR-10.7B-Instruct-v1.0/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { 
- "name": "upstage/SOLAR-10.7B-Instruct-v1.0", - "id": "upstage/SOLAR-10.7B-Instruct-v1.0", - "developer": "upstage", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7391 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8156 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6864 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8514 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7252 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4949 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/wenbopan/Faro-Yi-9B-DPO/caf02954-1eed-44eb-b5f4-df47c90828d7.json b/data/reward-bench/wenbopan/Faro-Yi-9B-DPO/caf02954-1eed-44eb-b5f4-df47c90828d7.json deleted file mode 100644 index 78244f3ff..000000000 --- a/data/reward-bench/wenbopan/Faro-Yi-9B-DPO/caf02954-1eed-44eb-b5f4-df47c90828d7.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/wenbopan_Faro-Yi-9B-DPO/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wenbopan/Faro-Yi-9B-DPO", - "id": "wenbopan/Faro-Yi-9B-DPO", - "developer": "wenbopan", - "additional_details": 
{ - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6461 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9218 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5307 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5514 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5839 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6395 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/RM-Gemma-2B/00798930-daa2-4e79-82c6-2cccf1c3a0cb.json b/data/reward-bench/weqweasdas/RM-Gemma-2B/00798930-daa2-4e79-82c6-2cccf1c3a0cb.json deleted file mode 100644 index 8856c5e7a..000000000 --- a/data/reward-bench/weqweasdas/RM-Gemma-2B/00798930-daa2-4e79-82c6-2cccf1c3a0cb.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/weqweasdas_RM-Gemma-2B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/RM-Gemma-2B", - "id": "weqweasdas/RM-Gemma-2B", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6549 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9441 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4986 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6652 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/RM-Gemma-2B/71658cf8-0189-49dc-847f-b9a9b5faee4a.json b/data/reward-bench/weqweasdas/RM-Gemma-2B/71658cf8-0189-49dc-847f-b9a9b5faee4a.json deleted file mode 100644 index c3e2d4a3c..000000000 --- a/data/reward-bench/weqweasdas/RM-Gemma-2B/71658cf8-0189-49dc-847f-b9a9b5faee4a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/weqweasdas_RM-Gemma-2B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/RM-Gemma-2B", - "id": "weqweasdas/RM-Gemma-2B", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3705 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3311 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2343 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1851 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/RM-Gemma-7B-4096/3d506b91-5b0d-47e3-a3a0-bc09808bf5b5.json b/data/reward-bench/weqweasdas/RM-Gemma-7B-4096/3d506b91-5b0d-47e3-a3a0-bc09808bf5b5.json deleted file mode 100644 index 81934bf6a..000000000 --- a/data/reward-bench/weqweasdas/RM-Gemma-7B-4096/3d506b91-5b0d-47e3-a3a0-bc09808bf5b5.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/weqweasdas_RM-Gemma-7B-4096/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - 
"source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/RM-Gemma-7B-4096", - "id": "weqweasdas/RM-Gemma-7B-4096", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6922 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9497 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5022 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5608 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7511 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7024 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/RM-Gemma-7B/04c71231-2025-4e1a-b7ed-56b245868089.json b/data/reward-bench/weqweasdas/RM-Gemma-7B/04c71231-2025-4e1a-b7ed-56b245868089.json deleted file mode 100644 index 02ba525c2..000000000 --- a/data/reward-bench/weqweasdas/RM-Gemma-7B/04c71231-2025-4e1a-b7ed-56b245868089.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/weqweasdas_RM-Gemma-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": 
"https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/RM-Gemma-7B", - "id": "weqweasdas/RM-Gemma-7B", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6967 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5784 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7362 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7069 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/RM-Gemma-7B/08b2edd0-f8e9-47cd-b19d-53fdc7209917.json b/data/reward-bench/weqweasdas/RM-Gemma-7B/08b2edd0-f8e9-47cd-b19d-53fdc7209917.json deleted file mode 100644 index ed03af39d..000000000 --- a/data/reward-bench/weqweasdas/RM-Gemma-7B/08b2edd0-f8e9-47cd-b19d-53fdc7209917.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/weqweasdas_RM-Gemma-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/RM-Gemma-7B", - 
"id": "weqweasdas/RM-Gemma-7B", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4826 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4822 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/RM-Mistral-7B/79a43841-4032-4a20-8b5a-83b4b446d107.json b/data/reward-bench/weqweasdas/RM-Mistral-7B/79a43841-4032-4a20-8b5a-83b4b446d107.json deleted file mode 100644 index 7abde633d..000000000 --- a/data/reward-bench/weqweasdas/RM-Mistral-7B/79a43841-4032-4a20-8b5a-83b4b446d107.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/weqweasdas_RM-Mistral-7B/1766412838.146816", - 
"retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/RM-Mistral-7B", - "id": "weqweasdas/RM-Mistral-7B", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7982 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6053 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8703 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7736 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/RM-Mistral-7B/a2c16ab8-1098-490a-8d0a-392d835427e0.json b/data/reward-bench/weqweasdas/RM-Mistral-7B/a2c16ab8-1098-490a-8d0a-392d835427e0.json deleted file mode 100644 index d53f1986e..000000000 --- a/data/reward-bench/weqweasdas/RM-Mistral-7B/a2c16ab8-1098-490a-8d0a-392d835427e0.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/weqweasdas_RM-Mistral-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": 
"documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/RM-Mistral-7B", - "id": "weqweasdas/RM-Mistral-7B", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6911 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7293 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6226 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/0aa12860-7ebe-49c2-a5af-1926d23e34f8.json b/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/0aa12860-7ebe-49c2-a5af-1926d23e34f8.json deleted file mode 100644 index 
dbe32c629..000000000 --- a/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/0aa12860-7ebe-49c2-a5af-1926d23e34f8.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/hh_rlhf_rm_open_llama_3b", - "id": "weqweasdas/hh_rlhf_rm_open_llama_3b", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5027 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8184 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4149 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3281 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6564 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/796d3ec1-9c26-4ead-87cb-4eb866209120.json b/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/796d3ec1-9c26-4ead-87cb-4eb866209120.json deleted file mode 100644 index 7e050faee..000000000 --- 
a/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/796d3ec1-9c26-4ead-87cb-4eb866209120.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/hh_rlhf_rm_open_llama_3b", - "id": "weqweasdas/hh_rlhf_rm_open_llama_3b", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2498 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3642 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.24 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2384 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0315 - }, - 
"source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -}